diff --git a/.github/workflows/content-package.yml b/.github/workflows/content-package.yml new file mode 100644 index 00000000..f8212ce7 --- /dev/null +++ b/.github/workflows/content-package.yml @@ -0,0 +1,54 @@ +name: Content Package Contract + +on: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +jobs: + validate-content-package: + runs-on: ubuntu-latest + defaults: + run: + working-directory: microplex-us + steps: + - name: Check out microplex-us + uses: actions/checkout@v4 + with: + path: microplex-us + + - name: Check out Microplex core + uses: actions/checkout@v4 + with: + repository: PolicyEngine/microplex + ref: 5a1ea5e107334f45e850678774efc0c613dce250 + path: microplex + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Set up uv + uses: astral-sh/setup-uv@v6 + + - name: Reject Python files + run: | + if find . -name '*.py' -print -quit | grep -q .; then + find . -name '*.py' -print + exit 1 + fi + + - name: Validate YAML and JSON contract + run: | + PYTHONPATH="../microplex/src:src" uv run --no-project --python 3.13 \ + --with pydantic --with pyyaml \ + python -m microplex.content_package \ + --package microplex_us \ + --spec specs/us-2024.yaml \ + --contract manifests/ecps_export_contract.json \ + --src-root src/microplex_us diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index d970e89d..00000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Docs - -on: - pull_request: - push: - branches: - - main - workflow_dispatch: - -permissions: - contents: read - -jobs: - build-docs: - runs-on: ubuntu-latest - defaults: - run: - working-directory: microplex-us - steps: - - name: Check out microplex-us - uses: actions/checkout@v4 - with: - path: microplex-us - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - with: - 
version: "0.11.14" - - - name: Build docs - run: uv run --python 3.13 --extra docs jupyter-book build docs diff --git a/.github/workflows/export-columns.yml b/.github/workflows/export-columns.yml deleted file mode 100644 index 50eccbc8..00000000 --- a/.github/workflows/export-columns.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Export columns (eCPS parity) - -# Fast, first-line contract gate. Compares an export's column set against -# the frozen eCPS contract in milliseconds, with no H5 build, no GPU, and -# none of the heavy ML deps (microplex / torch / policyengine-us). This is -# a standalone workflow on purpose so column drift is caught before the -# slow artifact-gate and site-snapshot jobs run. - -on: - push: - branches: [main] - pull_request: - -permissions: - contents: read - -jobs: - column-parity: - runs-on: ubuntu-latest - steps: - - name: Check out repository - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - name: Set up uv - uses: astral-sh/setup-uv@v6 - with: - version: "0.11.14" - - name: Install minimal deps - # Only what the gate and its tests need; no microplex / torch. - run: uv pip install --system pytest h5py numpy - - name: Run column-parity tests - run: PYTHONPATH=src python -m pytest tests/pipelines/test_check_export_columns.py -q - - name: Self-check against committed clean fixture - # Run the module as a file (not `-m`) so the package __init__ - # (which imports microplex/torch) never loads. Proves the gate - # exits 0 on a known-good column set, with no data file at all. 
- run: python src/microplex_us/pipelines/check_export_columns.py --columns-json tests/pipelines/fixtures/ecps_clean_columns.json diff --git a/.github/workflows/mp300k-artifact-gates.yml b/.github/workflows/mp300k-artifact-gates.yml deleted file mode 100644 index 0c1c7814..00000000 --- a/.github/workflows/mp300k-artifact-gates.yml +++ /dev/null @@ -1,295 +0,0 @@ -name: mp-300k Artifact Gates - -on: - pull_request: - push: - branches: - - main - workflow_dispatch: - inputs: - gate_inputs_artifact: - description: Optional Actions artifact name containing artifact.tar.gz and evidence JSONs. - required: false - default: "" - type: string - artifact_archive_url: - description: URL to a .zip, .tar, or .tar.gz artifact bundle containing manifest.json. - required: false - default: "" - type: string - ecps_comparison_url: - description: Optional URL to precomputed PE-native eCPS comparison JSON. - required: false - type: string - runtime_smoke_url: - description: Optional URL to runtime smoke benchmark JSON. - required: false - type: string - arch_coverage_url: - description: Optional URL to Arch target coverage JSON. - required: false - type: string - benchmark_manifest_url: - description: Optional URL to the frozen microsimulation benchmark manifest. - required: false - type: string - target_period: - description: PolicyEngine period to validate. - required: false - default: "2024" - type: string - runtime_ratio_threshold: - description: Maximum candidate/baseline runtime ratio. - required: false - default: "1.25" - type: string - artifact_size_ratio_threshold: - description: Maximum candidate/baseline H5 size ratio. - required: false - default: "2.0" - type: string - require_ecps_comparison: - description: Keep the eCPS comparison as a blocking gate. 
- required: false - default: true - type: boolean - workflow_call: - inputs: - gate_inputs_artifact: - required: false - default: "" - type: string - artifact_archive_url: - required: false - default: "" - type: string - ecps_comparison_url: - required: false - type: string - runtime_smoke_url: - required: false - type: string - arch_coverage_url: - required: false - type: string - benchmark_manifest_url: - required: false - type: string - target_period: - required: false - default: "2024" - type: string - runtime_ratio_threshold: - required: false - default: "1.25" - type: string - artifact_size_ratio_threshold: - required: false - default: "2.0" - type: string - require_ecps_comparison: - required: false - default: true - type: boolean - -permissions: - actions: read - contents: read - -jobs: - implementation-tests: - if: github.event_name == 'pull_request' || github.event_name == 'push' - runs-on: ubuntu-latest - defaults: - run: - working-directory: microplex-us - steps: - - name: Check out microplex-us - uses: actions/checkout@v4 - with: - path: microplex-us - - - name: Check out core microplex - uses: actions/checkout@v4 - with: - repository: PolicyEngine/microplex - ref: 773106e3a159a0417ed15025b507ab05c0b93b5d - path: microplex - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - - - name: Test artifact gate implementation - run: | - uv run --python 3.13 --extra dev --with pydantic --with-editable ../microplex pytest -q \ - tests/pipelines/test_mp300k_artifact_gates.py \ - tests/pipelines/test_mp300k_gate_inputs.py - - artifact-gates: - if: github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' - runs-on: ubuntu-latest - defaults: - run: - working-directory: microplex-us - steps: - - name: Check out microplex-us - uses: actions/checkout@v4 - with: - path: microplex-us - - - name: Check out core microplex - uses: actions/checkout@v4 - with: - 
repository: PolicyEngine/microplex - ref: 773106e3a159a0417ed15025b507ab05c0b93b5d - path: microplex - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - - - name: Download packaged gate inputs - if: inputs.gate_inputs_artifact != '' - uses: actions/download-artifact@v4 - with: - name: ${{ inputs.gate_inputs_artifact }} - path: gate-inputs - - - name: Download artifact and evidence from URLs - if: inputs.gate_inputs_artifact == '' - run: | - mkdir -p ../gate-inputs/evidence - if [ -z "${{ inputs.artifact_archive_url }}" ]; then - echo "Either gate_inputs_artifact or artifact_archive_url is required." - exit 1 - fi - curl --fail --location "${{ inputs.artifact_archive_url }}" --output ../gate-inputs/artifact-archive - - if [ -n "${{ inputs.ecps_comparison_url }}" ]; then - curl --fail --location "${{ inputs.ecps_comparison_url }}" --output ../gate-inputs/evidence/ecps_comparison.json - fi - if [ -n "${{ inputs.runtime_smoke_url }}" ]; then - curl --fail --location "${{ inputs.runtime_smoke_url }}" --output ../gate-inputs/evidence/runtime_smoke.json - fi - if [ -n "${{ inputs.arch_coverage_url }}" ]; then - curl --fail --location "${{ inputs.arch_coverage_url }}" --output ../gate-inputs/evidence/arch_coverage.json - fi - if [ -n "${{ inputs.benchmark_manifest_url }}" ]; then - curl --fail --location "${{ inputs.benchmark_manifest_url }}" --output ../gate-inputs/evidence/benchmark_manifest.json - fi - - - name: Normalize gate inputs - run: | - uv run --python 3.13 python - <<'PY' - import shutil - from pathlib import Path - - root = Path("../gate-inputs") - evidence_dir = root / "evidence" - evidence_dir.mkdir(parents=True, exist_ok=True) - - archive_target = root / "artifact-archive" - if not archive_target.exists(): - archive_candidates = [] - for pattern in ("artifact.tar.gz", "artifact.tgz", "artifact.tar", "artifact.zip", "*.tar.gz", "*.tgz", "*.tar", "*.zip"): - 
archive_candidates.extend(root.glob(pattern)) - archive_candidates = [ - path for path in archive_candidates if path.is_file() - ] - if not archive_candidates: - raise SystemExit( - "Packaged gate inputs did not contain artifact.tar.gz, " - "artifact.tgz, artifact.tar, or artifact.zip" - ) - shutil.copyfile(archive_candidates[0], archive_target) - - for name in ("ecps_comparison", "runtime_smoke", "arch_coverage", "benchmark_manifest"): - source = root / f"{name}.json" - destination = evidence_dir / f"{name}.json" - if source.exists() and not destination.exists(): - shutil.copyfile(source, destination) - PY - - - name: Resolve artifact directory - run: | - uv run --python 3.13 python - <<'PY' - import tarfile - import zipfile - from pathlib import Path - - archive = Path("../gate-inputs/artifact-archive") - extract_root = Path("../gate-inputs/artifact-root") - extract_root.mkdir(parents=True, exist_ok=True) - - if tarfile.is_tarfile(archive): - with tarfile.open(archive) as handle: - handle.extractall(extract_root, filter="data") - elif zipfile.is_zipfile(archive): - with zipfile.ZipFile(archive) as handle: - for member in handle.infolist(): - destination = (extract_root / member.filename).resolve() - if not destination.is_relative_to(extract_root.resolve()): - raise SystemExit( - f"zip archive member escapes artifact root: {member.filename}" - ) - handle.extract(member, extract_root) - else: - raise SystemExit("artifact_archive_url must point to a tar or zip archive") - - manifests = sorted( - extract_root.rglob("manifest.json"), - key=lambda path: len(path.relative_to(extract_root).parts), - ) - if not manifests: - raise SystemExit("artifact archive does not contain manifest.json") - Path("../gate-inputs/artifact_dir.txt").write_text(str(manifests[0].parent.resolve())) - PY - - - name: Run artifact gates - run: | - artifact_dir="$(cat ../gate-inputs/artifact_dir.txt)" - args=( - --artifact-dir "$artifact_dir" - --target-period "${{ inputs.target_period }}" - 
--artifact-size-ratio-threshold "${{ inputs.artifact_size_ratio_threshold }}" - --runtime-ratio-threshold "${{ inputs.runtime_ratio_threshold }}" - --output-json ../gate-inputs/mp300k_artifact_gates.json - --no-update-manifest - ) - - if [ -f ../gate-inputs/evidence/ecps_comparison.json ]; then - args+=(--ecps-comparison-json ../gate-inputs/evidence/ecps_comparison.json) - else - args+=(--skip-ecps-computation) - fi - if [ -f ../gate-inputs/evidence/runtime_smoke.json ]; then - args+=(--runtime-smoke-json ../gate-inputs/evidence/runtime_smoke.json) - fi - if [ -f ../gate-inputs/evidence/arch_coverage.json ]; then - args+=(--arch-coverage-json ../gate-inputs/evidence/arch_coverage.json) - fi - if [ -f ../gate-inputs/evidence/benchmark_manifest.json ]; then - args+=(--benchmark-manifest ../gate-inputs/evidence/benchmark_manifest.json) - fi - if [ "${{ inputs.require_ecps_comparison }}" != "true" ]; then - args+=(--no-require-ecps-comparison) - fi - - uv run --python 3.13 --extra dev --with pydantic --with-editable ../microplex \ - microplex-us-mp300k-artifact-gates "${args[@]}" - - - name: Upload gate report - if: always() - uses: actions/upload-artifact@v4 - with: - name: mp300k-artifact-gates - path: gate-inputs/mp300k_artifact_gates.json diff --git a/.github/workflows/publish-hf-artifacts.yml b/.github/workflows/publish-hf-artifacts.yml deleted file mode 100644 index 83a9be2b..00000000 --- a/.github/workflows/publish-hf-artifacts.yml +++ /dev/null @@ -1,238 +0,0 @@ -name: Publish Hugging Face Artifacts - -on: - workflow_dispatch: - inputs: - artifact_name: - description: Optional Actions artifact name containing a bundle or archive. - required: false - default: "" - type: string - artifact_archive_url: - description: Optional URL to a .zip, .tar, .tar.gz, or .tgz artifact bundle. - required: false - default: "" - type: string - run_id: - description: Optional stable run ID. Defaults to the bundle directory name. 
- required: false - default: "" - type: string - diagnostics_repo: - description: Hugging Face diagnostics dataset repo. - required: false - default: policyengine/microplex-us-diagnostics - type: string - dataset_repo: - description: Hugging Face deployed dataset repo. - required: false - default: policyengine/microplex-us-deployed-datasets - type: string - publish_dataset: - description: Upload policyengine_us.h5 to the dataset repo staging path. - required: false - default: false - type: boolean - promote_dataset: - description: Promote policyengine_us.h5 and manifest.json to dataset repo root. - required: false - default: false - type: boolean - dry_run: - description: Plan the upload without committing to Hugging Face. - required: false - default: true - type: boolean - workflow_call: - inputs: - artifact_name: - required: false - default: "" - type: string - artifact_archive_url: - required: false - default: "" - type: string - run_id: - required: false - default: "" - type: string - diagnostics_repo: - required: false - default: policyengine/microplex-us-diagnostics - type: string - dataset_repo: - required: false - default: policyengine/microplex-us-deployed-datasets - type: string - publish_dataset: - required: false - default: false - type: boolean - promote_dataset: - required: false - default: false - type: boolean - dry_run: - required: false - default: true - type: boolean - secrets: - HF_TOKEN: - required: false - -permissions: - actions: read - contents: read - -jobs: - publish-hf-artifacts: - runs-on: ubuntu-latest - defaults: - run: - working-directory: microplex-us - steps: - - name: Check out microplex-us - uses: actions/checkout@v4 - with: - path: microplex-us - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.13" - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - - - name: Download packaged artifact from Actions - if: inputs.artifact_name != '' - uses: actions/download-artifact@v4 - with: - name: ${{ 
inputs.artifact_name }} - path: publish-inputs - - - name: Download packaged artifact from URL - if: inputs.artifact_name == '' && inputs.artifact_archive_url != '' - run: | - mkdir -p ../publish-inputs - curl --fail --location "${{ inputs.artifact_archive_url }}" --output ../publish-inputs/artifact-archive - - - name: Resolve artifact directory - run: | - uv run --python 3.13 python - <<'PY' - import shutil - import tarfile - import zipfile - from pathlib import Path - - root = Path("../publish-inputs") - root.mkdir(parents=True, exist_ok=True) - archive_target = root / "artifact-archive" - - if not archive_target.exists(): - archives = [] - for pattern in ("*.tar.gz", "*.tgz", "*.tar", "*.zip"): - archives.extend(root.rglob(pattern)) - archives = [path for path in archives if path.is_file()] - if archives: - shutil.copyfile(archives[0], archive_target) - - search_root = root - if archive_target.exists(): - extract_root = root / "artifact-root" - extract_root.mkdir(parents=True, exist_ok=True) - try: - if zipfile.is_zipfile(archive_target): - with zipfile.ZipFile(archive_target) as archive: - for member in archive.infolist(): - target = (extract_root / member.filename).resolve() - if not target.is_relative_to(extract_root.resolve()): - raise SystemExit( - f"zip archive member escapes artifact root: {member.filename}" - ) - archive.extractall(extract_root) - else: - with tarfile.open(archive_target, "r:*") as archive: - for member in archive.getmembers(): - target = (extract_root / member.name).resolve() - if not target.is_relative_to(extract_root.resolve()): - raise SystemExit( - f"tar archive member escapes artifact root: {member.name}" - ) - archive.extractall(extract_root) - except (tarfile.TarError, zipfile.BadZipFile) as error: - raise SystemExit(f"Could not extract artifact archive: {error}") from error - search_root = extract_root - - manifests = sorted( - ( - path - for path in search_root.rglob("manifest.json") - if path.name == "manifest.json" - ), - 
key=lambda path: (len(path.relative_to(search_root).parts), str(path)), - ) - if not manifests: - raise SystemExit( - "Artifact input must contain manifest.json directly or inside a supported archive." - ) - Path("../publish-inputs/artifact_dir.txt").write_text( - str(manifests[0].parent.resolve()) - ) - PY - - - name: Test Hugging Face publisher - run: | - uv run --extra dev --python 3.13 python -m pytest -q tests/pipelines/test_hf_artifacts.py - - - name: Publish artifact bundle - env: - HUGGING_FACE_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - artifact_dir="$(cat ../publish-inputs/artifact_dir.txt)" - args=( - "$artifact_dir" - --diagnostics-repo "${{ inputs.diagnostics_repo }}" - --dataset-repo "${{ inputs.dataset_repo }}" - ) - - if [ -n "${{ inputs.run_id }}" ]; then - args+=(--run-id "${{ inputs.run_id }}") - fi - if [ "${{ inputs.publish_dataset }}" = "true" ]; then - args+=(--publish-dataset) - fi - if [ "${{ inputs.promote_dataset }}" = "true" ]; then - args+=(--promote-dataset) - fi - if [ "${{ inputs.dry_run }}" = "true" ]; then - args+=(--dry-run) - elif [ -z "$HUGGING_FACE_TOKEN" ]; then - echo "HF_TOKEN secret is required when dry_run is false." 
- exit 1 - fi - - uv run --extra hf --python 3.13 microplex-us-publish-hf-artifacts "${args[@]}" - - - name: Smoke-check published artifact - if: inputs.dry_run == false - env: - HUGGING_FACE_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - args=( - --diagnostics-repo "${{ inputs.diagnostics_repo }}" - --dataset-repo "${{ inputs.dataset_repo }}" - ) - - if [ -n "${{ inputs.run_id }}" ]; then - args+=(--run-id "${{ inputs.run_id }}") - fi - if [ "${{ inputs.publish_dataset }}" != "true" ] && [ "${{ inputs.promote_dataset }}" != "true" ]; then - args+=(--no-dataset) - fi - if [ "${{ inputs.promote_dataset }}" != "true" ]; then - args+=(--no-promoted-dataset) - fi - - uv run --extra hf --python 3.13 microplex-us-smoke-hf-artifact "${args[@]}" diff --git a/.github/workflows/site-snapshot.yml b/.github/workflows/site-snapshot.yml deleted file mode 100644 index fd9d8bc4..00000000 --- a/.github/workflows/site-snapshot.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: Site Snapshot - -on: - pull_request: - push: - branches: - - main - workflow_dispatch: - -permissions: - contents: read - -jobs: - site-snapshot: - runs-on: ubuntu-latest - defaults: - run: - working-directory: microplex-us - steps: - - name: Check out microplex-us - uses: actions/checkout@v4 - with: - path: microplex-us - - - name: Check out core microplex - uses: actions/checkout@v4 - with: - repository: PolicyEngine/microplex - ref: 773106e3a159a0417ed15025b507ab05c0b93b5d - path: microplex - - - name: Check out microunit - uses: actions/checkout@v4 - with: - repository: PolicyEngine/microunit - ref: main - path: microunit - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.14" - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - with: - version: "0.11.14" - working-directory: microplex-us - - - name: Verify snapshot tooling - run: | - uv run --extra dev --with pydantic --with-editable ../microplex pytest -q \ - tests/test_package_imports.py \ - tests/test_calibration_harness.py \ - 
tests/targets/test_supabase.py \ - tests/pipelines/test_check_site_snapshot.py \ - tests/pipelines/test_imputation_ablation.py \ - tests/pipelines/test_site_snapshot.py \ - tests/pipelines/test_version_benchmark.py - - - name: Check generated site snapshot - run: | - snapshot_path="$(uv run python - <<'PY' - import json - import tempfile - from pathlib import Path - - from microplex_us.pipelines.site_snapshot import write_us_microplex_site_snapshot - - root = Path(tempfile.mkdtemp()).resolve() - artifact_dir = root / "run-1" - artifact_dir.mkdir() - for filename in ( - "seed_data.parquet", - "synthetic_data.parquet", - "calibrated_data.parquet", - "targets.json", - ): - (artifact_dir / filename).write_text("{}" if filename == "targets.json" else "") - - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-03-29T00:00:00+00:00", - "config": {"n_synthetic": 2000}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_harness": "policyengine_harness.json", - }, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "state_program_support_proxies": { - "available": ["ssi"], - "missing": ["snap"], - }, - }, - "calibration": { - "n_loaded_targets": 100, - "n_supported_targets": 90, - "converged": False, - "weight_collapse_suspected": False, - }, - "policyengine_harness": { - "candidate_mean_abs_relative_error": 0.9, - "baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - }, - } - ) - ) - (artifact_dir / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.9, - "baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - "candidate_composite_parity_loss": 0.8, - "baseline_composite_parity_loss": 1.2, - "target_win_rate": 0.2, - "slice_win_rate": 0.5, - "supported_target_rate": 0.9, - 
"tag_summaries": {}, - "parity_scorecard": {}, - "attribute_cell_summaries": {}, - } - } - ) - ) - snapshot_path = root / "snapshots" / "site_snapshot_us.json" - write_us_microplex_site_snapshot(artifact_dir, snapshot_path) - print(snapshot_path) - PY - )" - uv run microplex-us-check-site-snapshot "$snapshot_path" diff --git a/.python-version b/.python-version deleted file mode 100644 index 6324d401..00000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.14 diff --git a/AGENTS.md b/AGENTS.md index d16b5106..41cf08a9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,208 +1,44 @@ # AGENTS.md -This repo is the US country pack for `microplex`. Keep it thin where possible and push shared abstractions upstream into core. +This repository is the US content package for `microplex`. -## Default posture +## Contract -- Prefer spec-driven behavior over ad hoc logic in large pipeline files. -- If a seam is useful for both UK and US, move it to `microplex` instead of polishing a US-only local helper. -- Keep PolicyEngine-US execution details local unless there is a clean shared protocol. +- Keep the package declarative: YAML specs and JSON manifests only. +- Do not add runtime Python, tests, scripts, notebooks, dashboards, generated + artifacts, or local environment files to this repository. +- Move execution machinery to `microplex`, donor imputation machinery to + `microimpute`, and calibration machinery to `microcalibrate`. +- Treat any country-specific imperative logic as a missing generic operator or + adapter in `microplex`, not as a reason to recreate Python here. -## Regression discipline (every fix ships a guard) +## Validation -A bug is not fixed until it ships an **automated guard on the run path** — a -gate/assertion/CI test that fails loud if it recurs. Do not rely on a human or -agent re-noticing a known failure mode. Definition of done for any fix: "I added -the check that would have caught this." 
- -In particular, the eCPS-replacement comparison must refuse to emit a verdict when -its inputs or result are invalid (see `ecps_replacement_comparison.py` gates): -the baseline must be production-pinned and score sanely (baseline-sanity gate), -and the refit must materially reduce loss (refit-effectiveness gate). When a new -comparison failure mode is found, add a gate there rather than fixing it once. - -## Current architectural intent - -- `microplex-us` owns: - - US source manifests and raw source adapters - - PolicyEngine-US execution/materialization - - US-specific target providers and benchmark harnesses - - US-local pipeline orchestration -- `microplex` core owns: - - targets specs/providers/protocols - - reweighting bundles and solver - - benchmark metrics/comparisons/suites - - shared result-based benchmark builders - -## Current mission notes - -- For US, the canonical mission metric is the PE-native broad loss frontier, not composite parity. -- When evaluating progress, prefer: - - matched-size `Microplex@N` vs `PE@N` - - full `enhanced_cps_2024` only as a stretch reference -- Recent direct-objective testing showed that changing only the post-export weight objective moves loss very little on the same fixed candidate. -- Bias effort toward: - - better candidate records - - fuller support coverage - - budgeted selection on larger candidates -- Bias away from: - - repeated small-candidate donor-backend A/Bs - - more entropy tuning without evidence that the candidate population itself improved - -## Review checklist - -When reviewing recent changes here, check: - -1. Is this still duplicating something that should now live in core? -2. Is the US harness using shared core benchmarking helpers instead of rebuilding them inline? -3. Are any benchmark claims relying on non-common-target comparisons? -4. Is the work using PE-native broad loss when it claims mission progress? -5. Does PE-US materialization handle dependency chains and partial failures safely? 
-6. Is this baking in fixed tax-unit structure more deeply than necessary? - -## Be careful around - -- `src/microplex_us/policyengine/us.py` - - Large file with execution/materialization logic and remaining monolith risk. -- `src/microplex_us/policyengine/harness.py` - - Should keep delegating more suite/result logic to core. -- `src/microplex_us/pipelines/local_reweighting.py` - - Should remain a thin adapter over core bundle/reweighting surfaces. - -## Standard commands - -- Install, production/runtime: `./scripts/install.sh --prod` -- Install, development: `./scripts/install.sh --dev` -- Install, Intel macOS development: `./scripts/install.sh --dev-intel-mac` -- Ruff: `uv run ruff check src tests` -- Focused comparison/harness tests: `uv run pytest -q tests/policyengine/test_comparison.py tests/policyengine/test_harness.py` -- Local reweighting tests: `uv run pytest -q tests/pipelines/test_local_reweighting.py` - -## Environment guidance - -- Production macOS installs require Apple Silicon (`arm64`). -- Intel macOS (`x86_64`) is development/testing-only. If uv/PyPI fails on - `torch` wheels there, use `./scripts/install.sh --dev-intel-mac`. -- Do not add torch stubs or no-torch runtime workarounds for Intel macOS; use - the conda-forge developer environment instead. - -## Claude/Codex review shortcut - -For a quick review, read: - -1. [`/Users/maxghenis/PolicyEngine/microplex-us/AGENTS.md`](/Users/maxghenis/PolicyEngine/microplex-us/AGENTS.md) -2. [`/Users/maxghenis/PolicyEngine/microplex-us/_WORKSPACE.md`](/Users/maxghenis/PolicyEngine/microplex-us/_WORKSPACE.md) -3. [`/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md`](/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md) - -Then inspect changed files and return findings first. - -## Review handoff - -To avoid rebuilding long prompts in chat: - -1. 
Treat [`/Users/maxghenis/PolicyEngine/microplex-us/reviews/PENDING_CLAUDE_REVIEW.md`](/Users/maxghenis/PolicyEngine/microplex-us/reviews/PENDING_CLAUDE_REVIEW.md) as the current review request. -2. Read that file after the standard repo context files above. -3. Write the full review to a dated file under [`/Users/maxghenis/PolicyEngine/microplex-us/reviews/`](/Users/maxghenis/PolicyEngine/microplex-us/reviews/). -4. Append only a concise summary to [`/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md`](/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md). - - -# GitNexus — Code Intelligence - -This project is indexed by GitNexus as **microplex-us** (4778 symbols, 12879 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. - -> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. - -## Always Do - -- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user. -- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows. -- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits. -- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance. -- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`. - -## When Debugging - -1. `gitnexus_query({query: ""})` — find execution flows related to the issue -2. `gitnexus_context({name: ""})` — see all callers, callees, and process participation -3. 
`READ gitnexus://repo/microplex-us/process/{processName}` — trace the full execution flow step by step -4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed - -## When Refactoring - -- **Renaming**: MUST use `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` first. Review the preview — graph edits are safe, text_search edits need manual review. Then run with `dry_run: false`. -- **Extracting/Splitting**: MUST run `gitnexus_context({name: "target"})` to see all incoming/outgoing refs, then `gitnexus_impact({target: "target", direction: "upstream"})` to find all external callers before moving code. -- After any refactor: run `gitnexus_detect_changes({scope: "all"})` to verify only expected files changed. - -## Never Do - -- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. -- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. -- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. -- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. 
- -## Tools Quick Reference - -| Tool | When to use | Command | -|------|-------------|---------| -| `query` | Find code by concept | `gitnexus_query({query: "auth validation"})` | -| `context` | 360-degree view of one symbol | `gitnexus_context({name: "validateUser"})` | -| `impact` | Blast radius before editing | `gitnexus_impact({target: "X", direction: "upstream"})` | -| `detect_changes` | Pre-commit scope check | `gitnexus_detect_changes({scope: "staged"})` | -| `rename` | Safe multi-file rename | `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` | -| `cypher` | Custom graph queries | `gitnexus_cypher({query: "MATCH ..."})` | - -## Impact Risk Levels - -| Depth | Meaning | Action | -|-------|---------|--------| -| d=1 | WILL BREAK — direct callers/importers | MUST update these | -| d=2 | LIKELY AFFECTED — indirect deps | Should test | -| d=3 | MAY NEED TESTING — transitive | Test if critical path | - -## Resources - -| Resource | Use for | -|----------|---------| -| `gitnexus://repo/microplex-us/context` | Codebase overview, check index freshness | -| `gitnexus://repo/microplex-us/clusters` | All functional areas | -| `gitnexus://repo/microplex-us/processes` | All execution flows | -| `gitnexus://repo/microplex-us/process/{name}` | Step-by-step execution trace | - -## Self-Check Before Finishing - -Before completing any code modification task, verify: -1. `gitnexus_impact` was run for all modified symbols -2. No HIGH/CRITICAL risk warnings were ignored -3. `gitnexus_detect_changes()` confirms changes match expected scope -4. All d=1 (WILL BREAK) dependents were updated - -## Keeping the Index Fresh - -After committing code changes, the GitNexus index becomes stale. 
Re-run analyze to update it: +Use the generic Microplex content-package check: ```bash -npx gitnexus analyze +PYTHONPATH=/path/to/microplex/src:src uv run --no-project --python 3.13 \ + --with pydantic --with pyyaml \ + python -m microplex.content_package \ + --package microplex_us \ + --spec specs/us-2024.yaml \ + --contract manifests/ecps_export_contract.json \ + --src-root src/microplex_us ``` -If the index previously included embeddings, preserve them by adding `--embeddings`: +The check must confirm: -```bash -npx gitnexus analyze --embeddings -``` - -To check whether embeddings exist, inspect `.gitnexus/meta.json` — the `stats.embeddings` field shows the count (0 means no embeddings). **Running analyze without `--embeddings` will delete any previously generated embeddings.** +- the spec loads +- `spec.variables` exactly covers the frozen export contract plus declared + imputation variables +- `src/microplex_us` contains no Python files -> Claude Code users: A PostToolUse hook handles this automatically after `git commit` and `git merge`. +Also run: -## CLI - -| Task | Read this skill file | -|------|---------------------| -| Understand architecture / "How does X work?" | `.claude/skills/gitnexus/gitnexus-exploring/SKILL.md` | -| Blast radius / "What breaks if I change X?" | `.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md` | -| Trace bugs / "Why is X failing?" | `.claude/skills/gitnexus/gitnexus-debugging/SKILL.md` | -| Rename / extract / split / refactor | `.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md` | -| Tools, resources, schema reference | `.claude/skills/gitnexus/gitnexus-guide/SKILL.md` | -| Index, status, clean, wiki CLI commands | `.claude/skills/gitnexus/gitnexus-cli/SKILL.md` | +```bash +find . -name '*.py' -print +uv build +``` - +The first command should print nothing. The built wheel should contain only the +US spec, JSON manifests, and wheel metadata. 
diff --git a/CLAUDE.md b/CLAUDE.md index cc44fc8a..75e348fc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,113 +1,10 @@ -## Environment guidance for agents +# CLAUDE.md -- Use `./scripts/install.sh --prod` for production/runtime setup. -- Use `./scripts/install.sh --dev` for normal development setup on Apple - Silicon macOS and Linux. -- Use `./scripts/install.sh --dev-intel-mac` on Intel macOS (`x86_64`) or when - PyPI cannot provide modern `torch` wheels for that platform. -- Production macOS installs require Apple Silicon (`arm64`). Intel macOS is - development/testing-only. -- Do not add torch stubs or no-torch runtime workarounds for Intel macOS; use - the conda-forge developer environment instead. +`microplex-us` is a declarative content package. - -# GitNexus — Code Intelligence - -This project is indexed by GitNexus as **microplex-us** (4778 symbols, 12879 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. - -> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. - -## Always Do - -- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user. -- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows. -- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits. -- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance. -- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`. - -## When Debugging - -1. 
`gitnexus_query({query: "<error or symptom>"})` — find execution flows related to the issue -2. `gitnexus_context({name: "<suspect function>"})` — see all callers, callees, and process participation -3. `READ gitnexus://repo/microplex-us/process/{processName}` — trace the full execution flow step by step -4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed - -## When Refactoring - -- **Renaming**: MUST use `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` first. Review the preview — graph edits are safe, text_search edits need manual review. Then run with `dry_run: false`. -- **Extracting/Splitting**: MUST run `gitnexus_context({name: "target"})` to see all incoming/outgoing refs, then `gitnexus_impact({target: "target", direction: "upstream"})` to find all external callers before moving code. -- After any refactor: run `gitnexus_detect_changes({scope: "all"})` to verify only expected files changed. - -## Never Do - -- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. -- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. -- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. -- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. 
- -## Tools Quick Reference - -| Tool | When to use | Command | -|------|-------------|---------| -| `query` | Find code by concept | `gitnexus_query({query: "auth validation"})` | -| `context` | 360-degree view of one symbol | `gitnexus_context({name: "validateUser"})` | -| `impact` | Blast radius before editing | `gitnexus_impact({target: "X", direction: "upstream"})` | -| `detect_changes` | Pre-commit scope check | `gitnexus_detect_changes({scope: "staged"})` | -| `rename` | Safe multi-file rename | `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` | -| `cypher` | Custom graph queries | `gitnexus_cypher({query: "MATCH ..."})` | - -## Impact Risk Levels - -| Depth | Meaning | Action | -|-------|---------|--------| -| d=1 | WILL BREAK — direct callers/importers | MUST update these | -| d=2 | LIKELY AFFECTED — indirect deps | Should test | -| d=3 | MAY NEED TESTING — transitive | Test if critical path | - -## Resources - -| Resource | Use for | -|----------|---------| -| `gitnexus://repo/microplex-us/context` | Codebase overview, check index freshness | -| `gitnexus://repo/microplex-us/clusters` | All functional areas | -| `gitnexus://repo/microplex-us/processes` | All execution flows | -| `gitnexus://repo/microplex-us/process/{name}` | Step-by-step execution trace | - -## Self-Check Before Finishing - -Before completing any code modification task, verify: -1. `gitnexus_impact` was run for all modified symbols -2. No HIGH/CRITICAL risk warnings were ignored -3. `gitnexus_detect_changes()` confirms changes match expected scope -4. All d=1 (WILL BREAK) dependents were updated - -## Keeping the Index Fresh - -After committing code changes, the GitNexus index becomes stale. 
Re-run analyze to update it: - -```bash -npx gitnexus analyze -``` - -If the index previously included embeddings, preserve them by adding `--embeddings`: - -```bash -npx gitnexus analyze --embeddings -``` - -To check whether embeddings exist, inspect `.gitnexus/meta.json` — the `stats.embeddings` field shows the count (0 means no embeddings). **Running analyze without `--embeddings` will delete any previously generated embeddings.** - -> Claude Code users: A PostToolUse hook handles this automatically after `git commit` and `git merge`. - -## CLI - -| Task | Read this skill file | -|------|---------------------| -| Understand architecture / "How does X work?" | `.claude/skills/gitnexus/gitnexus-exploring/SKILL.md` | -| Blast radius / "What breaks if I change X?" | `.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md` | -| Trace bugs / "Why is X failing?" | `.claude/skills/gitnexus/gitnexus-debugging/SKILL.md` | -| Rename / extract / split / refactor | `.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md` | -| Tools, resources, schema reference | `.claude/skills/gitnexus/gitnexus-guide/SKILL.md` | -| Index, status, clean, wiki CLI commands | `.claude/skills/gitnexus/gitnexus-cli/SKILL.md` | - - +- Keep package contents to specs and manifests. +- Do not add Python implementation code here. +- Put generic execution, validation, imputation, and calibration machinery in + `microplex`, `microimpute`, or `microcalibrate`. +- Validate this repository with `microplex.content_package` and a no-`.py` file + scan before publishing changes. diff --git a/README.md b/README.md index 05581417..8e75c2b1 100644 --- a/README.md +++ b/README.md @@ -1,82 +1,32 @@ # microplex-us -US-specific survey adapters, calibration targets, pipelines, and PolicyEngine integration -built on top of the generic `microplex` engine. 
- -## Installation - -Use the install script so platform-specific dependency handling stays explicit: - -```bash -./scripts/install.sh --prod -``` - -For development on Apple Silicon macOS or Linux: - -```bash -./scripts/install.sh --dev -``` - -Production macOS installs require Apple Silicon (`arm64`). Intel macOS -(`x86_64`) is supported only for development/testing through conda-forge: - -```bash -./scripts/install.sh --dev-intel-mac -``` - -See [developer testing environments](./envs/README.md) for details. - -## Docs - -- [Docs index](./docs/README.md) -- [Architecture](./docs/architecture.md) -- [Canonical pipeline stages](./docs/pipeline-stages.md) -- [Stage contracts and manifests](./docs/stage-contracts.md) -- [API reference](./docs/api.md) -- [Source semantics](./docs/source-semantics.md) -- [Imputation conditioning contract](./docs/imputation-conditioning-contract.md) -- [Benchmarking](./docs/benchmarking.md) -- [Methodology ledger](./docs/methodology-ledger.md) -- [PolicyEngine oracle compatibility path](./docs/policyengine-oracle-compatibility.md) -- [PE construction parity](./docs/pe-construction-parity.md) -- [Superseding `policyengine-us-data`](./docs/superseding-policyengine-us-data.md) -- [Hugging Face artifact publishing](./docs/huggingface-artifact-publishing.md) - -## Diagnostics dashboard - -The static dashboard in `dashboard/` loads the full PE-native per-target -diagnostic JSON written by: - -```bash -microplex-us-pe-native-target-diagnostics \ - --from-dataset /path/to/enhanced_cps_2024.h5 \ - --to-dataset /path/to/policyengine_us.h5 \ - --policyengine-targets-db /path/to/policy_data.db \ - --output-path artifacts/pe_native_target_diagnostics_current.json -``` - -The dashboard uses the exported PolicyEngine design tokens from -`@policyengine/config/theme.css`; run `python scripts/sync_policyengine_theme.py --check` -to verify the local browser-readable token copy is still synced. 
-When a PolicyEngine target DB is available, the JSON annotates PE-native legacy -labels with structured target IDs and flags legacy-only gaps. - -## Current focus - -`microplex-us` is being built as a library-first US runtime with -`policyengine-us` as the shared measurement operator and -`policyengine-us-data` as the incumbent comparator, not as the thing we are -trying to clone wholesale: - -- canonical source and target metadata -- PE-US-compatible export -- full-target benchmarking against the active targets DB -- run registry and DuckDB index for frontier analysis - -The architecture is still evolving, so the docs are deliberately technical and -operational rather than paper-like. - -Method-level decomposable-family bakeoffs now live in the sibling eval repo: -`/Users/maxghenis/PolicyEngine/microplex-evals`. `microplex-us` should keep the -runtime helpers and pipeline-adjacent diagnostics, not the long-lived eval -orchestration and artifact curation. +`microplex-us` is the US content package for Microplex. It ships declarative +specs and manifests only; Microplex owns the execution engine, microimpute owns +donor imputation, and microcalibrate owns calibration. + +## Package Contents + +- `src/microplex_us/specs/us-2024.yaml`: US 2024 construction spec. +- `src/microplex_us/manifests/ecps_export_contract.json`: frozen eCPS export + column contract. +- `src/microplex_us/manifests/frozen_production_ecps_2024_benchmark_manifest.json`: + pinned production-eCPS benchmark certificate metadata. +- `src/microplex_us/manifests/pe_source_impute_blocks.json`: source-imputation + block declarations. +- `src/microplex_us/manifests/puf.json`: PUF source manifest. + +## Construction Order + +1. Load ASEC/CPS and PUF sources. +2. Build the seeded 50/50 ASEC+PUF support spine. +3. Assign atomic census geography within the lowest available CPS geography. +4. Run SCF, SIPP, and ACS source imputations on the resolved support universe. +5. 
Apply declared transforms and target construction. +6. Calibrate through Microplex's microcalibrate adapter. +7. Export the PolicyEngine-compatible dataset. + +## Validation + +The generic Microplex content-package check validates that the spec loads, the +variable manifest covers the frozen eCPS contract plus declared imputation +surface, and the package contains no runtime Python files. diff --git a/_BUILD_LOG.md b/_BUILD_LOG.md deleted file mode 100644 index 223fb2c0..00000000 --- a/_BUILD_LOG.md +++ /dev/null @@ -1,3133 +0,0 @@ -# _BUILD_LOG.md - -Append-only notes for agents working in `microplex-us`. - -## 2026-04-11 - -- Corrected upstream EITC-recipient oracle semantics: - - the active PE targets DB now builds IRS SOI EITC child-count strata with - `eitc > 0` in addition to `eitc_child_count` - - Microplex's PE target-provider matching now treats `domain_variable` as a - set-membership field for target-cell selection, so corrected rows like - `eitc,eitc_child_count` still match the intended target profile -- Fresh evidence after the EITC-recipient oracle fix: - - corrected-oracle apples-to-apples reevaluation of the pre-fix large - no-donor artifact: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1` - - corrected capped full-oracle loss `1.0149` - - corrected full-oracle loss `1.3233` - - matched large no-donor source rerun against the corrected oracle: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_eitc_recipient_oracle_large_nodonors/large-nodonors-eitc-recipient-oracle-v2` - - `4609` calibrated rows - - capped full-oracle loss `0.9729` - - full-oracle loss `1.2352` - - active-solve capped loss `1.2345` - - `420` active constraints - - deferred stage still skipped - - focused deferred-stage confirmations: - - matched large no-donor source rerun with a forced narrow stage 2: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_nodonors/large-nodonors-age-agi-forced-stage2-v1` - - capped full-oracle loss improves from `0.9729` to `0.9498` - - active-solve capped loss improves from `1.2345` to `1.1237` - - stage 2 selects `24` constraints from the top 3 deferred families and - top 4 deferred geographies - - matched large donor-inclusive source rerun with the same narrow stage 2: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_donors/large-donors-age-agi-forced-stage2-v1` - - capped full-oracle loss improves from `0.9730` to `0.9502` - - active-solve capped loss improves from `1.2333` to `1.1238` - - stage 2 again selects `24` constraints from the same focused set - - fresh canonical donor-inclusive checkpoint through the default entrypoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_default_stage2_large_donors/large-donors-default-stage2-v1` - - reproduces the same donor-stage result exactly - - `trigger_threshold` is now `null` - - stage 2 keeps the same `24` focused constraints and the same - `0.9502` capped full-oracle loss - - broader canonical donor-inclusive checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_default_stage2_donors/broader-donors-default-stage2-v1` - - `5000` CPS + `5000` PUF source sample - - `12092` calibrated rows - - stage 1 reaches `0.9080` capped full-oracle loss - - stage 2 still helps, improving to `0.8933` - - the focused deferred geographies shift to `KY`, `MS`, `WV`, and `DC` - - matched broader canonical no-donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_default_stage2_nodonors/broader-nodonors-default-stage2-v1` - - `5000` CPS + `5000` PUF source sample - - `12092` calibrated rows - - stage 1 reaches `0.9056` capped full-oracle loss - - stage 2 still helps, improving to `0.8909` - - the focused deferred geographies are `KY`, `MS`, `WV`, and `AZ` - - donor surveys 
remain effectively neutral at this broader scale, with a - slight edge to the no-donor run: - - donors: `0.8933` - - no donors: `0.8909` - - broader no-donor row-level drilldown and selector check: - - drilldown artifact: - `artifacts/tmp_broader_nodonor_oracle_drilldown_20260411.json` - - age and AGI remain the dominant deferred families - - ACA is the next family down and its worst rows are capped at `10.0`, - but widening deferred family focus from 3 to 4 does nothing under the - current `24`-constraint cap - - matched top-4-family run: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_nodonor_top4family/broader-nodonors-top4family-v1` - - result is identical to the default broader no-donor run: - - capped full-oracle loss `0.8909` - - active-solve capped loss `0.8950` - - deferred selector switched from family/geography-share-only priority to - row-level deferred capped error plus family/geography loss share within - the same focused stage-2 cap - - focused regression coverage: - - `python -m py_compile src/microplex_us/pipelines/us.py tests/pipelines/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'prioritizes_target_level_loss or deferred_stage or feasibility_constraint_budget or materialization_failures_audit_only'` - - `uv run pytest tests/pipelines/test_pe_us_data_rebuild.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py -q` - - matched medium no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_rowrank_nodonors/medium-nodonors-rowrank-v1` - - unchanged headline result vs the prior medium default: - `1.0298017982 -> 1.0291445335` - - matched broader no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_nodonors/broader-nodonors-rowrank-v1` - - capped full-oracle loss improves from `0.8908588020` to - `0.8907527501` - - active-solve capped loss worsens slightly from `0.8950` to `0.9152`, - but the default objective is full-oracle capped loss - - matched 
broader donor-inclusive rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_donors/broader-donors-rowrank-v1` - - capped full-oracle loss improves from `0.8932869027` to - `0.8782556650` - - active-solve capped loss improves from `0.8969` to `0.8814` - - read: - - the surrounding stage-2 policy was already right; the missed piece was - which rows got the fixed `24` slots - - keep the row-aware selector and stop spending time on wider family - admission experiments for now - - medium no-donor source rerun with the same narrow stage 2: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_medium_nodonors/medium-nodonors-age-agi-forced-stage2-v1` - - capped full-oracle loss improves from `1.0298` to `1.0291` - - active-solve capped loss improves from `0.7356` to `0.7048` - - stage 2 only finds `7` eligible focused constraints and still helps - - extra ultra-thin support-1 deferred stage after the row-aware stage 2: - - matched broader donor-inclusive rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_donors/broader-donors-stage3-v1` - - capped full-oracle loss improves from `0.8782556650` to - `0.8212707783` - - active-solve capped loss improves from `0.8813634527` to - `0.8343080918` - - matched broader no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_nodonors/broader-nodonors-stage3-v1` - - capped full-oracle loss improves from `0.8907527501` to - `0.8362042462` - - active-solve capped loss improves from `0.9151883609` to - `0.8766713154` - - matched medium no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_nodonors/medium-nodonors-stage3-v1` - - capped full-oracle loss improves from `1.0291445335` to - `1.0028694956` - - active-solve capped loss worsens slightly from `0.7047951546` to - `0.7148843510`, but the full-oracle objective still improves - - fresh default-entrypoint medium no-donor confirmation: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_stage3_nodonors/medium-nodonors-default-stage3-v1` - - reproduces the same three-stage result exactly - - read: - - the support-1 pass is now doing real work on the residual - ultra-thin age and AGI cells, not just adding noisy extra constraints - - promote the default deferred-stage schedule from `(10,)` to `(10, 1)` - - deferred family focus widened from `3` to `4` after the new stage-3 - residual drilldown showed ACA PTC as the next supported deferred family: - - added capped-error-mass rankings to the oracle drilldown helper so - family prioritization is based on loss contribution, not row counts - - broader donor-inclusive rerun with top-4 deferred families: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_donors/broader-donors-stage3-top4family-v1` - - capped full-oracle loss improves from `0.8212707783` to - `0.7908917500` - - broader no-donor rerun with top-4 deferred families: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_nodonors/broader-nodonors-stage3-top4family-v1` - - capped full-oracle loss improves from `0.8362042462` to - `0.7995775732` - - medium no-donor rerun with top-4 deferred families: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_top4family_nodonors/medium-nodonors-stage3-top4family-v1` - - capped full-oracle loss improves from `1.0028694956` to - `0.9968822972` - - fresh default-entrypoint medium no-donor confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_top4family_nodonors/medium-nodonors-default-top4family-v1` - - reproduces the same top-4-family result exactly - - read: - - once stage 3 is in place, ACA is no longer a side issue; it is the - next admitted high-support deferred family - - promote the default deferred family focus from `3` to `4` - - fresh broader donor-inclusive default-entrypoint confirmation: - - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_default_top4family_donors_rerun/broader-donors-default-top4family-v2` - - reproduces the existing broader donor default exactly at - `0.7908917500` capped full-oracle loss - - rejected wider deferred geography focus: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_geo8_donors/broader-donors-geo8-v1` - - widening deferred geographies from `4` to `8` worsens capped - full-oracle loss from `0.7908917500` to `0.7991939177` - - read: - - the current deferred calibration policy is stable on the broader donor - default path - - stop widening calibration focus and move upstream to age/AGI structure - - fresh broader donor drilldown: - - `artifacts/tmp_broader_default_top4family_donor_drilldown_20260412.json` - - capped-error mass is still led by `person_count|domain=age`, - `person_count|domain=adjusted_gross_income`, - `tax_unit_count|domain=adjusted_gross_income`, and - `aca_ptc|domain=aca_ptc` - - state-floor source-sampling prototype: - - added optional source-side `state_floor` sampling support for CPS and - donor household samplers - - matched broader donor rerun with `state_floor=2` on CPS and donor - sources: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_statefloor2_donors/broader-donors-statefloor2-v1` - - read: - - this is a no-op at the current broader `5000/5000` scale; the big - metric, selected constraints, and deferred geographies are identical - to the current default artifact - - the remaining age/AGI problem is therefore not plain state-level - undercoverage; if we stay upstream, the next sharper idea is - state-by-age or state-by-AGI support structure rather than a generic - state floor - - raw PUF checkpoint sampling should use `S006` weights: - - fixed `_sample_tax_units()` so checkpoint-scale PUF samples respect raw - `S006` weights before variable mapping instead of uniformly sampling raw - PUF rows - - matched broader donor rerun: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_donors/broader-donors-puf-weight-v1` - - improves capped full-oracle loss from `0.7908917500` to - `0.7681656356` - - matched broader no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_nodonors/broader-nodonors-puf-weight-v1` - - improves capped full-oracle loss from `0.7995775732` to - `0.7683205208` - - read: - - this is a direct incumbent-alignment fix, not a challenger modeling - tweak - - it improves the big metric more than the recent calibration-planner - experiments - - after the fix, age and AGI still dominate capped-error mass, but the - worst individual cells shift toward ACA PTC and rental/interest tails - - experiment index: - - created `artifacts/experiment_index.jsonl` - - records the intervention artifact, baseline artifact, big metric delta, - and kept/rejected decision for the recent matched experiments - - top-3 deferred families is now rejected again under the improved upstream - source sample: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_top3family_donors/broader-donors-puf-weight-top3family-v1` - - regresses capped full-oracle loss from `0.7681656356` to - `0.8021818710` - - read: - - ACA still belongs in the focused deferred family set under the new - source sample, even though ACA-family loss itself remains ugly - - CPS `state x age-band` checkpoint floor: - - added optional `state_age_floor` support to CPS checkpoint sampling and - promoted `state_age_floor=1` into the default checkpoint query builder - - matched broader donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1` - - improves capped full-oracle loss from `0.7681656356` to - `0.7329149849` - - matched broader no-donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_nodonors/broader-nodonors-cps-stateage1-v1` - - improves 
capped full-oracle loss from `0.7683205208` to - `0.7368409543` - - stage attribution on the broader donor artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_donors/tmp_broader_puf_weight_donor_stage_attribution_20260412.json` - - read: - - `seed` and `synthetic` are identical on the PE oracle for this path, - so the remaining age/AGI miss is entering before synthesis - - calibration still reduces age/AGI/EITC substantially, but it worsens - ACA and rental - - the state-age floor is the first upstream CPS support tweak that - materially improves the big metric on both donor and no-donor runs - - comparative read: - - this is a real improvement under the corrected oracle, not a stale-manifest - artifact - - `tax_unit_count|domain=eitc_child_count` drops out of the top-3 residual - families after the rerun - - the remaining leading families are now age counts and AGI count families, - with leading geographies `OR`, `WI`, and `MI` -- Durable comparison artifact: - - `artifacts/tmp_eitc_recipient_oracle_large_nodonors_comparison_20260411.json` - -- Corrected full-oracle accounting: - - `full_oracle_*` metrics now include explicit penalty mass for unsupported - targets instead of silently scoring only the supported subset - - supported-only summaries remain available as separate diagnostics -- Corrected deferred-stage control flow: - - a skipped deferred stage no longer aborts later scheduled stages -- Current default PE-oracle rebuild policy: - - dense first calibration pass - - one deferred support-10 pass - - deferred-pass cap `24` - - deferred pass always considered - - deferred pass focused to the top 3 deferred families and top 4 deferred - geographies - - deferred pass only retained if capped full-oracle loss improves -- Fresh evidence after the correction: - - medium source checkpoint: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1` - - 
`918` calibrated rows - - capped full-oracle loss `2.3931` - - stage 2 skipped under the new `2.45` trigger - - donor-inclusive source checkpoint: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_donors/donors-source-corrected-oracle-v1` - - `918` calibrated rows - - capped full-oracle loss `2.3940` - - active-solve capped loss `2.0969` - - stage 2 also skipped under the new `2.45` trigger - - larger donor-inclusive source checkpoint: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_large_donors/large-donors-source-corrected-oracle-v1` - - source mix: - `cps_asec_2023 + irs_soi_puf_2024 + acs_2022 + sipp_tips_2023 + sipp_assets_2023 + scf_2022` - - `4859` calibrated rows - - `490` active constraints after the feasibility filter - - capped full-oracle loss `2.4331` - - active-solve capped loss `2.7178` - - deferred stage still skipped under the new `2.45` trigger - - matched larger no-donor source checkpoint: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_large_nodonors/large-nodonors-source-corrected-oracle-v1` - - source mix: - `cps_asec_2023 + irs_soi_puf_2024` - - `4859` calibrated rows - - `487` active constraints after the feasibility filter - - capped full-oracle loss `2.4329` - - active-solve capped loss `2.7284` - - deferred stage also skipped under the new `2.45` trigger -- larger replayed saved artifacts: - - `4859` rows: capped full-oracle loss `0.6803`, stage 2 skipped - - `24686` rows: capped full-oracle loss `1.9845`, stage 2 skipped -- Current interpretation: - - the corrected metric still preserves the useful tiny-run stage-2 gain - - at medium and above, the deferred pass should usually not fire under the - current incumbent-compatible default - - the fresh `4859`-row donor-inclusive source build lands very close to the - trigger, so `2.45` now looks like a real boundary value rather than a loose - conservative 
skip rule - - at this `2000/2000` source scale, donor surveys are basically neutral on - corrected full-oracle loss: - - donors: `2.4331` - - no donors: `2.4329` - - donors slightly improve active-solve loss but do not improve the - full-oracle score - - follow-up compiler diagnosis: - - the dominant remaining full-oracle families were not actually calibration - misses; they were `tax_unit_count` targets with person-entity domain - filters such as `dividend_income > 0` and `tax_unit_is_filer == 1` - - PE defines those domain variables on `person`, while the old compiler only - supported cross-entity filters for household targets - - extending the compiler to align `person -> tax_unit/family/spm_unit` - boolean filters removes that structural unsupported wall - - replay after the compiler fix on the saved `4859`-row large source - artifacts: - - supported targets move from `4070` to `4642` - - unsupported targets drop from `572` to `0` - - capped full-oracle replay loss falls from about `2.43` to about `1.33` - - donor vs no-donor remains effectively neutral on the replayed full-oracle - metric: - - donors: `1.3267` - - no donors: `1.3264` - - fresh large no-donor source rerun after the compiler fix: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1` - - active constraints rise from `487` to `540` - - supported targets rise from `487` to `540` within the solve - - unsupported targets drop from `572` to `0` on the full oracle - - capped full-oracle loss falls from `2.4329` to `1.3274` - - active-solve capped loss improves slightly from `2.7284` to `2.6923` - - deferred stage still skips, now because the trigger metric is - `1.3274 < 2.45` - - fresh large donor-inclusive source rerun after the compiler fix: - - artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_donors/large-donors-cross-entity-fix-v1` - - capped full-oracle loss lands at 
`1.3277` - - active-solve capped loss lands at `2.6825` - - unsupported targets remain `0` - - donor inclusion is still basically neutral on the broad oracle at this scale - - added saved-artifact oracle summaries: - - recurring family/geography summary: - `artifacts/tmp_policyengine_oracle_regressions_cross_entity_fix_20260411.json` - - exact worst-cell drilldown: - `artifacts/tmp_policyengine_oracle_target_drilldown_cross_entity_fix_20260411.json` - - residual reading after the compiler fix: - - the largest remaining full-oracle families are now - `person_count|domain=age`, - `tax_unit_count|domain=eitc_child_count`, - `person_count|domain=adjusted_gross_income`, - `tax_unit_count|domain=adjusted_gross_income`, - `tax_unit_count|domain=salt`, and `aca_ptc|domain=aca_ptc` - - the leading geographies are `state:OR`, `state:GA`, and `state:MO` - - concrete worst cells inside those geographies include: - - `tax_exempt_interest_income` in `OR` - - AGI count targets in `OR` and `MO` - - ACA PTC in `OR`, `GA`, and `MO` - - EITC child-count and SALT targets in `GA` - - pass-through income in `MO` - - next work should target those residual families/geographies directly, not - more deferred-stage threshold tuning - - controlled smoke A/B on stored-input tails: - - accepted interest/rental conditioning change: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1` - - matched old-semantics baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_oldsemantics/smoke-nodonors-asset-tail-old-semantics-v1` - - rejected property-cost extension: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_v2/smoke-nodonors-asset-tail-conditioning-v2` - - outcome: - - the accepted change is a small honest win on the smoke A/B: - capped full-oracle loss improves from `1.4417803` to `1.4414441` - - 
active-solve capped loss also improves from `1.8878380` to `1.8829362` - - the capped stored-input mass attributed to - `tax_exempt_interest_income` in the top drilldown falls from `40` to `20` - - extending the same pattern to property-tax variables was worse and was - reverted: capped full-oracle loss rose to `1.4489770` - - tested a separate interest-family decomposition path and rejected it: - - medium no-donor candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_medium_nodonors/medium-nodonors-interest-family-v1` - - matched large no-donor confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_large_nodonors/large-nodonors-interest-family-v1` - - matched large no-donor baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1` - - reading: - - the idea looked good at medium scale - - it does not hold at `2000/2000` - - capped full-oracle loss worsens from `1.3274` to `1.3555` - - raw full-oracle loss worsens from `2256.6` to `16980.7` - - active-solve capped loss worsens from `2.6923` to `2.8229` - - reverted the code change; default path stays on separate - `taxable_interest_income` and `tax_exempt_interest_income` - - tested donor-support sampling without replacement and rejected it: - - rejected smoke artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_donor_support_sampling_smoke_nodonors/smoke-nodonors-donor-support-sampling-v1` - - baseline smoke artifact: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1` - - reading: - - capped full-oracle loss worsens from `1.4414` to `1.6369` - - active-solve capped loss worsens from `1.8829` to `2.7402` - - keep donor-support sampling with replacement - - rejected rental export normalization from donor-integrated components: - - the saved large no-donor seed 
already carries - `rental_income_positive` and `rental_income_negative` - - replaying that saved seed with export-side normalization looked promising: - - capped full-oracle loss improves from `1.3274` to `1.3169` - - active-solve capped loss improves from `2.6923` to `2.6877` - - but the fresh `2000/2000` large no-donor source checkpoint failed: - - baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_rental_export_large_nodonors/large-nodonors-rental-export-v1` - - capped full-oracle loss worsens from `1.3274` to `1.3874` - - active-solve capped loss worsens from `2.6923` to `2.7722` - - active constraints fall from `540` to `522` - - verdict: do not keep this change in the default path; source checkpoints - override replay-only wins - - rejected direct zero-support-mask propagation in zero-inflated donor rank - matching: - - idea: - - the QRF path already trains a zero model for zero-inflated positives - - let final donor rank matching use the generated `scores > 0` support mask - instead of donor positive-rate counts - - rationale: - - this looked like a clean way to stop final rank matching from - reintroducing positive tail support after the zero model had already - predicted zeros - - but the fresh `2000/2000` large no-donor source checkpoint failed: - - baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260411_zero_support_mask_large_nodonors/large-nodonors-zero-support-mask-v1` - - capped full-oracle loss worsens from `1.3274` to `1.9223` - - active-solve capped loss worsens from `2.6923` to `4.3296` - - active constraints rise from `540` to `703` - - verdict: reject; do not replace donor-rate positive counts with the - generated zero mask in the default path 
- -## 2026-03-28 - -- The US country pack now consumes more shared core benchmark infrastructure. -- `benchmark_metrics` in `src/microplex_us/policyengine/comparison.py` now delegates to shared `normalize_metric_payload(...)` instead of hand-building `TargetMetric`. -- `src/microplex_us/policyengine/harness.py` now builds suites from shared result-oriented core helpers rather than local payload plumbing. -- `src/microplex_us/pipelines/local_reweighting.py` remains the thin adapter over core reweighting bundles and solver. -- `_materialize_policyengine_us_variables_one_by_one(...)` in `src/microplex_us/policyengine/us.py` was fixed to chain successful materialized outputs forward, so dependency chains work in fallback mode. -- US-specific legacy targets DB implementation now lives here instead of core: - - `src/microplex_us/targets_database.py` -- `src/microplex_us/pipelines/experiments.py` now has a first-class `n_synthetic` sweep helper: - - `build_us_n_synthetic_sweep_experiments(...)` - - `run_us_microplex_n_synthetic_sweep(...)` -- The performance-session experiment path now respects experiment-level `n_synthetic` and `random_seed` overrides instead of silently using the outer harness defaults. -- The corrected parity benchmark showed the real local gap is `state_programs_core`, not district slices. -- Current diagnosis: - - `cps_puf_500_auto_conditions_support_match` loses on state Medicaid and SNAP targets. - - the larger saved `cps_5000_puf_500_nsynthetic_5000_state_stratified_bootstrap` artifact is not a healthy counterexample; its calibrated weights collapse to near-zero mass, so it should not be used as evidence that scaling fixed the state gap. 
-- Worst current `state_programs_core` misses for `cps_puf_500_auto_conditions_support_match` are concentrated in a small set of zero/near-zero states: - - Medicaid: GA (`state_fips=13`), WV (`54`), AZ (`4`), OR (`41`), VT (`50`), TX (`48`), AK (`2`), RI (`44`) - - SNAP: IA (`19`), OR (`41`), NH (`33`), WI (`55`) - - candidate zeros by source: Medicaid `10/51`, SNAP `8/51` -- The saved `500_best` and `5000_state_stratified` artifacts are not comparable on scaffold richness: - - `500_best` seed carries `has_medicaid`, `public_assistance`, and `ssi` - - `5000_state_stratified` seed does not -- `src/microplex_us/pipelines/us.py` now prefers scaffold sources that carry state-program support proxies (`has_medicaid`, `public_assistance`, `ssi`, `social_security`) before falling back to raw observed-column count. -- `synthesis_metadata` now records `state_program_support_proxies.available/missing` so artifact triage can see whether a run ever had Medicaid/SNAP support proxies in the scaffolded seed. -- `src/microplex_us/pipelines/us.py` now records explicit household/person weight diagnostics in `calibration_summary`, including effective sample size, tiny-weight share, and a `weight_collapse_suspected` flag so broken calibration runs are obvious in saved manifests. -- `src/microplex_us/pipelines/registry.py` now carries `calibration_converged` and `weight_collapse_suspected`, and frontier selection ignores runs flagged as weight-collapsed. 
-- A direct CPS scaffold A/B on `state_programs_core` confirms scaffold richness matters at fixed `n_synthetic=500`: - - stripped parquet CPS scaffold (`cps_asec_parquet`): candidate MARE `1.1675`, composite parity loss `1.0630` - - rich cached CPS scaffold (`cps_asec_2023`): candidate MARE `0.7861`, composite parity loss `0.7257` - - both compared against the same PE baseline (`0.4682` MARE, `0.4530` composite) -- The rich cached CPS scaffold is materially better specifically because it carries `has_medicaid`, `public_assistance`, `ssi`, and `social_security`. This is now a confirmed causal lever, not just a suspicion from artifact comparison. -- The next empirical question is whether that scaffold gain survives once PUF is added back in and `n_synthetic` is increased beyond `500`. -- PE-US bridge fix landed after that A/B: - - `src/microplex_us/policyengine/us.py` now exports `ssi` into temporary PE datasets when available. - - `src/microplex_us/pipelines/us.py` no longer lets fallback `employment_income_before_lsr` absorb `ssi` or `public_assistance` when explicit wages are missing. -- Interpretation: older state-program benchmark runs understate what a rich CPS scaffold can do, because they were dropping a program-relevant PE input (`ssi`) at the export boundary. 
-- Direct-override policy alignment: - - do not model around `*_reported` variables here - - PE rules should remain canonical by default; direct program overrides should be explicit, not automatic - - `src/microplex_us/policyengine/us.py` now supports explicit direct-override variable names in `build_policyengine_us_export_variable_maps(...)`, so callers can intentionally short-circuit with values like `snap` or `ssi` when they mean to -- Slack context for that policy lives in: - - `#us-snap` thread on PR `policyengine-us#7858` removing `snap_reported` - - `#mfb-policy-engine` thread stating that callers should pass direct values like `snap`/`tanf` when they want to short-circuit, rather than rely on `*_reported` -- Tonight's post-diagnosis empirical check on `state_programs_core`: - - current rich CPS-only run (`n_synthetic=500`, default PE rules): candidate MARE `0.9530`, baseline MARE `0.4682`, candidate composite `0.8616`, baseline composite `0.4530` - - explicit `candidate_direct_override_variables=('ssi',)` made no observable difference on that slice - - mixed rich CPS + PUF runs are better than current CPS-only: - - `n_synthetic=500`: candidate MARE `0.8198`, composite `0.7495` - - `n_synthetic=2000`: candidate MARE `0.7808`, composite `0.7129` - - but both still lose clearly to the PE baseline on `state_programs_core` -- Interpretation: - - richer scaffold and more rows help - - explicit `ssi` short-circuiting is not the lever - - the remaining gap still looks like real state-program support / structure, not a simple PE-bridge switch -- Canonical artifact discipline tightened: - - `src/microplex_us/pipelines/site_snapshot.py` now builds a site-facing snapshot directly from one saved artifact bundle (`manifest.json` + `policyengine_harness.json`). - - Canonical website input now lives at `artifacts/site_snapshot_us.json`, not in `tmp_*.json` diagnostics. 
- - New blessed version-bump benchmark command: - - `uv run microplex-us-version-bump-benchmark --output-root ... --cps-parquet-dir ... --targets-db ... --baseline-dataset ...` - - The command can also refresh the canonical site snapshot with `--site-snapshot-path /Users/maxghenis/PolicyEngine/microplex-us/artifacts/site_snapshot_us.json`. -- Enforcement direction: - - scratch diagnostics can still exist, but the website should only read the canonical snapshot file - - versioned benchmark runs should emit manifest + harness + registry entry, then optionally refresh the canonical snapshot - -## Current review bar - -- Prefer pushing reusable benchmark/evaluation abstractions into `microplex`. -- PE-US materialization changes need focused regression coverage. -- Be skeptical of any benchmark delta that does not clearly state whether it is common-target or full-set based. - -## Known remaining risks - -- `src/microplex_us/policyengine/us.py` is still a large concentration of concerns. -- Composite-loss reporting and generic suite MARE are both present; do not conflate them. -- Future tax-unit endogeneity work will likely force another boundary review with core. - -## 2026-03-29 - -- US artifact persistence and site snapshot generation now validate saved bundles against the shared core manifest contract before using them. -- The shared contract is intentionally structural: - - top-level manifest keys - - required benchmark summary keys for harness-backed bundles - - referenced artifact files must exist -- This means the website snapshot path now fails fast on incomplete saved bundles instead of quietly reading partial manifests. -- Canonical version-bump benchmarking now refreshes the site snapshot by default. - - `uv run microplex-us-version-bump-benchmark ...` writes to `artifacts/site_snapshot_us.json` unless `--site-snapshot-path` overrides it. 
-- Added deterministic snapshot freshness check: - - `uv run microplex-us-check-site-snapshot artifacts/site_snapshot_us.json` -- Added GitHub Actions workflow: - - `.github/workflows/site-snapshot.yml` -- CI design is intentionally narrow: - - checkout `microplex-us` plus sibling core `microplex` - - run focused snapshot/version-benchmark tests - - regenerate the canonical snapshot from its source artifact and fail if the committed JSON differs - -## 2026-03-29 state-program follow-up - -- US `state_programs_core` diagnosis tightened: - - the remaining gap is concentrated in repeated low-mass states across both Medicaid and SNAP, not just one program family - - on the `n=2000` diagnostic slice, candidate MARE is still materially worse than baseline: - - overall `0.8252` vs `0.4682` - - Medicaid `0.8766` vs `0.3098` - - SNAP `0.7738` vs `0.6265` - - current failure mode is severe under-support, not unsupported targets: - - `supported_target_rate = 1.0` - - `candidate_zero_count = 0` for both domains in the focused diagnostics - - worst states are often at `~0.1%` to `~3%` of target mass -- The pipeline now preserves state-program support proxies through synthesis by default instead of only carrying them implicitly in richer multi-source target sets: - - `src/microplex_us/pipelines/us.py` now auto-promotes available `has_medicaid`, `public_assistance`, `ssi`, and `social_security` columns into `condition_vars` - - this applies to the normal single-source CPS path as well as multi-source runs - - focused regression coverage now pins both paths in `tests/pipelines/test_us.py` -- The PE-US parity suite semantics were corrected for the state SNAP leg: - - `src/microplex_us/policyengine/harness.py` now uses `household_count` with domain `snap` in `state_programs_core` - - this matches the slice description (`recipiency`) and aligns with the district SNAP slice instead of treating state SNAP as a dollar-total benchmark - - focused regression coverage now pins the slice 
filters in `tests/policyengine/test_harness.py` -- Current interpretation: - - household-weight-only calibration is not failing to compile these targets - - the bigger ceiling is synthetic support expressiveness and source coverage - - real CPS/PUF source coverage is still structurally thin for this problem: - - real CPS carries proxies like `has_medicaid`, `public_assistance`, `ssi`, `social_security` - - real CPS/PUF does not provide real `snap` values for donor integration - - Medicaid still enters as proxy support rather than a native target-aligned source variable -- Likely next move: - - rerun the corrected comparable state slice after the proxy-preservation fix - - then decide whether the next investment is: - - stronger source/backbone support for program participation, or - - a richer non-household weight entity path for US local calibration -- Focused rerun on the saved `n=2000` candidate with the corrected `state_programs_core` semantics: - - candidate MARE `0.8492` - - PE baseline MARE `0.7298` - - delta `+0.1194` (PE still better) - - candidate composite parity loss `0.7754` - - PE baseline composite parity loss `0.7408` - - supported targets `102` for both - - target win rate `29.41%` -- Interpretation of that rerun: - - the old state SNAP amount/count mismatch was materially inflating the apparent local gap - - correcting the slice semantics narrows the loss substantially - - but it does not remove the underlying state-program weakness - - next reruns should use the corrected count-based state SNAP slice as canonical -- Fresh real-source rerun after the proxy-preserving synthesis change: - - output saved at `artifacts/tmp_state_programs_corrected_rerun_20260329.json` - - source mix: `cps_asec_2023 + irs_soi_puf_2024` - - sample size: `500` source households / tax units - - corrected state slice only - - results: - - `n_synthetic=500`: candidate MARE `0.9619`, baseline MARE `0.7298`, delta `+0.2321`, candidate composite `0.8678` - - 
`n_synthetic=2000`: candidate MARE `0.8729`, baseline MARE `0.7298`, delta `+0.1432`, candidate composite `0.7925` - - both runs preserved the proxies in synthesis `condition_vars`: - - `age`, `sex`, `education`, `employment_status`, `state_fips`, `tenure`, `has_medicaid`, `public_assistance`, `ssi`, `social_security` - - both runs were healthy enough numerically: - - no weight collapse - - all `102` corrected state targets supported -- Interpretation of the fresh rerun: - - preserving the CPS state-program proxies through synthesis is not enough to beat PE on the corrected state slice - - scaling from `500` to `2000` still helps, but only modestly - - the remaining gap now looks even more like a structural source/backbone problem than a lost-proxy problem - - specifically: - - real CPS/PUF still lacks true SNAP donor support - - Medicaid still enters mostly as proxy support rather than a target-native source variable - - household-weight-only calibration can rescale what exists, but cannot create the missing state-program structure - -2026-03-29 -- Scope reviewed: - - US `state_programs_core` after focused Claude review - - DB calibration feasibility vs solver non-convergence - - proxy semantics and synthesizer-path safety -- What changed: - - DB calibration now applies a feasibility filter before solving: - - config supports `policyengine_calibration_max_constraints` - - config supports `policyengine_calibration_max_constraints_per_household` - - config supports `policyengine_calibration_min_active_households` - - calibration summaries now record: - - `n_constraints_before_feasibility_filter` - - `n_constraints_after_feasibility_filter` - - low-support / over-capacity drops - - weight diagnostics now flag low effective-sample-ratio collapse, not just tiny-weight share - - registered semantic specs for: - - `has_medicaid` - - `public_assistance` - - `ssi` - - `social_security` - - fixed a core synthesizer bug where zero-inflated variables with all-zero training 
support could crash on inverse transform during sampling -- New canonical bootstrap rerun with the feasibility filter: - - output saved at `artifacts/tmp_state_programs_feasible_bootstrap_rerun_20260329.json` - - exact calibration DB: `/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/calibration/policy_data.db` - - corrected state-only calibration + benchmark scope: - - variables: `household_count`, `person_count` - - domains: `snap`, `medicaid_enrolled` - - geo level: `state` - - results: - - `n_synthetic=500` - - candidate MARE `0.9232` - - PE baseline MARE `0.7386` - - delta `+0.1846` - - candidate composite `0.8358` - - PE composite `0.7704` - - target win rate `33.33%` - - feasibility filter reduced constraints `102 -> 81` - - `n_synthetic=2000` - - candidate MARE `0.7335` - - PE baseline MARE `0.7386` - - delta `-0.0051` - - candidate composite `0.6770` - - PE composite `0.7704` - - target win rate `37.25%` - - feasibility filter reduced constraints `102 -> 100` -- Interpretation: - - the Claude review was directionally right that calibration feasibility mattered more than the earlier “backbone only” diagnosis - - once the state-program solve stops trying to absorb an infeasible flat constraint set, the `n=2000` CPS+PUF bootstrap run slightly beats PE on the corrected state slice - - this does not prove the final production architecture is solved, but it does show the immediate local gap was not just a source-support story - - remaining open issues: - - synthesizer-backed state-program reruns still need a clean end-to-end pass - - proxy preservation alone is not the main lever; feasible calibration is -- Follow-up synthesizer unblock: - - fixed core zero-inflated inverse-transform handling when a target has all-zero training support - - fixed `ensure_target_support()` to coerce boolean exemplar values before writing back into numeric synthetic columns - - added a real synthesizer-path regression with the promoted state-program 
proxy condition vars - - synthesizer rerun output saved at `artifacts/tmp_state_programs_feasible_synth_rerun_20260329.json` - - results: - - `n_synthetic=500` - - candidate MARE `0.8918` - - PE baseline MARE `0.7386` - - delta `+0.1533` - - candidate composite `0.8143` - - PE composite `0.7704` - - target win rate `29.41%` - - `n_synthetic=2000` - - candidate MARE `0.6811` - - PE baseline MARE `0.7386` - - delta `-0.0574` - - candidate composite `0.6481` - - PE composite `0.7704` - - target win rate `42.16%` -- Updated interpretation: - - feasible calibration was the main missing lever - - once the solve is narrowed to the corrected state-program target estate, both bootstrap and synthesizer improve sharply - - the synthesizer path now also clears PE at `n=2000`, and by a healthier margin than bootstrap - - the remaining US state-program work should now focus on: - - stabilizing this feasible-target calibration path - - deciding whether to keep the default cap at `1.0 * household_count` or tune it lower - - then broadening back out carefully instead of returning to the flat 3,611-constraint solve - -2026-03-29 — focused code review (Claude agent team) -- Scope: state-program accuracy work across microplex-us and microplex core -- Top findings: - 1. **Critical**: all saved artifacts show `converged: false` — headline n=2000 results are on unconverged weights. The "win" vs PE is narrow and not reliable. - 2. **High**: `min_active_households=1` lets degenerate single-household constraints through. Raise to 5-10. - 3. **High**: `has_medicaid` uses `BOUNDED_SHARE` but is binary — should be `ZERO_INFLATED_POSITIVE`. - 4. **High**: `ensure_target_support()` bool fix is correct but only guarantees 1 exemplar per category — not enough for calibration. - 5. **Medium**: zero project-level tests in microplex-us; zero direct unit tests for core transform fix. 
-- Diagnosis assessment: calibration infeasibility was a real blocker, but the deeper root cause is sparse small-state sample coverage (n=2000 across 51 states). Feasibility filtering delays the reckoning but doesn't resolve it. -- Benchmark assessment: corrected state-only path is valid as a diagnostic slice but should not replace the full canonical benchmark. Results are directionally encouraging but not credible until calibration converges. -- Top 3 next fixes: - 1. Add small-state oversampling floor (min 10 households/state) to bootstrap/synthesis - 2. Raise `min_active_households` to 5-10, warn when >20% constraints dropped - 3. Write regression tests for feasibility filter, ensure_target_support, condition var promotion, harness slice stability - -2026-03-29 -- Review handoff workflow: - - durable pending Claude review request now lives at `reviews/PENDING_CLAUDE_REVIEW.md` - - full Claude reviews should be written under `reviews/` - - `_BUILD_LOG.md` should keep only concise review summaries - - intended short Claude instruction is now just: - - `Please execute the pending review request in /Users/maxghenis/PolicyEngine/microplex-us/reviews/PENDING_CLAUDE_REVIEW.md` - -2026-03-29 -- Follow-up after focused review findings: - - tightened calibration feasibility defaults: - - `policyengine_calibration_min_active_households` now defaults to `5` - - feasibility diagnostics now record total dropped constraints, drop share, and warning messages - - calibration summaries now surface warnings for heavy feasibility dropping and non-convergence - - adjusted proxy handling: - - `has_medicaid` now uses `ZERO_INFLATED_POSITIVE` semantics - - only `has_medicaid` is auto-promoted into synthesis condition vars by default - - `public_assistance`, `ssi`, and `social_security` now remain synthesis targets instead of inflating the condition space - - core transform fallback now warns when a zero-inflated variable has no positive training support -- Focused verification: - - 
`microplex-us` focused pipeline tests: `13 passed` - - `microplex-us` variable semantics tests: `13 passed` - - `microplex` synthesizer tests: `17 passed` - - Ruff clean on touched files -- Updated corrected state-only reruns with stricter defaults: - - bootstrap artifact: `artifacts/tmp_state_programs_feasible_bootstrap_rerun_20260329.json` - - `n=2000`: candidate MARE `0.8094`, PE MARE `0.7386` - - `n=2000`: candidate composite `0.7408`, PE composite `0.7704` - - `n=2000`: `converged=false`, feasibility filter dropped `25/102` constraints (`24.5%`) - - interpretation: bootstrap no longer beats PE under the stricter floor - - synthesizer artifact: `artifacts/tmp_state_programs_feasible_synth_rerun_20260329.json` - - `n=2000`: candidate MARE `0.6910`, PE MARE `0.7386` - - `n=2000`: candidate composite `0.6537`, PE composite `0.7704` - - `n=2000`: `converged=false`, feasibility filter dropped `3/102` constraints (`2.9%`) - - interpretation: synthesizer still edges PE on the corrected state slice, but the solve is still unconverged, so this remains directional evidence rather than a settled win - -2026-03-29 -- PE-native mission-metric setup: - - `microplex-us` now has a real broad PE-native scorer in `src/microplex_us/pipelines/pe_native_scores.py` - - saved artifacts can persist `policyengine_native_scores.json` plus a `policyengine_native_scores` summary block in `manifest.json` - - `run_registry.jsonl` now understands: - - `candidate_enhanced_cps_native_loss` - - `baseline_enhanced_cps_native_loss` - - `enhanced_cps_native_loss_delta` - - unweighted MSRE companions - - canonical US version-bump flow now requires native scoring and ranks on `candidate_enhanced_cps_native_loss` -- Important boundary: - - the exact broad `enhanced_cps` native loss is now the primary PE mission metric - - PE local validation does not expose one single final scalar; the correct follow-up is a `validate_staging.py` wrapper plus saved `validation_results.csv` / summary JSON, not a fake 
“local PE loss” -- Focused verification: - - `tests/pipelines/test_pe_native_scores.py` - - `tests/pipelines/test_version_benchmark.py` - - `tests/pipelines/test_artifacts.py` - - `tests/pipelines/test_registry.py` - - result: `13 passed` - - Ruff clean on scorer/artifact/registry/version-benchmark files - -2026-03-29 -- PE-native mission loop tightened: - - canonical saved US version-bump flow now ranks frontier runs on `enhanced_cps_native_loss_delta`, not absolute candidate native loss - - saved native-score summaries now include an explicit `candidate_beats_baseline` flag - - `run_registry.jsonl` carries that boolean as `candidate_beats_baseline_native_loss` - - saved artifacts append to the registry even when only PE-native scoring is available and harness scoring is absent - - `microplex-us-version-benchmark` now supports `--require-beat-pe-native-loss` to fail fast when a run still loses on PE's own broad native loss -- Focused verification: - - `tests/pipelines/test_pe_native_scores.py` - - `tests/pipelines/test_version_benchmark.py` - - `tests/pipelines/test_registry.py -k "native_loss_frontier_selection or append_and_load_us_microplex_run_registry"` - - `tests/pipelines/test_artifacts.py -k "policyengine_native_scores_when_available"` - - Ruff clean on the touched scorer/artifact/registry/version-benchmark files - -2026-03-29 -- Historical PE-native backfill support: - - added `src/microplex_us/pipelines/backfill_pe_native_scores.py` - - new CLI: `microplex-us-backfill-pe-native-scores` - - backfill upgrades old bundles by writing `policyengine_native_scores.json`, updating `manifest.json`, and rebuilding `run_registry.jsonl` / `run_index.duckdb` for that artifact root -- Focused verification: - - `tests/pipelines/test_backfill_pe_native_scores.py` - - `tests/pipelines/test_pe_native_scores.py` - - `tests/pipelines/test_version_benchmark.py` - - `tests/pipelines/test_artifacts.py -k "policyengine_native_scores_when_available"` - - 
`tests/pipelines/test_registry.py -k "native_loss_frontier_selection or append_and_load_us_microplex_run_registry"` - - Ruff clean on the touched backfill/scorer/artifact/registry/version-benchmark files -- Important mission finding: - - backfilled `/artifacts/live_cps_puf_three_fixes_20260326/20260326T131756Z-4eaab451` - - despite beating PE on its own narrow saved harness (`candidate MARE 0.1737` vs baseline `0.1881`), it is catastrophic on PE's true broad native loss: - - candidate native loss `27.8382` - - PE baseline native loss `0.01748` - - delta `+27.8207` - - implication: the mission is not “go back to the older narrow tax-target config”; current broad/native-aligned candidates are much closer to PE even when they still lose - -2026-03-29 -- PE-native target-estate and local mission-loop wiring: - - added named exact-cell target profile support in `src/microplex_us/policyengine/target_profiles.py` - - added first mission profile: `pe_native_broad` - - provider now accepts exact `target_cells` filters through `TargetQuery.provider_filters` - - `USMicroplexBuildConfig` and local performance configs now carry `policyengine_target_profile` / `policyengine_calibration_target_profile` - - canonical `microplex-us-version-benchmark` now defaults both target-profile flags to `pe_native_broad` - - local performance harness can now optionally export the candidate and score PE-native broad loss directly via `evaluate_pe_native_loss=True` -- Important finding: - - for the current production target DB, `pe_native_broad` is exactly the active `national+state` surface: - - all geos: `37,755` - - national+state: `4,183` - - `pe_native_broad` profile: `4,183` - - so the value of the profile today is not a smaller target estate; it is making the mission surface explicit and future-stable, while excluding district/local drift from the canonical version-bump path -- Focused verification: - - targeted provider/pipeline/profile/version-benchmark/performance tests: `22 passed` - 
- `tests/pipelines/test_performance.py`: `13 passed` - - Ruff clean on touched target-profile/provider/pipeline/performance/version-benchmark files - -2026-03-29 -- Mission-loop throughput fix: - - `run_us_microplex_performance_harness()` was already computing PE-native scores, but `save_us_microplex_artifacts()` ignored them and recomputed the full PE-native scorer again while writing the bundle - - added `precomputed_policyengine_harness_payload` / `precomputed_policyengine_native_scores` passthrough support to artifact saving - - `run_us_microplex_source_experiments()` now forwards `performance_result.parity_run.to_dict()` and `performance_result.pe_native_scores` into the artifact saver - - implication: future sweeps stop paying the PE-native scorer twice per candidate -- PE-native broad target mix (from current scorer outputs + `policyengine-us-data` calibration targets): - - kept targets: `2,853` - - split: `677 national` / `2,176 state` - - state-heavy families are the real mission surface: - - age by state: `900` - - AGI bins by state: `918` - - SNAP state cost/households: `102` - - ACA spending/enrollment: `102` - - Medicaid enrollment: `51` - - real estate taxes by state: `51` - - state population: `51` - - implication: beating PE on the broad native loss requires state age/AGI structure, not just fixing SNAP/Medicaid -- Focused verification: - - `tests/pipelines/test_artifacts.py -k "precomputed_policyengine_native_scores or writes_policyengine_native_scores_when_available"`: `2 passed` - - `tests/pipelines/test_experiments.py -k "performance_session"`: `1 passed` - - Ruff clean on touched artifact/experiment files - -2026-03-29 -- Performance-harness scope fix for PE-native broad runs: - - found a real mission-loop bug: `USMicroplexPerformanceHarnessConfig` had hardcoded default target filters for five national tax variables, and those defaults were still applied even when `target_profile='pe_native_broad'` - - effect: the first live `cps+puf-rich` 
"broad" run under `/artifacts/live_pe_native_cps_puf_rich_sweep_20260329` was not actually broad; it calibrated only 5 national targets and produced a misleading PE-native score (`candidate native loss 1.1437` vs baseline `0.02024`) - - fixed `src/microplex_us/pipelines/performance.py` so named target profiles can own the scope unless the caller explicitly overrides variables/domains/geo levels - - parity/cache paths now read the resolved build scope, not stale config defaults - - relaunched the true broad mission run at `/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329` -- Focused verification: - - `tests/pipelines/test_performance.py -k "preserves_target_profiles or warm_us_microplex_parity_cache"`: `3 passed` - - Ruff clean on touched performance/test files - -2026-03-29 -- Corrected broad PE-native result (`cps+puf-rich`, `sample_n=500`, `n_synthetic=2000`): - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329/20260329T175330Z-057066af` - - the scope is now correct: `policyengine_target_profile='pe_native_broad'` with no extra variable/geo filters - - PE-native broad loss is still far from PE: - - candidate native loss `0.95856` - - PE baseline native loss `0.02024` - - delta `+0.93832` - - kept targets `2,817` (`641 national`, `2,176 state`) - - calibration remains the dominant failure mode on the broad mission surface: - - `converged=false` - - `1,413` supported constraints out of `4,183` loaded targets - - feasibility filter dropped `2,198 / 3,611` candidate constraints (`60.9%`) - - mean error `0.9234` - - implication: the PE-native mission is still primarily a scale/support problem; fixing the profile bug was necessary, but not enough -- Next live run: - - launched a larger broad mission candidate at `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_cps_puf_rich_broad_scaled_20260329` - - config: `sample_n=5000`, `n_synthetic=10000`, 
`target_profile='pe_native_broad'`, native loss only - -2026-03-29 -- PE-native scorer instrumentation: - - `src/microplex_us/pipelines/pe_native_scores.py` now supports `family_breakdown` in both single-candidate and batch native-loss scoring - - current family classifier covers the broad PE-native estate at the level we care about operationally: - - `state_age_distribution` - - `state_agi_distribution` - - `state_snap_cost` - - `state_snap_households` - - `state_medicaid_enrollment` - - `state_aca_spending` - - `state_aca_enrollment` - - `state_population` - - `state_population_under_5` - - `state_real_estate_taxes` - - plus national census / IRS / JCT / SSA / net-worth families - - goal: stop treating PE-native broad loss as one opaque scalar and identify which families dominate the mission gap -- Focused verification: - - `tests/pipelines/test_pe_native_scores.py`: `3 passed` - - Ruff clean on touched native-score files - -2026-03-29 -- Wired sparse/L0-style calibration into the actual PE-backed DB solve path: - - `src/microplex/calibration.py` now lets `SparseCalibrator` and `HardConcreteCalibrator` accept explicit `LinearConstraint` rows and report `linear_errors` / `converged` in the same shape as the classical calibrator - - `src/microplex_us/pipelines/us.py` now builds calibrators through one shared backend factory, so `policyengine_targets_db` calibration can use `sparse` and `hardconcrete` instead of hard-rejecting everything except `entropy/ipf/chi2` - - added focused regressions in: - - `microplex/tests/test_sparse_calibrator.py` - - `microplex/tests/test_sparse_calibration_comparison.py` - - `microplex-us/tests/pipelines/test_us.py` -- Focused verification: - - `microplex/tests/test_sparse_calibrator.py`, `microplex/tests/test_sparse_calibration_comparison.py`, `microplex/tests/test_calibration.py`: `48 passed` - - `microplex-us/tests/pipelines/test_us.py -k calibrate_policyengine_tables_from_db`: `4 passed` - - Ruff clean on touched core + US files 
-- Mission follow-up: - - attempted a broad sparse-vs-entropy sweep at `sample_n=5000`, `n_synthetic=10000`, but the first broad PE-native score alone was slow enough that it is not a practical overnight tuning loop yet - - replaced it with a smaller first broad sparse diagnostic at `sample_n=1000`, `n_synthetic=2000`, `target_sparsity=0.1`; result pending in `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pe_native_broad_sparse_n2000_20260329.json` - -2026-03-29 -- First broad sparse PE-native diagnostic landed: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pe_native_broad_sparse_n2000_20260329.json` - - result is much worse than entropy on the mission surface: - - candidate native loss `633.9884` - - PE baseline native loss `0.0202` - - delta `+633.9681` - - calibration summary: - - backend `policyengine_db_sparse` - - supported constraints `1,314 / 4,183` - - feasibility filter dropped `2,297 / 3,611` candidate constraints (`63.6%`) - - the dominant family blowups are not just Medicaid/SNAP: - - `state_agi_distribution` - - `state_age_distribution` - - `state_aca_spending` - - `state_aca_enrollment` - - `state_medicaid_enrollment` - - implication: the current sparse/L0-style solve path is not ready for the broad PE-native mission loop; it is a diagnostic branch, not a candidate frontier path -- Throughput fix for future mission sweeps: - - `src/microplex_us/pipelines/artifacts.py` now supports deferring native scoring when saving a batch of experiment bundles - - `src/microplex_us/pipelines/backfill_pe_native_scores.py` now has grouped batch backfill via `compute_batch_us_pe_native_scores(...)` - - `src/microplex_us/pipelines/experiments.py` now saves multi-experiment performance batches first, batch-scores native loss once per baseline, rebuilds the registry, and refreshes experiment results/frontier entries from the rebuilt registry - - goal: stop paying the fixed PE-native baseline/scorer cost candidate-by-candidate in 
experiment sweeps -- Focused verification: - - `tests/pipelines/test_experiments.py`, `tests/pipelines/test_backfill_pe_native_scores.py`: `10 passed` - - Ruff clean on touched artifact/backfill/experiment files - -2026-03-29 -- Native-only experiment throughput fix: - - the first batched `pe_native_broad` source/synthesis compare showed that `save_us_microplex_artifacts(...)` was still generating full `policyengine_harness.json` sidecars even when the performance run had `evaluate_parity=False` - - that was wasted work for the PE-native mission loop and produced huge harness files (`~100MB`) before native batch scoring even started - - fixed by threading `defer_policyengine_harness` through: - - `src/microplex_us/pipelines/artifacts.py` - - `src/microplex_us/pipelines/experiments.py` - - performance-session experiment batches now skip harness generation when there is no precomputed parity payload, while still deferring native scoring and backfilling it in batch -- Focused verification: - - `tests/pipelines/test_experiments.py::test_run_us_microplex_source_experiments_can_use_performance_session` - - `tests/pipelines/test_artifacts.py::TestSaveUSMicroplexArtifacts::test_can_defer_policyengine_harness_generation` - - Ruff clean on touched artifact/experiment files - - Current live run: - - relaunched the four-way PE-native broad compare on the no-harness path at `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_broad_entropy_batch_noharness_20260329` - - matrix: - - `cps-only-bootstrap` - - `cps-only-synthesizer` - - `cps-puf-bootstrap` - - `cps-puf-synthesizer` - - shared config: - - `sample_n=1000` - - `n_synthetic=2000` - - `calibration_backend='entropy'` - - `target_profile='pe_native_broad'` - -2026-03-29 -- First live donor-imputer A/B on the real PE-native broad mission path: - - added explicit donor-imputer backend switching in `src/microplex_us/pipelines/us.py` - - runtime now supports `donor_imputer_backend='maf' | 'qrf' | 'zi_qrf'` - - 
`qrf` / `zi_qrf` use a new columnwise forest-based donor imputer rather than the existing flow-based `Synthesizer` - - added focused route coverage in `tests/pipelines/test_us.py` -- Smoke-test result on `cps_asec_2023 + puf_2024`, `sample_n=500`, `n_synthetic=2000`, `target_profile='pe_native_broad'`, `calibration_backend='entropy'`: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_donor_backend_ab_pe_native_broad_20260329.json` - - `maf`: - - candidate native loss `0.8958` - - baseline native loss `0.02024` - - delta `+0.8755` - - calibration `converged=false` - - supported constraints `1,391` - - feasibility filter dropped `2,220 / 3,611` constraints (`61.5%`) - - `zi_qrf`: - - candidate native loss `0.9278` - - baseline native loss `0.02024` - - delta `+0.9076` - - calibration `converged=false` - - supported constraints `1,459` - - feasibility filter dropped `2,152 / 3,611` constraints (`59.6%`) -- Immediate read: - - the widened imputation eval winner (`zi_qrf`) did not improve total PE-native broad loss on the live runtime path; it made the smoke-test result slightly worse than `maf` - - translation caveat is likely real: the runtime donor-imputed variables on this path are mostly PUF tax variables (`capital_gains`, `dividends`, `interest`, `pension`, etc.), not the broader survey-support surfaces emphasized by the widened eval - - next control is plain `qrf` on the same path to see whether the miss is the zero-inflated gate or the whole forest donor-imputer branch -- Plain `qrf` control on the same config: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_donor_backend_qrf_pe_native_broad_20260329.json` - - candidate native loss `0.8931` - - baseline native loss `0.02024` - - delta `+0.8728` - - calibration `converged=false` - - supported constraints `1,398` - - feasibility filter dropped `2,213 / 3,611` constraints (`61.3%`) -- Current runtime read: - - `qrf` is slightly better than `maf` on PE-native broad total 
loss in this smoke test (`0.8931` vs `0.8958`) - - `zi_qrf` is worse than both (`0.9278`) - - none of these are remotely close to PE yet, so this is only a runtime-direction result, not a candidate-frontier change -- QRF control on the same live path: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_donor_backend_qrf_pe_native_broad_20260329.json` - - `qrf`: - - candidate native loss `0.8931` - - baseline native loss `0.02024` - - delta `+0.8728` - - calibration `converged=false` - - supported constraints `1,398` - - feasibility filter dropped `2,213 / 3,611` constraints (`61.3%`) -- Updated read: - - on the current live PE-native broad smoke test, plain `qrf` slightly beat the existing `maf` runtime donor path, while `zi_qrf` was worse - - ordering on this path was `qrf` (`0.8931`) better than `maf` (`0.8958`) better than `zi_qrf` (`0.9278`) - - the `qrf` vs `maf` gap is tiny and all three runs remain `converged=false`, so this is not enough to justify a production switch - - the widened eval is still useful, but it should not directly drive the PE-native production switch without a closer mission-surface benchmark - -2026-03-29 -- Broad PE-native family diagnosis: - - the huge broad-loss gap is not primarily a donor-imputer issue - - in `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_broad_entropy_batch_noharness_20260329/20260329T210427Z-057066af/policyengine_native_scores.json`, the top loss contributors are: - - `national_irs_other` `+0.2839` - - `state_agi_distribution` `+0.1893` - - `state_age_distribution` `+0.1860` - - `national_population_by_age` `+0.0605` - - `national_census_other` `+0.0445` - - `state_aca_spending` `+0.0333` - - donor-imputer choice only moves total broad loss by about `0.035` end-to-end (`0.8931` to `0.9278`), while the gap to PE is still about `0.87` - - current live donor-imputation only affects a 31-variable PUF tax block, so most of the broad native-loss delta is coming from seams 
outside the donor-imputer switch -- Failed bootstrap-target-scope experiments: - - tried auto-inferring profile-driven bootstrap strata from `pe_native_broad` - - full profile strata (`state_fips`, `age_group`, `income_bracket`) made broad loss worse: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_profile_strata_pe_native_broad_20260329.json` - - candidate native loss `0.9371` - - delta `+0.9169` - - narrower state-only profile strata also made broad loss worse: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_state_strata_pe_native_broad_20260329.json` - - candidate native loss `0.9373` - - delta `+0.9170` - - conclusion: bootstrap stratification is not the missing broad-native lever here; the attempted default inference was reverted after the smoke tests - -2026-03-29 -- Broad PE-native structural export diagnosis: - - found a real upstream household-structure bug on the broad path: saved calibrated rows still carried healthy `family_relationship`, but `relationship_to_head` had already collapsed to mostly `{0,3}`, and `build_policyengine_entity_tables()` was preserving that bad column - - first fix: when `family_relationship` is richer than `relationship_to_head`, prefer it during PE-entity construction - - second fix: repair incoherent household relationship patterns before tax-unit construction so each household has exactly one head and at most one spouse - - before the repair on the saved broad artifact (`20260329T210427Z-057066af`): - - `4774` tax units for `4774` people - - filing status all `SINGLE` - - `1170 / 2000` households had no head at all - - after the repair on the same saved artifact: - - `4650` tax units for `4774` people - - filing status distribution `{'SINGLE': 4529, 'JOINT': 119, 'HEAD_OF_HOUSEHOLD': 2}` - - `0 / 2000` households with no head - - `0 / 2000` households with multiple heads - - quick PE probe on the repaired `cps+puf` broad export: - - `income_tax_sum` moved from 
`105.41B` to `104.01B` - - `tax_unit_is_filer_sum` moved from `4.889M` to `4.793M` - - raw IRS person-income sums like `qualified_dividend_income`, `taxable_interest_income`, and `taxable_pension_income` were unchanged, so this fix primarily affects filing/tax-unit structure rather than person-level donor values -- Broad donor/entity semantics diagnosis: - - several IRS donor-integrated inputs in `variables.py` were still marked tax-unit-native even though current `policyengine_us` defines them as person variables - - patched the confirmed person-native set: - - `dividend_income` - - `ordinary_dividend_income` - - `qualified_dividend_income` - - `non_qualified_dividend_income` - - `taxable_interest_income` - - `tax_exempt_interest_income` - - `taxable_pension_income` - - `taxable_social_security` - - `self_employment_income` - - `student_loan_interest` - - also moved `DIVIDEND_DONOR_BLOCK_SPEC` to `native_entity=PERSON` - - this stops the donor path from projecting those inputs onto tax units with default `FIRST` -- Verification: - - focused relationship tests in `tests/pipelines/test_us.py`: passed (`4`) - - focused variable-semantics tests in `tests/test_variables.py`: passed (`4`) - - Ruff clean on touched files -- Next step: - - clean PE-native broad rescoring is still running on the repaired `cps+puf` export to quantify how much the broad loss actually moves from these two structural fixes - -2026-03-29 -- Broad PE-native rescore on repaired `cps+puf` export: - - persisted repaired export: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_cps_puf_broad_relationship_entity_fix_20260329.h5` - - direct PE-native broad scoring under `policyengine-us-data` showed: - - candidate loss `0.9386384097643049` - - same kept-target surface as before (`2817` = `641` national + `2176` state) - - comparison to the saved pre-fix `cps+puf` broad artifact (`20260329T210540Z-057066af`): - - pre-fix candidate loss `0.9369853544124408` - - post-fix candidate loss 
`0.9386384097643049` - - change `+0.0016530553518641` (slightly worse) - - interpretation: - - the relationship/head repair and confirmed person-native IRS semantic fixes corrected real structural bugs - - but on this saved `cps+puf` broad candidate they did not improve the mission metric - - broad PE-native loss is still dominated by seams outside this export-structure fix, especially the already-identified `national_irs_other`, `state_agi_distribution`, and `state_age_distribution` families - -2026-03-29 -- PE pre-sim parity audit against `source_imputed_stratified_extended_cps_2024.h5`: - - added reusable audit helper: - - `src/microplex_us/pipelines/pre_sim_parity.py` - - `tests/pipelines/test_pre_sim_parity.py` - - real audit artifact written to: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_audit_20260329.json` - - saved broad candidate audited: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_broad_entropy_batch_noharness_20260329/20260329T210427Z-057066af/policyengine_us.h5` - - key findings: - - candidate schema recall vs PE pre-sim input surface is only `35 / 165 = 21.2%` - - missing critical pre-sim inputs include: - - `county_fips` - - `cps_race` - - `is_hispanic` - - `is_disabled` - - `rent` - - `real_estate_taxes` - - `net_worth` - - `has_esi` - - `has_marketplace_health_coverage` - - candidate tax-unit structure is still pathological pre-sim: - - `share_multi_person_tax_units = 0.0` - - reference `share_multi_person_tax_units = 0.446` - - candidate state-by-age pre-sim support recall is only `0.627` - - `576 / 918` nonempty `(state, 5-year-age-bin)` cells - - worst missing states by cell count include DC (`11`), WY (`56`), SD (`46`), VT (`50`) - - several mission-relevant IRS donor inputs have zero positive support in the candidate while PE pre-sim has real mass, notably: - - `long_term_capital_gains_before_response` - - `partnership_s_corp_income` - - `farm_income` - - interpretation: - - the 
broad PE-native gap is not just calibration - - we are feeding PE a far thinner and structurally weaker pre-sim dataset than PE-US-data feeds itself - - next step: - - build a parity-focused fix list around missing pre-sim inputs and tax-unit structure before spending more cycles on donor-backend A/B tests - -2026-03-29 -- PE pre-sim parity follow-up: - - re-exported the saved broad candidate under current code to isolate export/handoff vs upstream candidate quality: - - candidate source tables: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_broad_entropy_batch_noharness_20260329/20260329T210427Z-057066af/calibrated_data.parquet` - - re-exported H5: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_reexport_20260329.h5` - - updated audit: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_reexport_20260329.json` - - compared with the original saved candidate H5 audit: - - common PE pre-sim vars improved from `35` to `39` - - schema recall improved from `0.2121` to `0.2364` - - recovered exactly these PE inputs in the H5 handoff: - - `cps_race` - - `is_hispanic` - - `rent` - - `real_estate_taxes` - - missing critical vars dropped from: - - `county_fips`, `cps_race`, `is_hispanic`, `is_disabled`, `rent`, `real_estate_taxes`, `net_worth`, `has_esi`, `has_marketplace_health_coverage` - - to: - - `county_fips`, `is_disabled`, `net_worth`, `has_esi`, `has_marketplace_health_coverage` - - candidate tax-unit structure improved slightly under current entity-table/export code: - - `share_multi_person_tax_units` from `0.0` to `0.0260` - - interpretation: - - the export bridge was a real part of the problem, but not the dominant one - - after current-code re-export, the remaining broad gap is clearly upstream of H5 writing -- CPS pre-sim source-surface restoration: - - updated `src/microplex_us/data_sources/cps.py` so raw CPS loads now carry the same core CPS-derived pre-sim inputs that 
`policyengine-us-data` uses: - - `county_fips` from household `GTCO` - - `cps_race` from `PRDTRACE` - - `is_hispanic` from `PRDTHSP != 0` - - `is_disabled` from the CPS disability flags (`PEDISDRS`, `PEDISEAR`, `PEDISEYE`, `PEDISOUT`, `PEDISPHY`, `PEDISREM`) - - `has_esi` from `NOW_GRP == 1` - - `has_marketplace_health_coverage` from `NOW_MRK == 1` - - also tightened processed-cache freshness so stale cached CPS parquet will rebuild if those PE-style pre-sim columns are missing - - verified in `tests/test_cps_source_provider.py` (`6 passed`, Ruff clean) - - this is aimed at future broad reruns; it does not retroactively change the already-saved broad artifact - -2026-03-29 -- Fresh current-code parity audit correction: - - the earlier `tmp_pre_sim_parity_reexport_20260329.h5/json` pair turned out to be stale for entity-structure conclusions - - rebuilt a fresh current-code export directly from the saved broad `calibrated_data.parquet`: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_tax_unit_recheck_20260329.h5` - - audit: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_reexport_fresh_20260329.json` - - corrected fresh-audit findings: - - schema overlap is unchanged from the re-export check: - - `39 / 165` common PE pre-sim vars - - schema recall `0.2364` - - missing critical vars remain: - - `county_fips` - - `is_disabled` - - `net_worth` - - `has_esi` - - `has_marketplace_health_coverage` - - but entity structure is substantially healthier than the stale re-export audit implied: - - `tax_unit_rows = 2807` - - mean tax-unit size `1.7007` - - `share_multi_person_tax_units = 0.3997` - - `share_multi_person_households = 0.687` - - state-age support recall is still only `0.627` - - interpretation: - - current code no longer appears to be collapsing tax-unit membership at the PE export boundary - - the remaining pre-sim parity gap is now more clearly about: - - missing CPS-derived inputs that are not yet present upstream 
(`county_fips`, `is_disabled`, `has_esi`, `has_marketplace_health_coverage`) - - missing wealth input (`net_worth`) - - thin `(state, age)` support before calibration - -2026-03-29 -- CPS pre-sim parity smoke test on the real broad mission metric: - - ran a fresh CPS-only broad PE-native smoke build with the updated raw CPS loader and real PE targets DB: - - provider: `CPSASECSourceProvider(year=2023)` - - calibration DB: `/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/calibration/policy_data.db` - - PE baseline: `/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5` - - config: `sample_n=500`, `n_synthetic=2000`, `target_profile='pe_native_broad'`, `calibration_target_profile='pe_native_broad'`, `evaluate_pe_native_loss=True` - - result: - - candidate broad PE-native loss `0.9058149122381814` - - PE baseline `0.020243908529428433` - - delta `+0.885571003708753` - - calibration still `converged=false` - - feasibility filter still dropped `2506 / 3611` constraints (`69.4%`) - - comparison to the earlier CPS-only broad bootstrap frontier run: - - earlier saved candidate loss `0.9233365911702252` - - improvement from restored CPS pre-sim inputs `-0.0175216789320438` - - interpretation: - - restoring PE-style CPS pre-sim inputs is directionally correct and measurably improves the real mission metric - - but it is not remotely sufficient on its own; the remaining broad gap is still dominated by other structural issues - -2026-03-29 -- PE export + relationship parity corrections: - - updated `src/microplex_us/policyengine/us.py` so the PE export whitelist now includes pre-sim inputs we already carry upstream: - - `cps_race` - - `is_hispanic` - - `is_disabled` - - `rent` - - `real_estate_taxes` - - `has_esi` - - `has_marketplace_health_coverage` - - `net_worth` - - added a narrow export alias only for `race -> cps_race`; dropped the lossy raw `hispanic -> is_hispanic` rename - - updated 
`src/microplex_us/pipelines/us.py` so PE-oriented person-input augmentation now derives exact PE-native columns before export: - - `cps_race` from `race` - - `is_hispanic` from CPS-coded `hispanic` - - fixed `family_relationship` normalization to handle the common CPS 1-based coding per household: - - `1=head`, `2=spouse`, `3=child`, `4=other` - - this was the real reason rebuilt tax units had been collapsing toward singletons on many CPS-shaped households - - fixed `prepare_seed_data_from_source()` to preserve household `county_fips` instead of dropping it during the household-person merge - - focused verification: - - `tests/test_cps_source_provider.py`: `6 passed` - - `tests/pipelines/test_pre_sim_parity.py`: `1 passed` - - `tests/pipelines/test_us.py -k 'prepare_seed_data or build_policyengine_entity_tables or derives_tax_input_columns'`: `10 passed` - - `tests/policyengine/test_us.py -k 'export_variable_maps or projects_frame'`: `5 passed` - - Ruff clean on touched CPS / pipeline / PE-export files -- fresh current-code re-export from the saved broad candidate: - - candidate H5: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_export_fix_candidate_20260329.h5` - - parity audit: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_export_fix_audit_20260329.json` - - native score: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_parity_export_fix_native_score_20260329.json` - - key results: - - schema recall remains `0.2364` (`39 / 165`) - - missing critical vars are now: - - `county_fips` - - `is_disabled` - - `net_worth` - - `has_esi` - - `has_marketplace_health_coverage` - - candidate tax-unit structure is now materially healthier under current code: - - `tax_unit_rows = 2807` - - mean tax-unit size `1.7007` - - `share_multi_person_tax_units = 0.3997` - - broad PE-native loss on the repaired re-export is: - - candidate `0.9339483631287737` - - PE baseline `0.020243908529428433` - - delta 
`+0.9137044545993452` - - interpretation: - - the PE handoff really was broken in specific ways, and the repaired handoff is more faithful now - - but even a substantially healthier export/tax-unit structure only buys a small broad-loss improvement on the saved candidate - - the dominant remaining gap is still upstream of export, especially: - - missing pre-sim input surfaces - - thin state-age support - - weak IRS / AGI cell mass before calibration - -2026-03-29 -- current-code CPS-only broad PE-native drilldown: - - built and exported the exact current-code CPS-only candidate H5: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_cps_only_currentcode_candidate_20260329.h5` - - broad smoke result: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_cps_only_currentcode_pe_native_broad_20260329.json` - - candidate broad PE-native loss `0.9159877997083388` - - PE baseline `0.020243908529428433` - - delta `+0.8957438911789103` - - exact worst targets: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pe_native_broad_worst_targets_currentcode_cps_20260329.json` - - pre-sim surface compare against PE's source-imputed CPS: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pre_sim_surface_compare_currentcode_cps_20260329.json` - - state-mass compare: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_state_mass_compare_currentcode_cps_20260329.json` -- main findings: - - `state_age_distribution` is a real large driver, not a scorer artifact: - - current-code candidate has only `434` nonempty `(state, 5-year-age-bin)` cells vs `911` in PE's source-imputed CPS - - many large exact cells are literally zero, e.g.: - - `state/census/age/PA/20-24`: candidate `0.0` vs target `798,935` - - `state/census/age/FL/40-44`: candidate `0.0000275` vs target `1,434,863` - - `state/census/age/TX/15-19`: candidate `0.0000626` vs target `2,198,388` - - `national_irs_other` is being driven by literal zeroed IRS surfaces: - - candidate 
has `0.0` on high-value exact targets where PE baseline is near-target, e.g.: - - `nation/irs/total pension income/total/AGI in 20k-25k/taxable/All` - - `nation/irs/qualified dividends/total/AGI in -inf-inf/taxable/All` - - `nation/irs/partnership and s corp income/total/AGI in 75k-100k/taxable/All` - - `nation/irs/adjusted gross income/total/AGI in 500k-1m/taxable/Single` - - `nation/irs/capital gains gross/total/AGI in 30k-40k/taxable/All` - - pre-sim IRS surface compare confirms the upstream mass problem: - - candidate weighted positive-share is `0.0` for `capital_gains_gross` - - candidate weighted positive-share is `0.0` for `partnership_and_s_corp_income` - - candidate weighted positive-share is `0.0` for `total_pension_income` - - candidate has no tax-unit mass above `$1m` AGI, while PE reference has weighted share `0.0597` - - `state_agi_distribution` is a mix of state-mass collapse and AGI-tail distortion: - - worst exact misses include: - - `state/MD/adjusted_gross_income/count/-inf_1`: candidate `127,417` vs target `40,530` - - `state/MS/adjusted_gross_income/count/500000_inf`: candidate `23,033` vs target `8,170` - - many state amount cells are still exactly zero, e.g.: - - `state/WY/adjusted_gross_income/amount/100000_200000` - - `state/WV/adjusted_gross_income/amount/500000_inf` - - `state/DC/adjusted_gross_income/amount/75000_100000` - - weighted state mass itself is heavily distorted before calibration: - - candidate state share ratios vs PE reference are effectively zero in some states: - - TN (`~6.1e-10`) - - SD (`~6.4e-10`) - - NV (`~9.5e-10`) - - large states are also badly underweighted: - - TX share ratio `0.0929` - - FL `0.3971` - - while some states are materially overweighted: - - VA `3.06` - - MA `2.36` - - GA `2.30` -- interpretation: - - the dominant broad-loss problem is now clearly upstream population/state allocation and missing IRS surface mass before calibration - - PE-native scorer correctness looks much less suspicious than 
candidate structure/support - - the next high-leverage fixes are: - - restore missing IRS/tax-unit mass (`capital_gains_gross`, `partnership_and_s_corp_income`, `total_pension_income`, high-AGI filers) - - repair state allocation before calibration - - then revisit ACA/coverage surfaces, which also show extreme exact misses (`nation/irs/aca_spending/hi`, `state/irs/aca_enrollment/hi`) - -## 2026-03-29 weighted-source sampling checkpoint - -- current-code donor path diagnosis: - - the critical PUF IRS variables are *not* disappearing in the live `cps+puf` build anymore - - a direct mini-build trace shows `qualified_dividend_income`, `long_term_capital_gains`, `partnership_s_corp_income`, `total_pension_income`, `taxable_pension_income`, and `taxable_interest_income` all survive: - - raw PUF frame - - donor integration into `seed_data` - - bootstrap `synthetic_data` - - `calibrated_data` - - that means the old zero-surface failure was a saved-artifact issue, not the current-code seam -- source loader fix: - - `CPSASEC` and `PUF` `sample_n` subsampling now use weight-aware sampling without replacement when there are enough positive-weight rows - - this is now covered by focused provider regressions for both CPS and PUF -- mission-surface effect: - - patched `cps+puf + qrf + bootstrap` broad PE-native smoke: - - candidate loss `0.8894089161` - - PE baseline `0.0202439085` - - delta `+0.8691650076` - - prior comparable `qrf` smoke was `0.8930645879` - - so the weighted-source patch improved broad loss by about `0.00366` -- remaining constraints: - - the same patched candidate still drops `2387 / 3611` calibration constraints (`66.1%`) - - a patched `cps+puf` pre-sim audit still only reaches `453 / 918` nonempty `(state, age-bin)` cells, support recall `0.493` -- interpretation: - - weight-aware source sampling is a real but small win - - it is not enough to close the broad-loss gap - - the remaining bottleneck is still structural state support / state allocation plus 
unconverged broad calibration, not donor-variable passage - -## 2026-03-29 weighted-source scale checkpoint - -- broad PE-native result on weighted `cps+puf + qrf + bootstrap` with `sample_n=1000`, `n_synthetic=2000`: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample1000_pe_native_broad_20260329.json` - - candidate loss `0.8696287975` - - PE baseline `0.0202439085` - - delta `+0.8493848890` -- this improves the weighted `sample_n=500` comparable run (`0.8894089161`) by about `0.01978` -- calibration also got a little healthier: - - dropped constraints improved from `2387 / 3611` to `2301 / 3611` - - feasibility-drop share improved from `66.1%` to `63.7%` -- family improvements are concentrated exactly where we need them: - - `state_age_distribution`: `-0.00579` loss-contribution delta improvement - - `state_agi_distribution`: `-0.00579` - - `national_irs_other`: `-0.00438` - - `national_population_by_age`: `-0.00158` -- pre-sim support also improved materially at this scale: - - `sample_n=500`: state-age support recall `0.464`, nonempty cells `426` - - `sample_n=1000`: state-age support recall `0.598`, nonempty cells `549` -- interpretation: - - scaling the source sample is a much stronger lever than the small weighted-subsampling patch alone - - the next main-line bet should stay on this axis: weighted-source path + larger `sample_n` - - state-stratified bootstrap still looks like the wrong direction at this sample size - -## 2026-03-29 broad-loop reversal checkpoint - -- weighted `cps+puf + qrf + bootstrap` with `sample_n=1000`, `n_synthetic=5000` regressed materially: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample1000_n5000_pe_native_broad_20260329.json` - - candidate loss `0.8907772820` vs the stronger `sample_n=1000`, `n_synthetic=2000` result `0.8696287975` - - calibration feasibility looked *broader* but fit quality got worse: - - dropped constraints improved to `1807 / 
3611` (`50.0%`) - - but `weight_collapse_suspected = true` - - household effective sample ratio collapsed to `0.165` - - median household weight collapsed to `~1.37e-08` -- family-level regression from `1000/2000` to `1000/5000` is narrow, not broad-based: - - `national_irs_other`: `+0.01510` - - `state_agi_distribution`: `+0.00899` - - `state_aca_spending`: `+0.00133` - - meanwhile `state_age_distribution` *improved* slightly (`-0.00293`) -- exact target regressions confirm the failure mode is filer/tax/ACA structure, not generic state-age support: - - huge regressions in: - - high-AGI IRS bins (`1m+`, `500k-1m`) - - Head of Household bins - - business/capital-gains/taxable-interest cells - - state ACA spending cells - - a few extreme state high-AGI cells like `state/VT/adjusted_gross_income/amount/500000_inf` -- interpretation: - - more synthetic rows from the same support base destabilize broad PE-native fit - - this is not a monotone “more `n_synthetic` is better” regime - - for broad PE-native loss, the current bottleneck is tax/filer structure stability plus calibration interaction - -## 2026-03-29 source-mix and donor-path checkpoint - -- weighted `cps+puf + qrf + bootstrap` with `sample_n=2000`, `n_synthetic=2000` was worse, not better: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample2000_pe_native_broad_20260329.json` - - candidate loss `0.9251676593` - - supported constraints `1280` vs `1310` on the better `1000/2000` run - - household calibrated weight total `6.24M` vs `10.37M` on the better `1000/2000` run - - mean constraint error `0.879` vs `0.795` -- the raw weighted CPS source sample is not the obvious culprit: - - `sample_n=1000`: weight sum `4.37M`, `50` states - - `sample_n=2000`: weight sum `8.70M`, all `51` states -- the raw PUF source is effectively national-only in this path, which is expected: - - `state_count = 1` on the sampled PUF household table -- donor-condition audit for the PUF path on the 
current best `cps+puf` run: - - scaffold: `cps_asec_2023` - - selected donor condition vars are only: - - `age` - - `interest_income` - - `rental_income` - - `self_employment_income` - - `sex` - - `social_security` - - `unemployment_compensation` - - importantly, `state_fips` is *not* entering the PUF donor match -- `cps-only` isolation at the same `sample_n=2000`, `n_synthetic=2000` size: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_cps_only_sample2000_pe_native_broad_20260329.json` - - candidate loss `0.8846092807` - - this is much better than `cps+puf` at the same size (`0.9251676593`) - - but still worse than the best current broad run (`cps+puf`, `1000/2000`, `0.8696287975`) -- pre-sim parity at `sample_n=2000`, `n_synthetic=2000` also points the same way: - - `cps+puf`: state-age support recall `0.6100`, multi-person tax-unit share `0.3885` - - `cps-only`: state-age support recall `0.6296`, multi-person tax-unit share `0.4090` -- interpretation: - - the current PUF donor path is harming the broad PE-native mission surface at `sample_n=2000` - - the harm is not coming from `state_fips` being used in donor matching - - the sharper hypothesis is that donor-imputing tax/filer surfaces like `filing_status_code` from only a weak seven-variable numeric condition set is destabilizing `national_irs_other` and related ACA/high-AGI families - -## 2026-03-29 diagnostics tooling checkpoint - -- added reusable PE-native target-delta comparison helper in `src/microplex_us/pipelines/pe_native_scores.py` - - purpose: compare exact target-level weighted-loss deltas between two candidate H5s without ad hoc one-off scripts - - exported via `src/microplex_us/pipelines/__init__.py` - - covered in `tests/pipelines/test_pe_native_scores.py` -- focused verification: - - `pytest -q tests/pipelines/test_pe_native_scores.py` -> `4 passed` - - `ruff check` on the touched scorer/export/test files -> clean - -## In flight - -- direct ablation still running: - - 
`cps+puf`, weighted `qrf + bootstrap`, `sample_n=1000`, `n_synthetic=2000` - - but skip donor integration of `filing_status_code` - - output target: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_no_filing_status_pe_native_broad_20260329.json` -- this is the cleanest immediate test of the current filer-structure hypothesis. - -## 2026-03-29 filing-status donor checkpoint - -- the `filing_status_code` ablation landed and improved the real mission metric: - - baseline broad run: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample1000_pe_native_broad_20260329.json` - - candidate loss `0.8696287975` - - no-filing ablation: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_no_filing_status_pe_native_broad_20260329.json` - - candidate loss `0.8596198236` - - improvement: `-0.010009` -- the gains are concentrated in the same broad families we already care about: - - `national_irs_other`: `-0.00358` - - `state_aca_spending`: `-0.00281` - - `state_agi_distribution`: `-0.00136` - - `national_population_by_age`: `-0.00091` -- pre-sim parity did **not** improve on state-age support: - - best broad run: state-age support recall `0.5980` - - no-filing ablation: `0.5643` - - interpretation: this is a tax/filer-structure win, not a generic coverage win -- exported tax-unit structure changed modestly in the healthier direction: - - best broad run: - - `filing_status` shares `SINGLE 59.6%`, `JOINT 35.1%`, `HOH 5.3%` - - mean tax-unit size `1.7266` - - multi-person tax-unit share `0.4038` - - no-filing ablation: - - `filing_status` shares `SINGLE 58.0%`, `JOINT 37.8%`, `HOH 4.2%` - - mean tax-unit size `1.7432` - - multi-person tax-unit share `0.4199` -- raw PUF confirms why the donor path is risky here: - - `filing_status_code` exists only in PUF, not in the CPS scaffold seed - - raw sampled PUF distribution is strongly categorical and skewed: - - `JOINT 1112`, `SINGLE 316`, `HOH 103`, `SEPARATE 25` - - current donor logic was 
treating `filing_status_code` as a generic continuous donor target under weak shared numeric conditions -- code change: - - `src/microplex_us/pipelines/us.py` now supports `donor_imputer_excluded_variables` - - exclusion remains opt-in; do **not** make `filing_status_code` the default exclusion until the result is reproducible - - `synthesis_metadata` now records `donor_excluded_variables` - - focused test added in `tests/pipelines/test_us.py` -- next likely tax/filer ablation candidates, if broad loss plateaus here: - - `eitc_children` - - `exemptions_count` - - possibly other PUF-only count/categorical surfaces before touching zero-inflated amount variables - -## 2026-03-29 filing-status reproducibility warning - -- the supported-path rerun of the same broad `qrf + bootstrap` idea with opt-in exclusion - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_excluded_filing_status_config_pe_native_broad_20260329.json` - - candidate loss `1.3717579152` - - this is much worse than both: - - the earlier one-off no-filing artifact `0.8596198236` - - the ordinary broad run `0.8696287975` -- family comparison against the earlier no-filing artifact says the regression is dominated by: - - `national_irs_other` `+0.4980` - - `state_aca_spending` `+0.0040` - - `state_age_distribution` `+0.0031` - - `national_population_by_age` `+0.0019` - - `state_agi_distribution` `+0.0017` -- pre-sim parity also diverged materially: - - earlier no-filing artifact: - - state-age support recall `0.5643` - - state count `50` - - mean tax-unit size `1.7432` - - multi-person tax-unit share `0.4199` - - supported-path rerun: - - state-age support recall `0.5795` - - state count `48` - - mean tax-unit size `1.6550` - - multi-person tax-unit share `0.3808` -- interpretation: - - the `filing_status_code` exclusion hook is worth keeping for controlled ablations - - but the win is **not yet reproducible enough** to set as the default mission path - - treat this as a 
reproducibility / run-path discrepancy that needs explanation before widening tax/filer exclusions - -## 2026-03-29 deterministic PUF age fix - -- found a concrete reproducibility bug in `src/microplex_us/data_sources/puf.py` - - the live PUF path does **not** have `age` or `AGE_HEAD` after the demographics merge - - so `map_puf_variables()` falls back to `_impute_age()` - - `_impute_age()` was adding Gaussian noise with unseeded `np.random.normal(...)` -- that means identical broad `cps+puf + qrf + bootstrap + entropy` runs could differ before donor integration and calibration even with the same configured seed -- patch: - - `map_puf_variables(..., random_seed=...)` - - `_impute_age(..., random_seed=...)` - - `_build_puf_tax_units(..., random_seed=...)` - - `PUFSourceProvider.load_frame()` now passes provider `random_seed` through to the age-imputation fallback -- regression coverage: - - `tests/test_puf_source_provider.py::test_map_puf_variables_seed_controls_age_imputation` - - `tests/test_puf_source_provider.py::test_puf_source_provider_age_imputation_is_reproducible_with_same_seed` -- validation after the patch: - - two same-seed exported H5s from the broad baseline path - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_postfix_rebuild_a_20260329.h5` - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_postfix_rebuild_b_20260329.h5` - - have identical pre-sim parity metrics: - - state-age nonempty cells `571` - - state-age support recall `0.6220` - - mean tax-unit size `1.7212` - - multi-person tax-unit share `0.4013` - - and identical exported variable arrays across the full common H5 surface (`different_variable_count = 0`) -- implication: - - same-config A/Bs on the patched path are now much more trustworthy - - do not interpret older `cps+puf` broad comparisons as fully clean unless they were built after this fix - -## 2026-03-30 filing-status exclusion confirmed on deterministic path - -- direct PE-native broad rescoring of 
the deterministic no-filing artifact: - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_postfix_no_filing_20260329.h5` - - candidate loss `0.8677052580` - - PE baseline `0.0202439085` - - delta `+0.8474613495` -- this is a real improvement over the deterministic patched baseline export: - - patched baseline `0.9286499637` - - improvement from excluding donor-imputed `filing_status_code`: `0.0609447056` (`6.56%`) -- top remaining family deltas on the improved no-filing candidate are still: - - `national_irs_other` `+0.2473` - - `state_agi_distribution` `+0.1822` - - `state_age_distribution` `+0.1807` - - `national_population_by_age` `+0.0560` - - `national_census_other` `+0.0449` - - `state_aca_spending` `+0.0315` -- compared with the deterministic patched baseline, excluding `filing_status_code`: - - strongly improves several IRS/HOH/high-income cells - - but also worsens some ACA spending / ACA enrollment state cells -- pre-sim signal: - - `filing_status` is exported and used directly in PE-US-data SOI loss masks - - `exemptions_count` and `eitc_children` are **not** on the exported H5 input surface right now, so they are not the immediate next exclusion candidates -- action: - - restore `donor_imputer_excluded_variables=("filing_status_code",)` as the default in `USMicroplexBuildConfig` - - keep investigating the ACA regressions, because this fix helps broad loss overall but is not yet sufficient on its own - -## 2026-03-30 leafified default PE export surface - -- tightened `SAFE_POLICYENGINE_US_EXPORT_VARIABLES` in `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/policyengine/us.py` - - dropped default export of PE computed/add variables that already have leaf inputs on our surface: - - `employment_income` - - `self_employment_income` - - `pension_income` - - `social_security` - - `interest_income` - - `dividend_income` - - `capital_gains` - - `filing_status` - - kept leaf replacements already present on the surface, plus 
`rent` as the deliberate stored-input exception -- added a regression in `/Users/maxghenis/PolicyEngine/microplex-us/tests/policyengine/test_us.py` - - default export-map test no longer expects tax-unit `filing_status` - - new guard checks that the default export whitelist does not overlap PE formula/add/subtract variables except the explicit `rent` exception -- focused verification: - - `pytest -q tests/policyengine/test_us.py -k 'export_variable_maps or avoids_formula_aggregates'` -> `5 passed` - - `ruff check src/microplex_us/policyengine/us.py tests/policyengine/test_us.py` -> clean -- post-change audit against live `policyengine-us` metadata: - - default computed-variable overlap is now only `[('rent', True, False)]` -- interpretation: - - this aligns `microplex-us` much more closely with the PE-US-data “store leaf inputs, not recomputed aggregates” rule - - `filing_status` remains available as an explicit direct override if we intentionally want to bypass PE, but it is no longer part of the default pre-sim export contract - -## 2026-03-30 scorer env fix + leafified/state-floor follow-up - -- fixed a real PE-native rescoring portability bug in `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_native_scores.py` - - the scorer now automatically includes a sibling `/Users/maxghenis/PolicyEngine/microimpute` checkout on `PYTHONPATH` when resolving a local `policyengine-us-data` repo - - added regression coverage in `/Users/maxghenis/PolicyEngine/microplex-us/tests/pipelines/test_pe_native_scores.py` -- focused verification: - - `pytest -q tests/pipelines/test_pe_native_scores.py tests/test_cps_source_provider.py` -> `12 passed` - - `ruff check src/microplex_us/pipelines/pe_native_scores.py src/microplex_us/data_sources/cps.py tests/pipelines/test_pe_native_scores.py tests/test_cps_source_provider.py` -> clean -- direct candidate-only PE-native broad rescoring of the leafified export: - - artifact: 
`/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_leafified_export_pe_native_broad_20260330.h5` - - candidate loss `0.8892950182` - - this is worse than the deterministic no-filing checkpoint `0.8677052580` - - interpretation: leafifying the export surface is the right correctness/control-surface fix, but it does not improve the mission metric by itself -- checked a CPS source-sampling state-floor experiment and reverted it - - temporary artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_leafified_statefloor_export_pe_native_broad_20260330.h5` - - pre-sim effect: - - all `51` states survive through seed, synthetic, and calibrated tables - - exported H5 state-age support recall improved from about `0.5708` to `0.5871` - - mission effect: - - candidate loss worsened to `0.9147484499` - - action: - - do **not** keep a one-household-per-state floor in default CPS source subsampling -- additional seam confirmed from the live build: - - `rent` and `real_estate_taxes` are absent from `seed_data`, `synthetic_data`, and `calibrated_data` on the current `cps+puf` path - - the exported H5 now includes those arrays, but they are all-zero placeholders rather than populated pre-sim inputs - -## 2026-03-30 PE-native helper root-cause fix - -- the remaining scorer-helper failure under nested `uv run` was not mainly a `PYTHONPATH` problem -- root cause: - - `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_native_scores.py` was calling `.resolve()` on `/Users/maxghenis/PolicyEngine/policyengine-us-data/.venv/bin/python` - - that followed the venv symlink to the underlying Homebrew/system Python binary and silently stripped the venv context - - effect: the helper subprocess imported global `policyengine_us`, then failed deep inside local `microimpute` with missing `statsmodels` -- fixes now in place: - - preserve the `.venv/bin/python` path instead of resolving the symlink target - - build a minimal subprocess env rather than 
inheriting the full outer process env - - still include sibling local `microimpute` on `PYTHONPATH` -- regression coverage: - - `tests/pipelines/test_pe_native_scores.py` now checks both: - - sibling `microimpute` inclusion on `PYTHONPATH` - - preservation of the `.venv/bin/python` symlink path -- direct candidate-only broad rescoring remains the trustworthy numeric checkpoint for the leafified export: - - candidate artifact `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_leafified_export_pe_native_broad_20260330.h5` - - candidate loss `0.8892950182` - -## 2026-03-30 joint-return A/B + export direct-override path - -- ruled out a tempting but wrong IRS fix on the live broad path - - artifact: `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_joint_allocation_head_preserving_ab_20260330.json` - - config: `cps_asec_2023 + irs_soi_puf_2024`, `sample_n=1000`, `n_synthetic=2000`, `bootstrap + qrf + entropy`, `donor_imputer_excluded_variables=('filing_status_code',)` - - result: - - current split baseline: candidate loss `0.8659920427` - - head-preserving equal-share joint allocation: candidate loss `0.8784570742` - - interpretation: - - keeping the “equal-share” PUF joint-return variables entirely on the head makes broad PE-native loss worse - - the dominant IRS gap is not coming from that specific PUF personization rule -- checked deeper PE role structure on the old better candidate vs the newer leafified export - - the leafified candidate does **not** lose overall tax-unit dependents or HOH-eligible mass relative to the older better candidate - - the regressions are therefore about AGI mass allocation within filing statuses, not a simple collapse of dependent/HOH structure -- added first-class direct-export override plumbing for PE-native experiments - - `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py` - - `USMicroplexBuildConfig` now includes `policyengine_direct_override_variables` - - 
`export_policyengine_dataset(...)` accepts explicit `direct_override_variables` and defaults to the build config value - - `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/performance.py` - - PE-native scoring path now forwards `build_config.policyengine_direct_override_variables` into export - - focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'native_loss or export_direct_overrides'` -> passed - - `pytest -q tests/pipelines/test_us.py -k 'export_policyengine_dataset'` -> passed - - `ruff check src/microplex_us/pipelines/us.py src/microplex_us/pipelines/performance.py tests/pipelines/test_us.py tests/pipelines/test_performance.py` -> clean -- current pending high-signal run: - - export-policy A/B on the same built candidate tables: - - default leafified export - - leafified export + explicit direct override `('filing_status',)` - - exported datasets already written: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_leaf_default_export_ab_20260330.h5` - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_leaf_filing_override_export_ab_20260330.h5` - - broad PE-native scores are still running; this is the cleanest test of whether `filing_status` should remain a temporary deliberate exception while deeper tax-unit structure is fixed - -## 2026-03-30 repeatability + exact-target diagnosis + parity-input patch - -- confirmed the current nominally best broad config is still not reproducible under the same seed: - - repeated `cps_asec_2023 + irs_soi_puf_2024`, `sample_n=1000`, `n_synthetic=2000`, `bootstrap + qrf + entropy`, `donor_imputer_excluded_variables=('filing_status_code',)` landed at: - - loss `0.8643217352`, `n_constraints=1234`, `mean_error=0.77098` - - loss `0.8810677038`, `n_constraints=1252`, `mean_error=0.79746` - - implication: there is still a real nondeterminism bug in the live build path, not just scorer noise -- exact broad target deltas on the current best saved H5 
(`/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_best_broad_target_deltas_20260330.json`) show many hard-zero regressions against PE's enhanced CPS, including: - - `nation/irs/aca_spending/la` - - `nation/census/medicare_part_b_premiums/age_20_to_29` - - `nation/irs/aca_spending/nh` - - `nation/irs/aca_spending/tx` - - `nation/irs/adjusted gross income/total/AGI in 500k-1m/taxable/Head of Household` - - `nation/census/child_support_received` - - `nation/irs/total social security/total/AGI in 10k-15k/taxable/All` -- traced the zeroed-out targets back to missing pre-sim inputs rather than donor-imputer choice: - - current best candidate H5 did not export `child_support_received`, `medicare_part_b_premiums`, `other_medical_expenses`, `health_insurance_premiums_without_medicare_part_b`, `alimony_income`, or `disability_benefits` - - `policyengine-us-data` does source these already: - - CPS: `/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/cps.py` - - PUF: `/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/puf/puf.py` -- implemented a parity-input patch on the Microplex-US side: - - CPS now derives and keeps: - - `alimony_income` - - `child_support_received` - - `disability_benefits` - - `health_insurance_premiums_without_medicare_part_b` - - `other_medical_expenses` - - `over_the_counter_health_expenses` - - `medicare_part_b_premiums` - - PUF now maps `alimony_income` under the PE-native name and derives the PE-style medical-expense category breakout from `medical_expense_agi_floor` - - default PE export surface now includes those new pre-sim inputs - - focused verification passed: - - `tests/test_cps_source_provider.py` - - `tests/test_puf_source_provider.py` - - `tests/policyengine/test_us.py` - - Ruff clean -- structural donor-variable ablation did **not** help: - - excluding `eitc_children`, `exemptions_count`, and `is_male` in addition to `filing_status_code` worsened broad loss from 
`0.8791992898` to `0.9247766974` - - implication: do not generalize a blanket “exclude count/binary donor vars” policy -- current pending mission run: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_parity_inputs_broad_pe_native_20260330.json` - - same broad config as the current best path, but with the new CPS/PUF parity inputs on the runtime surface - -## 2026-03-30 CPS repeatability fix - -- isolated the remaining same-seed drift to the CPS provider rather than PUF or the PE-native scorer - - repeated `CPSASECSourceProvider(year=2023)` loads with `sample_n=1000`, `random_seed=42` were producing different household/person samples from the same cached processed parquet - - root cause: household sampling depended on unstable row order from derived CPS households; same `random_state` on different row order yields different samples -- fixed `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/cps.py` - - canonicalize household order by `household_id` before sampling - - canonicalize person order by `household_id`, `person_id`, `person_number` before sampling - - sort sampled household/person outputs before returning -- added regression coverage in `/Users/maxghenis/PolicyEngine/microplex-us/tests/test_cps_source_provider.py` - - repeated same-seed loads from cached processed CPS data now return identical household/person selections -- direct repeatability check after the patch: - - provider repeatability artifact `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_provider_repeatability_20260330.json` - - CPS: `same_households=true`, `same_persons=true` - - PUF: `same_households=true`, `same_persons=true` - - pre-calibration repeatability artifact `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_repeatability_precal_20260330.json` - - `same_seed_same_seed_data=true` - - `same_seed_same_integrated_seed=true` - - `same_seed_same_synthetic=true` -- focused verification: - - `pytest -q 
tests/test_cps_source_provider.py -k 'sampling or deterministic or derives_policyengine_value_inputs'` -> `4 passed` - - `ruff check src/microplex_us/data_sources/cps.py tests/test_cps_source_provider.py` -> clean -- current pending mission rerun: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_parity_inputs_broad_pe_native_20260330.json` - - this is the first broad PE-native rerun on a deterministic `cps+puf + qrf + bootstrap + entropy` path after the parity-input patch - -## 2026-03-30 parity-input broad blow-up + stale CPS cache diagnosis - -- the first deterministic broad rerun after the parity-input patch landed at: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_parity_inputs_broad_pe_native_20260330.json` - - candidate broad PE-native loss `7.433075015991533` - - PE baseline `0.020243908529428433` - - delta `+7.412831107462105` -- family breakdown showed the blow-up was overwhelmingly concentrated in `national_census_other` - - contribution delta `+6.582284784720224` - - other major regressions remained `national_irs_other`, `state_agi_distribution`, and `state_age_distribution` -- direct H5/input inspection showed the parity-input runtime was still not actually carrying all of the new CPS-derived inputs: - - exported candidate H5 had `child_support_received = 0` everywhere and no `disability_benefits` - - stage audit confirmed the problem was upstream of export on the live cache-backed path: - - `seed_data` and `synthetic_data` were missing `child_support_received` and `disability_benefits` -- root cause: - - `/Users/maxghenis/.cache/microplex/cps_asec_2023_processed.parquet` was stale relative to the new CPS loader contract - - `load_cps_asec()` cache validation only required the older geography / coverage columns, so it silently reused a processed cache that predated the new PE-native derived inputs -- fix now in place: - - `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/cps.py` - - extended 
`PERSON_CACHE_REQUIRED_COLUMNS` to require: - - `alimony_income` - - `child_support_received` - - `disability_benefits` - - `health_insurance_premiums_without_medicare_part_b` - - `other_medical_expenses` - - `over_the_counter_health_expenses` - - `medicare_part_b_premiums` - - `/Users/maxghenis/PolicyEngine/microplex-us/tests/test_cps_source_provider.py` - - updated stale-cache and deterministic-cache fixtures to match the stricter processed-cache contract - - focused verification: - - `pytest -q tests/test_cps_source_provider.py -k 'deterministic or stale_processed_cache_without_pe_presim_inputs or derives_policyengine_value_inputs'` -> passed - - `ruff check src/microplex_us/data_sources/cps.py tests/test_cps_source_provider.py` -> clean -- live-path verification after rebuilding the actual cached CPS parquet: - - `load_cps_asec(year=2023)` now rebuilds the stale cache and returns all new derived inputs - - on the broad runtime path: - - `child_support_received` is now present in `seed_data`, `synthetic_data`, and `calibrated_data` - - `disability_benefits` is now present in `seed_data`, `synthetic_data`, and `calibrated_data` -- current pending clean rerun: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_parity_inputs_broad_pe_native_20260330_v2.json` - - this is the first broad PE-native rerun on: - - deterministic CPS sampling - - rebuilt live CPS processed cache - - actual carriage of the new CPS-derived PE inputs - -## 2026-03-30 focused Claude review — broad PE-native loss checkpoint - -- Scope: v2 clean broad result after deterministic CPS + rebuilt cache fixes -- Full review: `reviews/2026-03-30-claude-broad-native-loss-checkpoint-review.md` -- Top findings: - 1. **HIGH**: Calibration-vs-scoring target mismatch dominates loss — calibrated against 1,255 constraints, scored against 2,817 targets. Top 3 families (`national_irs_other`, `state_agi_distribution`, `state_age_distribution`) account for 72% of the 0.855 delta. - 2. 
**HIGH**: Calibration never converges — all saved artifacts show `converged=false`. A/B comparisons unreliable unless delta exceeds ~0.02-0.03. - 3. **MEDIUM**: Cache invalidation checks column presence, not derivation correctness — same bug class as the 7.43 blow-up, different future trigger. -- 7.43 blow-up: fully explained by stale CPS processed cache missing new PE-derived inputs. No deeper bug. -- v2 result (candidate 0.875, PE baseline 0.020): trustworthy for family-level diagnosis, not for precision claims. -- Top next fixes: - 1. Increase source `sample_n` to 2000-3000 (steepest support-recall curve) - 2. Diagnose calibration convergence with 10x solver iterations - 3. Add cache derivation version to prevent stale-cache class bugs - 4. Split `national_irs_other` in the family classifier for sub-family diagnosis - -## 2026-03-30 follow-up to Claude broad-loss review - -- Landed the two most direct correctness/investigation fixes from the review: - - `src/microplex_us/data_sources/cps.py` - - added a versioned processed-cache path: - - `cps_asec_{year}_processed_v20260330.parquet` - - legacy unversioned processed caches are now ignored and rebuilt from raw source - - minimal CPS inputs now still materialize the PE-facing value leaves as zero columns: - - `alimony_income` - - `child_support_received` - - `disability_benefits` - - `health_insurance_premiums_without_medicare_part_b` - - `other_medical_expenses` - - `over_the_counter_health_expenses` - - `medicare_part_b_premiums` - - `src/microplex_us/pipelines/us.py` - - `USMicroplexBuildConfig` now carries: - - `calibration_tol` - - `calibration_max_iter` - - entropy / IPF / chi2 calibrators now honor those settings - - `src/microplex_us/pipelines/performance.py` - - calibration cache keys now include `calibration_tol` and `calibration_max_iter` - - precalibration cache keys exclude them so only the calibration stage reruns when these change -- Focused verification: - - `pytest -q 
tests/test_cps_source_provider.py tests/pipelines/test_us.py -k 'cache or deterministic or tolerance_config or stale_processed_cache or derives_policyengine_value_inputs or build_weight_calibrator'` -> `7 passed` - - `pytest -q tests/pipelines/test_performance.py -k 'calibration_cache_key_includes_iteration_and_tolerance_settings or preserves_target_profiles or can_evaluate_native_loss'` -> `3 passed` - - `ruff check src/microplex_us/data_sources/cps.py src/microplex_us/pipelines/us.py src/microplex_us/pipelines/performance.py tests/test_cps_source_provider.py tests/pipelines/test_us.py tests/pipelines/test_performance.py` -> clean -- Running now: - - deterministic broad PE-native smoke on the current path with: - - `sample_n=2000` - - `n_synthetic=2000` - - `donor_imputer_backend='qrf'` - - `donor_imputer_excluded_variables=('filing_status_code',)` - - `calibration_backend='entropy'` - - `calibration_max_iter=1000` - - output target: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample2000_iter1000_pe_native_broad_20260330.json` -- Result: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample2000_iter1000_pe_native_broad_20260330.json` - - candidate PE-native broad loss: `0.8830832791543215` - - PE baseline: `0.020243908529428433` - - delta: `+0.862839370624893` - - calibration still did not converge: - - `converged=false` - - `mean_error=0.8053911891798184` - - `max_error=1.5450053947105458` - - `n_constraints=1263` - - feasibility filter still dropped `2348 / 3611` constraints (`65.0%`) - - conclusion: - - increasing entropy solve effort from `100` to `1000` iterations on the current deterministic `sample_n=2000 / n_synthetic=2000` path did not help the mission metric - - next lever should stay on source support (`sample_n=3000`) rather than more entropy iterations - -## 2026-03-30 full-support PE-scale path - -- Code: - - `src/microplex_us/pipelines/us.py` - - added `synthesis_backend='seed'` to preserve 
the full donor-integrated support surface instead of resampling it before PE-table calibration - - added `policyengine_selection_household_budget` and a sparse household selector that prunes PE tables to a fixed household budget before the final calibration pass - - `src/microplex_us/pipelines/performance.py` - - `sample_n` can now be `None` for full-source runs - - calibration cache keys now include `policyengine_selection_household_budget`, while precalibration cache keys still do not -- Focused verification: - - `pytest -q tests/pipelines/test_us.py -k 'synthesize_seed_backend_preserves_seed_support or household_budget or sparse_backend or calibrate_policyengine_tables_from_db'` -> `6 passed` - - `pytest -q tests/pipelines/test_performance.py -k 'household_budget_selection or full_source_queries or preserves_target_profiles or native_loss'` -> `4 passed` - - `ruff check src/microplex_us/pipelines/us.py src/microplex_us/pipelines/performance.py tests/pipelines/test_us.py tests/pipelines/test_performance.py` -> clean -- PE-scale source-subsampled comparison point: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_qrf_weighted_sample29999_n29999_pe_native_broad_20260330.json` - - config: - - `sample_n=29999` - - `n_synthetic=29999` - - `bootstrap + qrf + entropy` - - `donor_imputer_excluded_variables=('filing_status_code',)` - - result: - - candidate PE-native broad loss: `0.9547853569761191` - - PE baseline: `0.020243908529428433` - - delta: `+0.9345414484466906` - - `converged=false` - - `n_constraints=3300` - - read: - - matching PE's row count by source-side weighted subsampling is worse than the smaller deterministic broad path - - the better next experiment is full CPS + full PUF support, then prune to `29,999` households with the new sparse selection stage -- Running now: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_fullsource_seed_sparse29999_pe_native_broad_20260330.json` - - config: - - `sample_n=None` (full sources) - - 
`synthesis_backend='seed'` - - `policyengine_selection_household_budget=29999` - - `donor_imputer_backend='qrf'` - - `donor_imputer_excluded_variables=('filing_status_code',)` - -## 2026-03-31 direct PE-native objective path - -- Code: - - `src/microplex_us/pipelines/pe_native_optimization.py` - - added direct PE-native loss-matrix extraction from `policyengine-us-data` - - added projected gradient weight optimization on the exact broad PE-native objective for a fixed exported candidate - - added H5 rewrite utilities to propagate optimized household weights to person and group weight arrays - - `src/microplex_us/pipelines/performance.py` - - added opt-in `optimize_pe_native_loss` harness mode so exported candidates can be weight-optimized before PE-native scoring - - `src/microplex_us/pipelines/__init__.py` - - exported the direct PE-native optimization helpers -- Focused verification: - - `pytest -q tests/pipelines/test_pe_native_optimization.py tests/pipelines/test_performance.py -k 'native_loss or pe_native_optimization'` -> `5 passed` - - `ruff check src/microplex_us/pipelines/pe_native_optimization.py src/microplex_us/pipelines/performance.py src/microplex_us/pipelines/__init__.py tests/pipelines/test_pe_native_optimization.py tests/pipelines/test_performance.py` -> clean -- First same-candidate direct-objective A/B: - - input candidate: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_broad_entropy_batch_noharness_20260329/20260329T210427Z-057066af/policyengine_us.h5` - - optimized output: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pe_native_direct_opt_20260331.h5` - - summary: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_pe_native_direct_opt_20260331.json` - - result: - - raw candidate PE-native broad loss: `0.9233365911702252` - - direct-objective optimized loss: `0.9229024219474923` - - improvement: `-0.00043416922273291814` - - baseline PE loss: `0.020243908529428433` - - optimizer status: - - 
`converged=false` - - `iterations=200` - - `positive_household_count=1993 / 2000` -- Read: - - optimizing the exact PE-native broad objective on a fixed exported candidate helps only trivially - - objective mismatch is real but not the main blocker on the current path - - the next large gain must come from better records or a budgeted selector over a larger support set, not just replacing entropy with a better weight objective after export - -## 2026-03-31 PE-native optimizer score-consistency guard - -- Code: - - `src/microplex_us/pipelines/performance.py` - - added a hard consistency check for `optimize_pe_native_loss=True` - - the rescored `candidate_enhanced_cps_native_loss` must now match the optimizer's internal `optimized_loss` within `pe_native_score_consistency_tol` (default `1e-6`) - - mismatches now raise immediately instead of silently attaching stale/incorrect optimization metadata -- Focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'optimize_native_loss or consistency'` -> `1 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean -- Read: - - this does not change the diagnosis; it just makes the direct-objective path trustworthy for future larger-candidate selector work - -## 2026-03-31 focused Claude review — direct PE-native optimizer - -- Scope: code review + architectural diagnosis of `pe_native_optimization.py`, harness integration, and first A/B result -- Full review: `reviews/2026-03-31-claude-direct-pe-native-optimizer-review.md` -- Top findings: - 1. **Objective alignment is correct**: optimizer's `||M^T w - s||^2` proven algebraically identical to the scorer's native loss. Initial losses match within float64 noise (2e-16). - 2. **No serious correctness bugs**: gradient, Lipschitz estimate, simplex projection, H5 weight rewrite, and harness integration are all correct. - 3. **MINOR**: weight-sum drift ~9e-6 relative after 200 iterations (cosmetic). 
No cross-validation between optimizer's internal loss and rescored loss (worth adding as guard). -- Objective alignment confirmed: the direct optimizer minimizes the exact same function the scorer evaluates. -- Tiny gain (0.92334 → 0.92290) definitively confirms record support is the bottleneck: - - The best achievable loss with 2000 households is ~0.923 — entropy was already near-optimal for this support - - Only 0.05% of the 0.903 gap to PE is attributable to the weight objective - - The other 99.95% is structural (support, state coverage, missing IRS mass) -- Top next fix: full-support + budgeted household selection path (already prototyped). Do not invest further in direct weight optimization on small candidates. - -## 2026-03-31 full-support PE-native-loss selector at PE row budget - -- Scope: full CPS + full PUF support, `synthesis_backend='seed'`, `policyengine_selection_backend='pe_native_loss'`, household budget `29,999` -- Artifact: - - `artifacts/tmp_fullsource_seed_pe_native_selector29999_20260331.json` - - `artifacts/tmp_fullsource_seed_pe_native_selector29999_20260331.h5` -- Result: - 1. candidate PE-native broad loss `0.6333835740352115` - 2. PE baseline `0.020243908529428433` - 3. delta `+0.613139665505783` -- Comparison: - 1. materially better than earlier full-support sparse selector (`0.8960`) - 2. materially better than source-sampled `29,999` run (`0.9548`) - 3. still far from full PE baseline -- Diagnostics: - 1. final calibration still `converged=false` - 2. supported targets `2575 / 4183` - 3. feasibility filter dropped `887 / 3462` post-selection constraints (`25.6%`) - 4. selector optimization itself did not converge in `200` iterations, but still produced a much stronger selected population - 5. 
selector kept exactly `29,999` positive-weight households from `56,839` input households -- Read: - budgeted selection on a full-support candidate is the first PE-scale change that clearly moved the frontier in the right direction - this is still not enough to beat full PE, but it is strong evidence that candidate construction + selection is a better lever than source-side subsampling or post-export weight tuning - -## 2026-03-31 harness output contract - -- Code: - - `src/microplex_us/pipelines/performance.py` - - added `output_json_path` and `output_policyengine_dataset_path` to the local harness config - - harness can now persist one self-contained JSON summary and one final PE-ingestable H5 without ad hoc wrapper scripts - - when `optimize_pe_native_loss=True`, the persisted H5 is the optimized dataset that was actually scored, not the pre-optimization export -- Focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'write_output_bundle or writes_optimized_dataset_output or can_optimize_native_loss or can_evaluate_native_loss'` -> `4 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean -- Read: - - long PE-scale runs no longer need bespoke `uv run python <…` wrappers [extraction gap: the rest of this entry and the heading, code notes, and pytest command of the next entry, which appears to cover PE-native target-delta output, are missing] -> `6 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean -- Read: - - the ad hoc exact-target analysis wrapper can now be replaced by a first-class harness output - -## 2026-03-31 harness batch native scoring - -- Code: - - `src/microplex_us/pipelines/performance.py` - - added `USMicroplexPerformanceHarnessRequest` and `USMicroplexPerformanceSession.run_batch(...)` - - shared-session batch runs now export candidates once, group compatible requests by baseline/repo/period, and score PE-native loss through `compute_batch_us_pe_native_scores(...)` - - keeps direct PE-native optimizer runs on the single-candidate path, but removes repeated scorer subprocess overhead for
normal multi-candidate native-loss A/Bs -- Focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'run_batch_uses_native_batch_scorer or write_pe_native_target_delta_output or rejects_nonpositive_target_delta_top_k or write_output_bundle or writes_optimized_dataset_output or can_optimize_native_loss or can_evaluate_native_loss or reuses_comparison_cache or reuses_loaded_frames or reuses_precalibration_state or reuses_calibration_state'` -> `11 passed` - - `ruff check src/microplex_us/pipelines/performance.py src/microplex_us/pipelines/__init__.py src/microplex_us/__init__.py tests/pipelines/test_performance.py` -> clean -- Read: - - the local performance harness now has a real multi-candidate PE-native path instead of relying on separate experiment/backfill machinery - -## 2026-03-31 harness matched-N PE baseline - -- Code: - - `src/microplex_us/pipelines/performance.py` - - added `evaluate_matched_pe_native_loss` - - harness can now sample the full PE baseline down to a matched household count, rescale the sampled baseline weights back to the original total, and score `Microplex@N` against that raw `PE@N` - - default matched household count follows the candidate household count; optional output path persists the sampled PE baseline H5 -- Focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'evaluate_matched_native_loss or rejects_nonpositive_matched_baseline_household_count or run_batch_uses_native_batch_scorer or write_pe_native_target_delta_output or rejects_nonpositive_target_delta_top_k or write_output_bundle or writes_optimized_dataset_output or can_optimize_native_loss or can_evaluate_native_loss or reuses_comparison_cache or reuses_loaded_frames or reuses_precalibration_state or reuses_calibration_state'` -> `13 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean -- Read: - - matched-size raw PE baselines are now a first-class harness comparator instead 
of a separate notebook-style script - -## 2026-03-31 harness matched-N reweighted PE baseline - -- Code: - - `src/microplex_us/pipelines/performance.py` - - added `reweight_matched_pe_native_loss` - - matched-size PE baseline path can now run PE's own `enhanced_cps.reweight(...)` on the sampled baseline H5 before rescoring - - this gives the local harness a fairer `PE@N_reweighted` comparator than simple weight rescaling alone -- Focused verification: - - `pytest -q tests/pipelines/test_performance.py -k 'reweight_matched_native_loss or evaluate_matched_native_loss or rejects_reweighted_matched_loss_without_matched_loss or rejects_nonpositive_matched_baseline_household_count or run_batch_uses_native_batch_scorer or write_pe_native_target_delta_output or rejects_nonpositive_target_delta_top_k or write_output_bundle or writes_optimized_dataset_output or can_optimize_native_loss or can_evaluate_native_loss or reuses_comparison_cache or reuses_loaded_frames or reuses_precalibration_state or reuses_calibration_state'` -> `15 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean -- Read: - - the local harness can now emit `Microplex@N`, raw `PE@N`, and reweighted `PE@N` from one comparable evaluation surface - -## 2026-03-31 matched PE baseline fidelity fix - -- Scope: repaired matched-`N` PE baseline generation in `src/microplex_us/pipelines/performance.py` -- Root cause: - - the harness matched-baseline writer was lossy - - full-count `PE@29999` collapsed to `17` variables instead of `167` - - smaller matched baselines silently dropped non-annual variables such as `is_household_head` (`ETERNITY`) and `receives_wic` (monthly) -- Fix: - - `N == full_N` now short-circuits to a byte-for-byte copy of the original PE baseline H5 - - smaller matched baselines are now sampled directly at the H5 array level, preserving all variables and all stored periods -- Focused verification: - - `pytest -q 
tests/pipelines/test_performance.py -k 'matched_native_loss or write_matched_policyengine_us_baseline_dataset_preserves_variables'` -> `3 passed` - - `ruff check src/microplex_us/pipelines/performance.py tests/pipelines/test_performance.py` -> clean - - direct schema diff now matches full PE exactly at `N=2000`, `N=3000`, and `N=29999` (`167` vars, no missing, no extra) -- Consequence: - - the earlier harness-produced raw `PE@29999` comparator was invalid and should not be used - -## 2026-03-31 filing-status experiments falsified - -- Scope: tested two ways to push separated / surviving-spouse structure into PE on the `29,999` full-support selector path - - direct `filing_status` override - - exporting person-level `is_separated` / `is_surviving_spouse` -- Results: - - prior `statusfix` baseline: `0.6362298466` - - direct `filing_status` override: `0.6539544578` - - leaf-input export: `0.9793611801` - - PE baseline: `0.0202439085` -- Root cause read: - - PE's `filing_status` formula uses tax-unit structure plus person-level leaf inputs - - direct override carried existing synthesized MFJ structural errors straight into PE - - the leaf-input experiment was worse because coarse CPS `marital_status` / `filing_status_code` hints were not precise enough to safely synthesize `is_separated` and `is_surviving_spouse` - - that path inflated separated-filer structure and caused severe weight collapse -- Code consequence: - - reverted `is_separated` / `is_surviving_spouse` from the default PE export surface - - kept only passthrough normalization if those columns ever exist from a more trustworthy source -- Read: - - the filing-status seam is real, but these two fixes are not the right fix - - next work should shift back to the larger `national_irs_other`, `state_agi_distribution`, and `state_age_distribution` support problems - -## 2026-03-31 signed IRS surface repair - -- Scope: repair signed-income and missing-leaf seams that were still zeroing major IRS loss terms on the 
`29,999` full-support selector path -- Root cause: - - raw mapped PUF `self_employment_income` is signed, but Microplex marked it as `ZERO_INFLATED_POSITIVE`, so donor matching could never emit losses - - raw mapped PUF `rental_income_negative` is a positive loss amount, and `map_puf_variables()` was adding it instead of subtracting it - - `capital_gains_distributions` existed in PUF but never reached PE because the export surface omitted the correct PE input alias `non_sch_d_capital_gains` -- Code: - - `src/microplex_us/data_sources/puf.py` - - preserve rental losses as negative values when combining positive and negative rental components - - `src/microplex_us/variables.py` - - stop treating `self_employment_income` as a positive-only donor target; preserve signed support - - `src/microplex_us/policyengine/us.py` - - export `capital_gains_distributions` through the PE input alias `non_sch_d_capital_gains` -- Focused verification: - - `pytest -q tests/test_puf_source_provider.py -k 'rental_loss_sign or preserve_joint_tax_unit_monetary_totals or splits_negative_joint_self_employment_losses or maps_policyengine_medical_and_alimony_inputs'` -> `3 passed` - - `pytest -q tests/policyengine/test_us.py -k 'default_policyengine_us_export_surface_avoids_formula_aggregates or supports_pre_sim_aliases'` -> `2 passed` - - `pytest -q tests/test_variables.py -k 'self_employment_income_semantics_preserve_signed_support or person_native_irs_semantics_match_current_policyengine_entities or donor_imputation_block_specs_include_match_strategies'` -> `3 passed` - - `ruff check src/microplex_us/data_sources/puf.py src/microplex_us/policyengine/us.py src/microplex_us/variables.py tests/test_puf_source_provider.py tests/policyengine/test_us.py tests/test_variables.py` -> clean -- Read: - - the remaining IRS gap is not just “more support”; several high-loss cells were impossible to hit because losses or leaves were being structurally erased before PE saw them - -## 2026-03-31 
authoritative donor override for shared IRS variables - -- Scope: allow PUF to replace weak shared CPS scaffold values for a narrow signed-IRS allowlist instead of only filling donor-only variables -- Root cause: - - even after restoring signed PUF support, donor integration only modeled `donor_observed - scaffold_observed` - - `self_employment_income` and `rental_income` exist on both CPS and PUF, so PUF could not overwrite the CPS scaffold despite being the more authoritative IRS-style source - - when a shared variable becomes a donor target, it also must be removed from donor conditions for that block; otherwise the imputer just learns back the scaffold value being replaced -- Code: - - `src/microplex_us/pipelines/us.py` - - add `donor_imputer_authoritative_override_variables`, defaulting to `self_employment_income` and `rental_income` - - allow authoritative donors to model and overwrite those shared variables - - exclude block target variables from the donor condition set -- Focused verification: - - `pytest -q tests/pipelines/test_us.py -k 'authoritative_override_for_shared_irs_variables or preserves_informative_scaffold_values or defaults'` -> `4 passed` - - `ruff check src/microplex_us/pipelines/us.py tests/pipelines/test_us.py` -> clean -- Cheap export spotcheck: - - `artifacts/tmp_signed_income_override_spotcheck_20260331.h5` - - `self_employment_income_before_lsr`: `31` negative rows, `62` positive rows, min `-14175.0` - - `rental_income`: `14` negative rows, `32` positive rows, min `-243450.0` - - `non_sch_d_capital_gains`: `24` positive rows -- Read: - - the signed IRS surfaces now survive into a real PE export, which is the prerequisite for the next full `29,999` selector rerun - -## 2026-03-31 signed-support and shared-override PE-scale readout - -- Full `29,999` selector results: - - prior strong selector: `0.6333835740` - - `statusfix` baseline: `0.6362298466` - - signed-support fixes only: `0.9762246696` - - signed-support + 
`self_employment_income` authoritative override: `0.9317965866` - - signed-support + `rental_income` authoritative override: `0.9831478185` - - signed-support + both overrides: `0.9686514499` - - PE baseline: `0.0202439085` -- Read: - - restoring signed IRS support was necessary for representability, but not a win on the current selector/calibration path - - all shared authoritative override variants were worse than the pre-override `statusfix` baseline (`0.6362298466`) - - `self_employment_income` override was harmful; `rental_income` override was worse - - keep `donor_imputer_authoritative_override_variables` opt-in only, not default -- Code consequence: - - revert the default override allowlist to `()` - - retain the override mechanism for future bounded A/Bs only - -## 2026-03-31 capital gains distributions export moved back to opt-in - -- Evidence: - - the no-override `signedirsfix` run (`0.9762246696`) was still far worse than `statusfix` (`0.6362298466`) - - in `tmp_fullsupport_selector29999_signedirsfix_20260331.h5`, `self_employment_income_before_lsr` and `rental_income` still had `0` negative rows, so the signed-income support repairs were not yet affecting the default path - - the main new default-path change was exporting `capital_gains_distributions` as `non_sch_d_capital_gains` -- Code consequence: - - remove `non_sch_d_capital_gains` from `SAFE_POLICYENGINE_US_EXPORT_VARIABLES` - - keep the alias available for explicit opt-in through `direct_override_variables` -- Focused verification: - - `pytest -q tests/policyengine/test_us.py -k 'default_policyengine_us_export_surface_avoids_formula_aggregates or supports_pre_sim_aliases'` -> `2 passed` - - `ruff check src/microplex_us/policyengine/us.py tests/policyengine/test_us.py` -> clean -- Read: - - until a direct H5 ablation proves otherwise, `non_sch_d_capital_gains` should not be on the default PE export surface - -## 2026-04-01 PE-native support audit on the trusted `statusfix` path - -- Code: - -
`src/microplex_us/pipelines/pe_native_scores.py` - - add `compute_us_pe_native_support_audit(...)` - - compare candidate vs baseline on stored-variable presence, filing-status support, high-AGI MFS support, state marketplace enrollment, and state age-bucket support - - `src/microplex_us/pipelines/performance.py` - - add `output_pe_native_support_audit_path` - - allow the harness to emit a durable support-audit JSON next to the normal PE-native score outputs - - `tests/pipelines/test_pe_native_scores.py` - - `tests/pipelines/test_performance.py` -- Focused verification: - - `uv run pytest -q tests/pipelines/test_pe_native_scores.py -k 'support_audit or target_deltas'` -> `2 passed` - - `uv run pytest -q tests/pipelines/test_performance.py -k 'support_audit or target_delta_output'` -> `2 passed` - - `ruff check src/microplex_us/pipelines/pe_native_scores.py src/microplex_us/pipelines/performance.py tests/pipelines/test_pe_native_scores.py tests/pipelines/test_performance.py` -> clean -- Artifact: - - `artifacts/tmp_fullsupport_selector29999_statusfix_support_audit_20260401.json` -- Read: - - the trusted `statusfix` candidate is not just missing a few leaves; it is structurally underweighted after calibration - - candidate PE household-weight sum: `41.17M` - - same run's selection optimizer preserved `135.40M` total weight before entropy calibration - - full `enhanced_cps_2024` baseline PE household-weight sum: `149.96M` - - support gaps are therefore broad, not isolated: - - `child_support_expense` is entirely absent on the candidate export (`stored=false`, `weighted_nonzero=0.0`) while baseline has `2.63M` weighted nonzero support - - `has_marketplace_health_coverage`: candidate `2.54M` weighted nonzero vs baseline `11.74M` - - `has_esi`: candidate `63.61M` vs baseline `185.45M` - - `medicare_part_b_premiums`: candidate `11.54M` vs baseline `49.53M` - - `self_employment_income_before_lsr`: candidate `3.74M` vs baseline `25.53M` - - `rental_income`: candidate `3.04M` 
vs baseline `13.21M` - - filing-status support is still structurally incomplete: - - `SEPARATE` weighted count `0.0` vs baseline `6.53M` - - `SURVIVING_SPOUSE` weighted count `0.0` vs baseline `1.74M` - - MFS support in `75k+` AGI bins is exactly zero across the board - - ACA and state-age failures are clearly structural: - - biggest marketplace enrollment gaps include `GA`, `CA`, `TX`, `IL`, `NY` - - biggest state-age bucket gaps are concentrated in `TX`, `CA`, and `FL` -- Next hypothesis: - - the best current selector path is being undone by post-selection entropy calibration collapsing total mass - - the next decisive experiment is to renormalize the final calibrated `statusfix` weights back toward the pre-calibration/selection total and rescore before changing record construction again - -## 2026-04-01 source-backed `child_support_expense` added to CPS + PE export - -- Code: - - `src/microplex_us/data_sources/cps.py` - - map CPS `CHSP_VAL -> child_support_expense` - - treat it as a nonnegative zero-default PE pre-sim input - - require it in the processed CPS cache contract - - bump CPS processed-cache version to `20260401` - - `src/microplex_us/policyengine/us.py` - - add `child_support_expense` to `SAFE_POLICYENGINE_US_EXPORT_VARIABLES` - - `tests/test_cps_source_provider.py` - - `tests/policyengine/test_us.py` -- Why: - - the new PE-native support audit showed the trusted `statusfix` candidate exported no `child_support_expense` at all, while the full PE baseline had `2.63M` weighted nonzero support - - `policyengine-us-data` already sources this directly from CPS (`CHSP_VAL`), so this is a clean parity miss rather than a speculative new feature -- Focused verification: - - `uv run pytest -q tests/test_cps_source_provider.py -k 'policyengine_value_inputs or stale_processed_cache_without_pe_presim_inputs or caches_household_geography_on_persons'` -> `3 passed` - - `uv run pytest -q tests/policyengine/test_us.py -k 'export_variable_maps_includes_tax_inputs or 
default_policyengine_us_export_surface_avoids_formula_aggregates'` -> `2 passed` - - `ruff check src/microplex_us/data_sources/cps.py src/microplex_us/policyengine/us.py tests/test_cps_source_provider.py tests/policyengine/test_us.py` -> clean -- Read: - - this is a safe source-backed fix and should stay - - it may help some SNAP / expense surfaces, but it is not expected to explain the full `statusfix` gap by itself - -## 2026-04-12 reject checkpoint CPS `state x household-income-band` floor - -- Code: - - no retained code changes; the temporary `state_income_floor` experiment in - `src/microplex_us/data_sources/cps.py` and - `src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py` was reverted - after the benchmark run regressed -- Why: - - the next clean AGI-side upstream hypothesis was to mirror the accepted CPS - `state x age-band` support floor with a coarse `state x household-income-band` - floor during checkpoint sampling - - this stayed within the same architecture: better sampled source support - before synthesis/calibration, same PE oracle, same downstream calibration - planner -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py tests/test_cps_source_provider.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` - - `uv run pytest tests/test_cps_source_provider.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py -q -k 'state_age_floor or default_policyengine_us_data_rebuild_queries'` -- Artifact: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_income_donors/broader-donors-cps-stateage1-income-v1` -- Read: - - the hypothesis lost on the mission metric and should not stay in the code - surface - - matched broader donor baseline with the accepted CPS age floor: - `full_oracle_capped_mean_abs_relative_error = 0.7329149849` - - candidate with the added income-band floor: - `full_oracle_capped_mean_abs_relative_error = 
0.7554346215` - - delta: `+0.0225196366` worse - - the candidate also worsened active-solve capped loss (`0.8499 -> 0.8586`) - while increasing selected constraints (`1059 -> 1086`) - - conclusion: keep the accepted checkpoint CPS `state x age-band` floor, and - do not add the `state x household-income-band` floor - -## 2026-04-12 reject checkpoint CPS `state x tax-unit-income-band` floor - -- Code: - - no retained code changes; the temporary `state_tax_unit_income_floor` - experiment in `src/microplex_us/data_sources/cps.py` and - `src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py` was reverted - after the benchmark run -- Why: - - the household-income analogue was too blunt, so the next cleaner AGI-side - upstream hypothesis was a CPS `state x tax-unit-income-band` floor built - from summed `total_person_income` within each CPS tax unit - - this is closer to the PE AGI target surface than household income while - still staying entirely in checkpoint-scale source sampling -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py tests/test_cps_source_provider.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` - - `uv run pytest tests/test_cps_source_provider.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py -q -k 'state_age_floor or default_policyengine_us_data_rebuild_queries'` -- Artifact: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_taxunitincome_donors/broader-donors-cps-stateage1-taxunitincome-v1` -- Read: - - this was a near miss but still not a keeper on the mission metric - - matched broader donor baseline with the accepted CPS age floor: - `full_oracle_capped_mean_abs_relative_error = 0.7329149849` - - candidate with the added tax-unit-income floor: - `full_oracle_capped_mean_abs_relative_error = 0.7372298992` - - delta: `+0.0043149143` worse - - unlike the household-income version, this candidate did 
improve some - secondary diagnostics: - - `full_oracle_mean_abs_relative_error`: `0.8169 -> 0.8134` - - `active_solve_capped_mean_abs_relative_error`: `0.8499 -> 0.8047` - - conclusion: still reject for the current frontier objective; if this idea - comes back later, it should come back with tighter AGI-band design or a - clearer target-family-specific objective rather than as a default checkpoint - support rule - -## 2026-04-12 reject PolicyEngine-style CPS tax-leaf splits on the broader donor checkpoint - -- Code: - - no retained runtime code changes from this lane - - the temporary CPS-source leaf-input materialization and the temporary - export-side split fallback were both reverted after the benchmark runs - - retained code state only bumps the CPS processed-cache version in - `src/microplex_us/data_sources/cps.py` to avoid reusing the rejected - source-side cache schema -- Why: - - the next direct AGI-alignment hypothesis was to reuse the same CPS tax-input - split assumptions as `policyengine-us-data` for interest, dividends, and - pension income - - two boundaries were tested: - - source-side: materialize those leaf inputs directly in the CPS provider - before Microplex donor integration - - export-side: keep the CPS source on gross aggregates but apply the same - split only when building the final PolicyEngine export surface -- Focused verification: - - source/provider and semantic regression slice: - `uv run pytest tests/test_cps_source_provider.py tests/test_variables.py tests/pipelines/test_us.py -q -k 'policyengine_value_inputs or atomic_variable_semantics or prune_redundant_variables or sparse_irs_tax_variables_use_puf_irs_predictors or person_native_irs_semantics or derives_tax_input_columns or fallback_employment_excludes_transfer_income'` - - after reversion: `7 passed` -- Artifacts: - - source-side candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_cps_pe_agi_donors/broader-donors-cps-pe-agi-v1` - - export-side candidate: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_pe_export_cps_agi_donors/broader-donors-pe-export-cps-agi-v1` - - matched incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1` -- Read: - - the source-side version is clearly wrong for the mixed-source Microplex - pipeline: - - baseline capped full-oracle loss: `0.7329149849` - - source-side candidate: `0.9164981002` - - delta: `+0.1835831153` worse - - top residual families now included - `tax_unit_count|domain=tax_exempt_interest_income` and - `tax_exempt_interest_income|domain=tax_exempt_interest_income`, which is - a strong sign that the source surface was polluted by estimated leaves too - early - - the export-side version is better than the source-side one but still not a - keeper: - - export-side candidate: `0.7998451134` - - delta vs baseline: `+0.0669301285` worse - - conclusion: - - do not promote PE-style CPS tax leaves into the source provider - - do not apply the export-side split by default either - - the clean alignment boundary for this lane is still unresolved, so the - default path stays on gross CPS tax aggregates for now - -## 2026-04-12 keep donor checkpoint `state x age-band` floor - -- Code: - - keep donor survey checkpoint sampling support for `state_age_floor` in - `src/microplex_us/data_sources/donor_surveys.py` - - keep the default checkpoint query builder passing `state_age_floor=1` to - donor survey providers in - `src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py` - - keep the new donor sampling/query regressions in - `tests/test_donor_survey_source_providers.py` and - `tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` -- Why: - - after accepting the CPS checkpoint `state x age-band` floor, donor-inclusive - checkpoints still had an upstream asymmetry: CPS sampling guaranteed - `state x age` coverage, donor survey sampling only guaranteed a plain state - floor - - the next clean test
was to mirror the same age-band support floor on donor - survey checkpoint sampling, but only keep it if the full-oracle metric moved -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/donor_surveys.py src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py tests/test_donor_survey_source_providers.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` - - `uv run pytest tests/test_donor_survey_source_providers.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py -q -k 'state_age_floor or default_policyengine_us_data_rebuild_queries or forwards_state_age_floor'` -- Artifacts: - - baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_donor_stateage1_donors/broader-donors-donor-stateage1-v1` -- Read: - - the gain is small but real on the deterministic broader donor benchmark - - baseline capped full-oracle loss: `0.7329149849` - - candidate capped full-oracle loss: `0.7327632809` - - delta: `-0.0001517041` - - active-solve capped loss also improved slightly: `0.8498782563 -> 0.8495978941` - - selected constraints stayed flat at `1059` - - conclusion: keep this as a low-risk checkpoint-default refinement, not as a - headline methodological change - -## 2026-04-12 keep PE-style PUF person-expansion randomness - -- Code: - - `src/microplex_us/data_sources/puf.py` - - `tests/test_puf_source_provider.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - the PE-demographics branch in Microplex was decoding `_puf_agerange`, - `_puf_agedp*`, and `_puf_earnsplit` to fixed midpoints, while - `policyengine-us-data` samples inside those coded bins and also randomizes - spouse/dependent sex assignment - - that is a direct upstream parity bug, not a new modeling idea -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/puf.py 
tests/test_puf_source_provider.py` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons or sample_tax_units'` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'not pre_tax_contributions_via_policyengine_subprocess'` -- Artifacts: - - source-stage parity candidate: - `artifacts/tmp_puf_source_stage_parity_personexpansion_20260412.json` - - donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_donors/broader-donors-puf-personexpansion-v1` - - no-donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_nodonors/broader-nodonors-puf-personexpansion-v1` -- Read: - - raw PUF source-stage parity improved materially on the direct PE boundary: - - age weighted-mean ratio: `1.0367 -> 1.0275` - - employment-income weighted-mean ratio: `1.2196 -> 0.9996` - - taxable-interest weighted-mean ratio: `2.2495 -> 1.1774` - - matched broader no-donor checkpoint improved on the mission metric: - - `0.7368409543 -> 0.7336528770` - - active-solve capped loss: `0.8497778115 -> 0.8005940161` - - matched broader donor checkpoint regressed slightly on capped full-oracle - loss while still improving active-solve loss: - - `0.7327632809 -> 0.7342149723` - - active-solve capped loss: `0.8495978941 -> 0.8037192584` - - conclusion: - - keep the parity fix - - log the donor-path regression explicitly - - treat the donor interaction as the next thing to explain, not as a reason - to restore the old midpoint-decoding bug - -## 2026-04-12 split PUF person-expansion parity fix; keep only `EARNSPLIT` - -- Code: - - `src/microplex_us/data_sources/puf.py` - - `tests/test_puf_source_provider.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - the bundled parity fix was too coarse; it mixed age/sex randomization with - income-split randomization, and the broader donor checkpoint gave only a - slightly negative net result - - the next direct 
move was a matched ablation, not more speculation -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/puf.py tests/test_puf_source_provider.py` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons or sample_tax_units'` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'not pre_tax_contributions_via_policyengine_subprocess'` -- Artifacts: - - donor baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_donor_stateage1_donors/broader-donors-donor-stateage1-v1` - - age/sex-only ablation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_ageonly_donors/broader-donors-puf-personexpansion-ageonly-v1` - - earnsplit-only ablation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_earnsplitonly_donors/broader-donors-puf-personexpansion-earnsplitonly-v1` - - real code-path confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_donors/broader-donors-puf-personexpansion-default-v2` -- Read: - - age/sex-only is clearly harmful on the broader donor frontier: - - `0.7327632809 -> 0.7463902007` - - earnsplit-only is clearly beneficial: - - `0.7327632809 -> 0.7176041064` - - active-solve capped loss: `0.8495978941 -> 0.7726915403` - - the real code-path rerun matches the earnsplit-only ablation exactly - - conclusion: - - keep PE-style `EARNSPLIT` randomization in the default path - - revert PE-style age/sex randomization for now - - treat age-bin randomization as an unresolved parity lane, not a current - default - -## 2026-04-12 widen deferred family focus to 7 after `EARNSPLIT` - -- Code: - - `src/microplex_us/pipelines/pe_us_data_rebuild.py` - - `tests/pipelines/test_pe_us_data_rebuild.py` - - `tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - after the accepted `EARNSPLIT` fix, the 
strongest surviving individual - rows were ACA PTC and rental tails, but the staged selector was still - filling its family slots with AGI and EITC pairs - - the clean test was a one-axis rerun with wider deferred family focus, not - another ad hoc selector change -- Artifacts: - - donor baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_donors/broader-donors-puf-personexpansion-default-v2` - - donor family-7 rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - donor-free baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_nodonors/broader-nodonors-puf-personexpansion-default-v2` - - donor-free confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_nodonors/broader-nodonors-puf-personexpansion-family7-v1` -- Read: - - donor run improves on the mission metric: - - `0.7176041064 -> 0.7044626415` - - donor-free broader run also improves: - - `0.7170633141 -> 0.7039665310` - - the widened focus set includes `aca_ptc` and `rental_income` in both - deferred passes - - fresh residual drilldown now shows: - - ACA/rental mass down sharply - - remaining mass led again by age, AGI, and EITC families - - top individual rows still concentrated in ACA amount and eligibility cells - - conclusion: - - promote `policyengine_calibration_deferred_stage_top_family_count = 7` - into the default rebuild policy - - keep the geography gate at `4` - -## 2026-04-12 reject full PUF age/sex randomization again on top of family-7 - -- Code: - - `src/microplex_us/data_sources/puf.py` was restored to the earnsplit-only - default after the retest - - `tests/test_puf_source_provider.py` was restored to the incumbent - earnsplit-only regression expectations - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- 
Verification: - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons_uses_pe_demographic_helpers_when_present or expand_to_persons_preserves_joint_tax_unit_monetary_totals or expand_to_persons_splits_negative_joint_self_employment_losses or expand_to_persons_clears_status_flags_for_non_head_members'` -- Artifacts: - - current donor incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - full-rng retest: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_rng_donors/broader-donors-puf-personexpansion-rng-v1` -- Read: - - broader donor default still loses with full age/sex randomization: - - `0.7044626415 -> 0.7111876263` - - conclusion: - - keep earnsplit-only PUF person expansion in the default path - - do not reopen this same parity lane until there is a new interaction - hypothesis stronger than “try the rejected thing again” - -## 2026-04-12 keep CPS tax-unit structure at the source boundary - -- implemented source-layer CPS tax-unit role derivation keyed by raw `TAX_ID` - in `src/microplex_us/data_sources/cps.py` - - derive: - - `is_tax_unit_head` - - `is_tax_unit_spouse` - - `is_tax_unit_dependent` - - `tax_unit_is_joint` - - `tax_unit_count_dependents` - - added a focused provider regression in - `tests/test_cps_source_provider.py` -- focused verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py tests/test_cps_source_provider.py` - - `uv run pytest tests/test_cps_source_provider.py -q -k 'derives_tax_unit_roles_from_tax_id or caches_household_geography_on_persons or derives_survivor_and_dependent_social_security or loads_observation_frame or canonical_income_alias'` -- artifact comparison: - - incumbent broader donor default: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - 
source-structure rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_taxunit_structure_donors/broader-donors-cps-taxunit-structure-v1` -- read: - - capped full-oracle loss is exactly unchanged: - - `0.7044626415 -> 0.7044626415` - - conclusion: - - keep this change because it moves CPS tax-unit semantics to the correct - source boundary and removes downstream reconstruction pressure - - do not sell it as a frontier gain; it is architecture cleanup - -## 2026-04-12 reject direct CPS student flag on the broader donor checkpoint - -- tested a narrow EITC-side parity hypothesis: - - materialize `is_full_time_college_student` directly from CPS `A_HSCOL` - in the processed CPS cache -- result on the matched broader donor rerun: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - student-input rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_student_donors/broader-donors-cps-student-v1` - - capped full-oracle loss: - - `0.7044626415 -> 0.7815651801` -- action: - - reverted the student-field addition in `src/microplex_us/data_sources/cps.py` - and the temporary student assertions in `tests/test_cps_source_provider.py` - - reran the focused CPS verification slice after the revert -- interpretation: - - this is another case where a direct PE CPS input is not automatically - plug-compatible with the current mixed-source broader Microplex path - - next upstream work should stay on age/AGI/EITC structure, but not through - this direct student-field promotion - -## 2026-04-12 reject partial preserved tax units as the broader donor default - -- implemented a mixed-preservation path in `src/microplex_us/pipelines/us.py` - - households with complete source `tax_unit_id` values can now keep those IDs - - unresolved households still fall back to `TaxUnitOptimizer` - - added a mixed-household regression in 
`tests/pipelines/test_us.py` -- focused verification: - - `python -m py_compile src/microplex_us/pipelines/us.py tests/pipelines/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'preserve_existing_tax_unit_ids or falls_back_when_existing_tax_unit_ids_cross_households or partially_preserves_existing_tax_unit_ids'` -- artifact comparison: - - incumbent broader donor default: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - partial-preservation rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_partial_preserve_taxunits_donors/broader-donors-partial-preserve-taxunits-v1` -- read: - - capped full-oracle loss regresses slightly: - - `0.7044626415 -> 0.7055670761` - - active-solve capped loss improves: - - `0.7909211525 -> 0.7648463685` - - conclusion: - - do not flip the broader default to preserved tax units - - keep the code path available for future targeted runs, but move the next - upstream work off this boundary and back to AGI/EITC inputs - -## 2026-04-12 keep PE-style CPS `ssn_card_type` - -- changed: - - derive PE-style CPS `ssn_card_type` from raw CPS immigration / benefits / - work / housing-assistance fields in - `src/microplex_us/data_sources/cps.py` - - add mixed-source export support plus `CITIZEN` fallback in - `src/microplex_us/policyengine/us.py` - - bump the processed CPS cache version so the new column is materialized in - rebuilt caches - - add focused regressions in - `tests/test_cps_source_provider.py` - and - `tests/policyengine/test_us.py` -- verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py src/microplex_us/policyengine/us.py tests/test_cps_source_provider.py tests/policyengine/test_us.py` - - `uv run pytest tests/test_cps_source_provider.py -q -k 'ssn_card_type or derives_tax_unit_roles_from_tax_id'` - - `uv run pytest tests/policyengine/test_us.py -q -k 
'default_policyengine_us_export_surface or defaults_missing_ssn_card_type_to_citizen'` -- artifact comparison: - - incumbent broader donor default: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - `ssn_card_type` rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - capped full-oracle loss improves: - - `0.7044626415 -> 0.6955460` - - active-solve capped loss improves: - - `0.7909211525 -> 0.7813926586` - - direct `ssn_card_type` family improves sharply: - - `1.0000 -> 0.3786` - - EITC child-count families improve: - - `0.8283 -> 0.7499` - - `0.8154 -> 0.7408` - - aggregate `eitc` gets worse: - - `0.1066 -> 0.2954` -- conclusion: - - keep it - - interpret it narrowly as an identification / child-count improvement - rather than a blanket EITC win - -## 2026-04-12 reject PE-style EITC take-up and voluntary filing inputs - -- prototyped PE-style `takes_up_eitc` and - `would_file_taxes_voluntarily` tax-unit inputs in - `src/microplex_us/pipelines/us.py`, exposed them in - `src/microplex_us/policyengine/us.py`, and added review-driven fallback and - determinism checks before the checkpoint -- verification before the run: - - `python -m py_compile src/microplex_us/pipelines/us.py src/microplex_us/policyengine/us.py tests/pipelines/test_us.py tests/policyengine/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'build_policyengine_entity_tables'` - - `uv run pytest tests/policyengine/test_us.py -q -k 'default_policyengine_us_export_surface or defaults_missing_ssn_card_type_to_citizen'` -- artifact comparison: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_takeup_donors/broader-donors-takeup-v1` -- metric read: - 
- capped full-oracle loss: - - `0.6955460 -> 0.7041134` - - active-solve capped loss: - - `0.7813927 -> 0.7896826` - - EITC child-count families improved, but aggregate `eitc` worsened: - - `0.2954 -> 0.4010` - - ACA amount / count families also worsened: - - `2.3488 -> 2.5737` - - `1.1521 -> 1.3708` -- action: - - revert the take-up / voluntary-filing code path - - keep `broader-donors-ssn-card-type-v1` as the incumbent broader donor - runtime - - do not read this as “drop the concept”; the separation between filing - propensity and EITC take-up remains a structural requirement, but the - attempted late export-layer implementation is not good enough yet - -## 2026-04-12 reject `state_age_floor = 2` on broader donor checkpoints - -- tested a matched broader donor checkpoint with: - - `cps_state_age_floor = 2` - - `donor_state_age_floor = 2` -- artifact comparison: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_stateage2_donors/broader-donors-stateage2-v1` -- metric read: - - capped full-oracle loss: - - `0.6955460 -> 0.7361964` - - active-solve capped loss: - - `0.7813927 -> 0.8371045` - - age improves slightly: - - `0.4681 -> 0.4480` - - but AGI, EITC child-count, and ACA all regress hard enough to dominate - the frontier: - - `0.7119 -> 0.7553` - - `0.6372 -> 0.6618` - - `0.7499 -> 0.8880` - - `0.7408 -> 0.8755` - - `2.3488 -> 2.9982` -- action: - - reject stronger checkpoint age-floor heuristics - - keep the accepted floor-1 incumbent - - move the next experiment to upstream PUF age/AGI construction instead - -## 2026-04-12 reject high-AGI-preserving PUF checkpoint samples - -- prototyped a checkpoint-only PUF sampler that preserved the top AGI tail - whenever `sample_n` was active, then ran the matched broader donor checkpoint -- artifact comparison: - - incumbent: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_agi_tail_donors/broader-donors-puf-agi-tail-v1` -- metric read: - - capped full-oracle loss: - - `0.6955460 -> 1.1132009` - - active-solve capped loss: - - `0.7813927 -> 1.9290` -- action: - - reject it - - revert the sampler path completely - - treat the fast source-stage improvement on dividends / interest as a false - friend unless it survives the real broader checkpoint - -## 2026-04-12 reject standalone ACA take-up construction patch - -- traced the ACA residual lane and confirmed that - `takes_up_aca_if_eligible` is a real PE construction input, not a made-up - Microplex feature -- implemented the narrow probe in `src/microplex_us/pipelines/us.py` and - exposed it in `src/microplex_us/policyengine/us.py`, then verified the local - code path with focused `py_compile` and pytest slices -- because disk pressure made a fresh broader rerun unreliable, reevaluated the - incumbent broader donor synthetic population in memory against the shared - oracle and saved the readout in - `artifacts/tmp_broader_aca_takeup_recalibration_20260412.json` -- read: - - capped full-oracle loss: - - `0.6955460 -> 0.8211989` - - active-solve capped loss: - - `0.7813927 -> 0.7013644` - - ACA families improve sharply: - - `aca_ptc|domain=aca_ptc` - - `2.3488 -> 0.5529` - - `tax_unit_count|domain=aca_ptc` - - `1.1521 -> 0.7112` - - `person_count|domain=aca_ptc,is_aca_ptc_eligible` - - `1.0994 -> 0.7771` -- action: - - revert the patch from the default path - - keep the concept documented as required future parity work - - interpret this as “wrong implementation boundary right now,” not “wrong - concept” - -## 2026-04-12 ACA child gap is mostly Medicaid crowd-out, not missing ACA knobs - -- ACA-specific review conclusion: - - beyond raw `has_marketplace_health_coverage` / `has_esi`, the only 
real - ACA-specific upstream input is `takes_up_aca_if_eligible` - - so there is no large hidden ACA-specific construction surface still - missing from Microplex -- diagnostic comparison: - - compared the incumbent broader donor artifact - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1/policyengine_us.h5` - against PE's `enhanced_cps_2024.h5` - - saved readout: - `artifacts/tmp_broader_aca_eligibility_decomposition_20260412.json` -- read: - - the incumbent has higher under-20 Medicaid/CHIP eligibility than the PE - baseline: - - `eligible_share_under20`: `0.4909 -> 0.6094` - - `medicaid_share_under20`: `0.3930 -> 0.5278` - - the key driver is much lower child-unit `medicaid_income_level` in the - incumbent: - - median under-20 `medicaid_income_level`: - `15.1512 -> 1.6054` - - p75 under-20 `medicaid_income_level`: - `364.3831 -> 3.9464` - - filing-status mix is not the main failure mode; child tax units are simply - too low-income relative to the PE baseline -- action: - - move the next lane to AGI / tax-unit construction and imputation for child - units - - stop treating ACA as primarily an ACA-specific export/input problem - -## 2026-04-13 child-unit income miss is already present before synthesis - -- stage-localized the incumbent broader donor artifact by comparing - `seed_data.parquet`, `synthetic_data.parquet`, and `calibrated_data.parquet` - on under-20 tax-unit income aggregates -- read: - - `seed` and `synthetic` are effectively identical on the child-unit income - surface: - - weighted mean under-20 tax-unit income: - `110304.6 -> 110304.6` - - weighted mean under-20 tax-unit employment income: - `68829.3 -> 68829.3` - - calibration only nudges those values: - - weighted mean under-20 tax-unit income: - `110304.6 -> 108967.8` - - weighted mean under-20 tax-unit employment income: - `68829.3 -> 65923.5` -- action: - - treat the current child-unit AGI / Medicaid-income miss as entering in 
the - seeded integrated microdata before synthesis - - keep the next debugging lane on upstream construction / source-impute - parity rather than calibration - -## 2026-04-13 reject source tax-unit preservation as the broader donor default - -- tested: - - flipped `policyengine_prefer_existing_tax_unit_ids=True` only in the - canonical PE rebuild default - - left the generic build-config default unchanged - - ran the focused rebuild/checkpoint config tests - - got an explorer review; no concrete code-level regressions were identified -- synthetic proxy read: - - preserving source tax-unit IDs still looked slightly better on the cached - synthetic-policyengine comparison: - - `0.63654 -> 0.63583` -- real decision run: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_preserve_taxunits_default_donors/broader-donors-preserve-taxunits-default-v1` -- read: - - capped full-oracle loss regresses slightly: - - `0.6955 -> 0.6977` - - active-solve capped loss improves: - - `0.7814 -> 0.7624` - - selected constraints fall slightly: - - `1031 -> 1019` -- action: - - reverted the default flip in `src/microplex_us/pipelines/pe_us_data_rebuild.py` - and the matching config assertions in the rebuild/checkpoint tests - - kept the optional preservation path available in `src/microplex_us/pipelines/us.py` -- interpretation: - - the structural clue is still real, but the broader donor frontier metric - does not justify making this the default rebuild path yet - - keep the next lane on upstream child-unit AGI / Medicaid-income - construction and source-impute parity - -## 2026-04-13 reject minor-household source tax-unit preservation - -- tested: - - added an opt-in experiment flag that preserved source `tax_unit_id` values - only for households containing a minor and left adult-only households on - the optimizer rebuild path - - 
added a focused preservation regression in `tests/pipelines/test_us.py` -- real decision run: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_minorhousehold_preserve_taxunits_donors/broader-donors-minorhousehold-preserve-taxunits-v1` -- read: - - the child symptom improves sharply: - - under-20 singleton-tax-unit share: - `0.1538 -> 0.0345` - - under-20 mean `medicaid_income_level`: - `2.7279 -> 3.0408` - - but the broader donor frontier metric still regresses: - - capped full-oracle loss: - `0.6955 -> 0.6985` - - active-solve capped loss: - `0.7814 -> 0.7614` -- action: - - reverted the experiment flag and its targeted test -- interpretation: - - preserving child tax-unit structure helps, but it is not the main blocker - anymore - - the next upstream lane has to be AGI component construction for child-linked - tax units - -## 2026-04-13 under-20 AGI miss is now clearly a component-construction problem - -- compared PE baseline, the incumbent broader donor artifact, and the rejected - minor-household-preservation rerun on person-mapped under-20 tax-unit - aggregates -- read: - - under-20 mapped AGI / Medicaid MAGI improve with the rejected structure - probe, but remain far below the PE baseline: - - `adjusted_gross_income`: - `137623.5` (PE) vs `85755.2` (incumbent) vs `98230.0` (minor-preserve) - - `medicaid_magi`: - `140533.9` (PE) vs `86338.8` (incumbent) vs `98586.5` (minor-preserve) - - the remaining miss is in AGI composition: - - `tax_unit_partnership_s_corp_income` stays far too low: - `23323.0` (PE) vs `9568.7` vs `10710.1` - - `net_capital_gains` stays far too low: - `3200.0` (PE) vs `534.3` vs `945.7` - - `qualified_dividend_income` remains zero in both Microplex artifacts - - `tax_exempt_interest_income` remains zero in both Microplex artifacts -- action: - - move the next direct-path work off 
tax-unit-preservation variants and onto - AGI component construction / source-impute parity for child-linked units - -## 2026-04-13 reject PE-style sequential PUF joint-QRF imputation in the current donor runtime - -- tested: - - added a non-default `sequential_qrf` donor-imputer backend for the main PUF - AGI leaf lane and grouped the key tax variables into one joint block when - that backend was selected - - added focused regressions, verified the challenger path locally, then ran - matched medium and broader donor checkpoints -- real decision runs: - - medium candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_sequential_puf_joint_medium/medium-donors-sequential-puf-joint-v1` - - broader candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_sequential_puf_joint_donors/broader-donors-sequential-puf-joint-v1` - - incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - the broader donor frontier metric regresses: - - capped full-oracle loss: - `0.6955 -> 0.7190` - - active-solve capped loss: - `0.7814 -> 0.7757` - - selected constraints: - `1031 -> 999` - - the medium donor rerun is also not attractive: - - capped full-oracle loss: - `0.9426` - - active-solve capped loss: - `0.6618` - - a direct matched CPS+PUF stage probe on a `1000/1000` sample shows the - challenger changes child-linked AGI composition aggressively rather than - cleanly fixing the miss: - - under-20 linked `qualified_dividend_income`: - `40.0 -> 1199.0` - - under-20 linked `taxable_interest_income`: - `507.2 -> 1634.6` - - under-20 linked `tax_exempt_interest_income`: - `4.66 -> 249.4` - - under-20 linked `taxable_pension_income`: - `9118.5 -> 19317.6` -- action: - - rejected the challenger, reverted the experiment code, and kept the - incumbent donor-impute backend -- interpretation: - - the parity clue is still useful because PolicyEngine really does use a more - joint 
QRF architecture for this lane - - but the direct port into the current donor/rank-match runtime is not - numerically safe enough to keep - - the next lane remains narrower AGI component construction / source-impute - parity for child-linked tax units, not a backend replacement - -## 2026-04-13 reject post-donor zeroing of PUF tax leaves on dependent rows - -- tested: - - added a post-donor semantic guard that zeroed selected PE-style PUF tax - leaves on rows with `is_tax_unit_dependent > 0` - - rationale: raw expanded PUF dependents already carry zero for these leaves, - while the incumbent broader donor seed artifact was assigning large - dependent-row mass on `partnership_s_corp_income`, - `taxable_pension_income`, and `taxable_interest_income` -- local diagnostic read: - - the guard did what it was intended to do on the incumbent seed artifact: - - under-20 `partnership_s_corp_income`: - `4.09M -> 87.3k` - - under-20 `taxable_pension_income`: - `17.77M -> 172.6k` - - under-20 `taxable_interest_income`: - `33.98k -> 3.28k` -- real decision run: - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_dependent_zero_tax_leaves_donors/broader-donors-dependent-zero-tax-leaves-v1` - - incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - the broader donor frontier metric regresses badly: - - capped full-oracle loss: - `0.6955 -> 1.1372` - - active-solve capped loss: - `0.7814 -> 1.6581` - - the run starts from a much worse first calibration stage: - - post-stage-1 capped full-oracle loss: - `1.3660` - - deferred stages improve that bad candidate but do not rescue it: - - post-stage-2 capped full-oracle loss: - `1.2460` - - final capped full-oracle loss: - `1.1372` -- action: - - rejected the guard and reverted the code -- interpretation: - - the structural clue is still useful because the dependent-row mass is being - created during donor integration, not 
in raw PUF expansion - - but blunt post-donor zeroing is the wrong repair and should not stay in the - default path - - the next lane remains narrower donor-impute/source-impute parity for these - child-linked tax leaves - -## 2026-04-13 reject dependent-role partitioning inside donor imputation - -- tested: - - added an exact-match partition on `is_tax_unit_dependent` for the three PUF - leaves that were actually exploding on child-linked rows: - `partnership_s_corp_income`, `taxable_pension_income`, - `taxable_interest_income` - - rationale: move the repair to the actual failure point inside donor - imputation, instead of zeroing rows after integration -- real decision run: - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_dependent_partition_tax_leaves_donors/broader-donors-dependent-partition-tax-leaves-v1` - - incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - the broader donor frontier metric regresses even more: - - capped full-oracle loss: - `0.6955 -> 1.2406` - - active-solve capped loss: - `0.7814 -> 1.6943` - - the child-dependent mass is strongly suppressed, but that still does not - help the shared objective: - - under-20 `partnership_s_corp_income`: - `74.5k` - - under-20 `taxable_pension_income`: - `257.4k` - - under-20 `taxable_interest_income`: - `3.33k` -- review: - - an independent review also found correctness risks in the partition - implementation: - - null partition keys would fall through to a global donor fallback - - projected partition labels were lossy after entity projection - - empty donor partitions silently disabled exact-match isolation -- action: - - rejected the experiment and reverted the code -- interpretation: - - the failure point is still donor integration - - but role-suppression heuristics, even inside donor fitting/matching, are not - the right repair - - the next lane should move closer to PE 
source-impute structure for these AGI - leaves rather than adding more support heuristics - -## 2026-04-13 reject richer singleton condition surfaces for the PUF child-linked tax leaves - -- tested: - - expanded the preferred donor-condition surface for - `partnership_s_corp_income`, `taxable_interest_income`, and - `taxable_pension_income` beyond the PE-style demographic predictors to also - use current income state - - kept the current donor backend and singleton block structure unchanged - - added focused regressions that the richer predictors resolved only for these - leaves and that `income` was actually added to the resolved condition set - when available -- verification: - - `python -m py_compile src/microplex_us/variables.py tests/test_variables.py tests/pipelines/test_us.py` - - `uv run pytest tests/test_variables.py tests/pipelines/test_us.py -q -k 'puf_irs_predictors or pe_style_puf_predictors_for_generic_irs_vars or donor_imputation_block_specs or augment_donor_condition_frame_for_targets_derives_pe_style_puf_predictors'` -- real decision run: - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_income_aware_puf_tax_leaves_donors/broader-donors-income-aware-puf-tax-leaves-v1` - - incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - the broader donor frontier metric regresses: - - capped full-oracle loss: - `0.6955 -> 0.7420` - - active-solve capped loss: - `0.7814 -> 0.8499` - - selected constraints: - `1031 -> 1027` - - the candidate does improve across deferred stages, but never catches the - incumbent: - - post-stage-1 capped full-oracle loss: - `0.8326` - - post-stage-2 capped full-oracle loss: - `0.7879` - - final capped full-oracle loss: - `0.7420` -- PE code read: - - this explains why the shortcut loses: PolicyEngine does not solve these - leaves with richer singleton donor surfaces - - they live inside one sequential PUF QRF 
pass, with only - `taxable_pension_income` also touching the separate ACS donor path -- action: - - rejected the richer singleton condition-surface patch and reverted the code -- interpretation: - - widening singleton condition surfaces is still the wrong abstraction for - this lane - - local code read confirms these are PUF-native leaves entering the build - through the PUF provider before the donor-survey sources, not current - explicit direct-override variables - - the next step should move toward the real structure gap in how PUF tax - leaves enter the build, not pile more predictors onto the generic donor path - -## 2026-04-13 reject standalone PUF-native QRF hook for three child-linked AGI leaves - -- tested: - - added a temporary PUF-provider QRF hook at tax-unit load time for - `partnership_s_corp_income`, `taxable_interest_income`, and - `taxable_pension_income` - - kept the rest of the donor integration and calibration path unchanged -- verification: - - focused `py_compile` passed - - focused `tests/test_puf_source_provider.py` slices passed before the real - rerun -- real decision run: - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_puf_tax_leaf_qrf_donors/broader-donors-puf-tax-leaf-qrf-v1` - - incumbent baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - the broader donor frontier metric regresses hard: - - capped full-oracle loss: - `0.6955 -> 0.8729` - - active-solve capped loss: - `0.7814 -> 1.1545` - - selected constraints: - `1031 -> 1064` -- action: - - rejected the provider-hook experiment and reverted the code -- interpretation: - - the right lesson is not “more QRF earlier” - - a standalone PUF-side QRF hook, without the rest of PolicyEngine’s - sequential clone/impute structure, is still the wrong runtime shape for - this lane diff --git a/_WORKSPACE.md b/_WORKSPACE.md deleted file mode 100644 index ec492333..00000000 --- 
a/_WORKSPACE.md +++ /dev/null @@ -1,103 +0,0 @@ -# _WORKSPACE.md - -This file is the durable local context for `microplex-us`. - -## Repo role - -`microplex-us` is the US country pack. It should specialize core `microplex`, not fork it conceptually. - -Core repo: - -- [`/Users/maxghenis/PolicyEngine/microplex`](/Users/maxghenis/PolicyEngine/microplex) - -Sibling country pack: - -- [`/Users/maxghenis/PolicyEngine/microplex-uk`](/Users/maxghenis/PolicyEngine/microplex-uk) - -## Current high-value modules - -### PolicyEngine-US - -- `src/microplex_us/policyengine/us.py` -- `src/microplex_us/policyengine/harness.py` -- `src/microplex_us/policyengine/comparison.py` - -### Local reweighting - -- `src/microplex_us/pipelines/local_reweighting.py` - -### Source semantics / manifests - -- `src/microplex_us/variables.py` -- `src/microplex_us/data_sources/` -- `src/microplex_us/manifests/` - -## Current architectural boundary - -US should keep local: - -- PE-US microsimulation/materialization details -- US target database/provider specifics -- raw CPS/PUF and other US source mappings - -US should not keep local if it generalizes: - -- benchmark math -- benchmark suite/result types -- reweighting math -- generic target querying/filtering -- long-lived eval-repo benchmark orchestration for method bakeoffs - -## Important current caveat - -US tax filing units may eventually be policy-endogenous. Avoid hard-baking tax-unit structure too deeply into shared abstractions. - -## Current mission metric - -- The real US mission is no longer generic parity improvement. It is to beat PolicyEngine on the PE-native broad loss frontier. -- The main comparator should be matched-size PE baselines: - - `Microplex@N` vs `PE@N` - - ideally `PE@N` should be reweighted/recalibrated after sampling -- Full `enhanced_cps_2024` remains the stretch reference, not the only pass/fail bar. - -## Current benchmark guidance - -- Use common-target comparisons when claiming candidate vs baseline wins. 
-- Composite parity loss remains useful as a diagnostic, but it is not the US mission metric. -- PE-native broad loss is the canonical mission score for US frontier work. -- Do not assume larger `N` should help automatically on the current path; non-monotonicity has already shown that record support and optimizer alignment are still imperfect. - -## Current diagnostic read - -- Post-export direct optimization on the exact PE-native broad objective is now available. -- On a fixed `2000`-household exported candidate, direct PE-native optimization improved loss only trivially (`0.92334 -> 0.92290`). -- Current read: objective mismatch is real, but the larger bottleneck is still record construction/support, not just the final weight objective. -- The next high-leverage path is full-support candidate construction plus budgeted household selection, not more small-candidate entropy or donor A/B loops. - -## Selection backends - -- Household-budgeted selection now has two backends in the US pipeline: - - `sparse` - - `pe_native_loss` -- `pe_native_loss` is the cleaner experimental backend because it ranks/selects households using the actual PE-native loss surface on an exported candidate. -- Until the full-support `pe_native_loss` selector run lands, do not port this architecture to UK. - -## High-signal tests - -- `tests/policyengine/test_comparison.py` -- `tests/policyengine/test_harness.py` -- `tests/policyengine/test_us.py` -- `tests/pipelines/test_local_reweighting.py` -- `tests/test_share_imputation.py` - -## Working rule - -If the same helper starts appearing in both PE-US and PE-UK benchmark/reweighting flows, promote it to core. - -## Review handoff - -- Current durable Claude request: - - `/Users/maxghenis/PolicyEngine/microplex-us/reviews/PENDING_CLAUDE_REVIEW.md` -- Full saved reviews belong under: - - `/Users/maxghenis/PolicyEngine/microplex-us/reviews/` -- `_BUILD_LOG.md` should only keep a concise review summary, not the full review body. 
diff --git a/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329/20260329T175330Z-057066af/pe_us_data_rebuild_parity.json b/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329/20260329T175330Z-057066af/pe_us_data_rebuild_parity.json deleted file mode 100644 index 45fdf928..00000000 --- a/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329/20260329T175330Z-057066af/pe_us_data_rebuild_parity.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "artifactDir": "/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_native_cps_puf_rich_broad_fixed_20260329/20260329T175330Z-057066af", - "artifactId": "20260329T175330Z-057066af", - "baselineSlice": { - "baselineDatasetPath": "/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5", - "baselineLabel": "policyengine_us_data", - "calibrationTargetProfile": "pe_native_broad", - "candidateLabel": "microplex", - "comparisonMetadata": { - "base_experiment_name": "cps+puf-rich-pe-native-broad-fixed", - "baseline_dataset": "enhanced_cps_2024.h5", - "calibration_target_profile": "pe_native_broad", - "experiment_name": "cps+puf-rich-pe-native-broad-fixed-n2000", - "harness_slice_names": [ - "all_targets" - ], - "harness_suite": "policyengine_us_all_targets", - "mission": "beat_pe_native_loss", - "n_synthetic": 2000, - "policyengine_us_runtime_version": "1.587.0", - "scope": "broad_fixed", - "sources": [ - "cps_asec", - "irs_soi_puf" - ], - "sweep_parameter": "n_synthetic", - "target_domains": [], - "target_geo_levels": [], - "target_period": 2024, - "target_profile": "pe_native_broad", - "target_reform_id": 0, - "target_variables": [], - "targets_db": "policy_data.db" - }, - "datasetYear": 2024, - "targetPeriod": 2024, - "targetProfile": "pe_native_broad", - "targetsDbPath": "/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/calibration/policy_data.db" - }, - "comparison": { - "policyengineHarness": { - "available": true, - 
"baseline_composite_parity_loss": 168283.32348760363, - "baseline_mean_abs_relative_error": 90041.20274843804, - "baseline_supported_target_rate": 0.09564296119719243, - "candidate_composite_parity_loss": 0.9671507721030934, - "candidate_mean_abs_relative_error": 0.9650507960258878, - "composite_parity_loss_delta": -168282.35633683152, - "isPolicyEngineComparison": true, - "mean_abs_relative_error_delta": -90040.237697642, - "period": 2024, - "slice_win_rate": 1.0, - "supported_target_rate": 0.09564296119719243, - "tag_summaries": { - "all_targets": { - "baseline_attribute_macro_mean_abs_relative_error": 59582.92670221516, - "baseline_attribute_tail_mean_abs_relative_error": 579573.9387208556, - "baseline_composite_parity_loss": 168283.32348760363, - "baseline_mean_abs_relative_error": 90041.20274843804, - "baseline_micro_mean_abs_relative_error": 90041.20274843805, - "baseline_supported_target_rate": 0.09564296119719243, - "candidate_attribute_macro_mean_abs_relative_error": 0.956382760630277, - "candidate_attribute_tail_mean_abs_relative_error": 1.021066616965775, - "candidate_composite_parity_loss": 0.9671507721030934, - "candidate_mean_abs_relative_error": 0.9650507960258878, - "candidate_micro_mean_abs_relative_error": 0.9650507960258878, - "composite_parity_loss_delta": -168282.35633683152, - "mean_abs_relative_error_delta": -90040.237697642, - "slice_win_rate": 1.0, - "supported_target_rate": 0.09564296119719243, - "target_win_rate": 0.18332871780670174 - }, - "benchmark": { - "baseline_attribute_macro_mean_abs_relative_error": 59582.92670221516, - "baseline_attribute_tail_mean_abs_relative_error": 579573.9387208556, - "baseline_composite_parity_loss": 168283.32348760363, - "baseline_mean_abs_relative_error": 90041.20274843804, - "baseline_micro_mean_abs_relative_error": 90041.20274843805, - "baseline_supported_target_rate": 0.09564296119719243, - "candidate_attribute_macro_mean_abs_relative_error": 0.956382760630277, - 
"candidate_attribute_tail_mean_abs_relative_error": 1.021066616965775, - "candidate_composite_parity_loss": 0.9671507721030934, - "candidate_mean_abs_relative_error": 0.9650507960258878, - "candidate_micro_mean_abs_relative_error": 0.9650507960258878, - "composite_parity_loss_delta": -168282.35633683152, - "mean_abs_relative_error_delta": -90040.237697642, - "slice_win_rate": 1.0, - "supported_target_rate": 0.09564296119719243, - "target_win_rate": 0.18332871780670174 - } - }, - "target_win_rate": 0.18332871780670174 - }, - "policyengineNativeScores": { - "available": true, - "baseline_enhanced_cps_native_loss": 0.020243908529428433, - "baseline_unweighted_msre": 0.02091315896964405, - "candidate_beats_baseline": false, - "candidate_enhanced_cps_native_loss": 0.9585613389048618, - "candidate_unweighted_msre": 0.9576662035111907, - "enhanced_cps_native_loss_delta": 0.9383174303754334, - "isPolicyEngineComparison": true, - "metric": "enhanced_cps_native_loss", - "n_national_targets": 641, - "n_state_targets": 2176, - "n_targets_bad_dropped": 10, - "n_targets_kept": 2817, - "n_targets_total": 2829, - "n_targets_zero_dropped": 10, - "period": 2024, - "unweighted_msre_delta": 0.9367530445415466 - } - }, - "evidence": { - "manifest": { - "exists": true, - "file": "manifest.json", - "source": "artifact_bundle" - }, - "policyengineHarness": { - "exists": true, - "file": "policyengine_harness.json", - "source": "artifact_bundle" - }, - "policyengineNativeScores": { - "exists": true, - "file": "policyengine_native_scores.json", - "source": "artifact_bundle" - } - }, - "profileConformance": { - "differingKeyCount": 8, - "differingKeys": [ - { - "expected": "qrf", - "key": "donor_imputer_backend", - "observed": "maf" - }, - { - "expected": "pe_prespecified", - "key": "donor_imputer_condition_selection", - "observed": "top_correlated" - }, - { - "expected": 100000, - "key": "n_synthetic", - "observed": 2000 - }, - { - "expected": null, - "key": 
"policyengine_calibration_target_profile", - "observed": "pe_native_broad" - }, - { - "expected": null, - "key": "policyengine_dataset_year", - "observed": 2024 - }, - { - "expected": null, - "key": "policyengine_target_period", - "observed": 2024 - }, - { - "expected": null, - "key": "policyengine_target_profile", - "observed": "pe_native_broad" - }, - { - "expected": "seed", - "key": "synthesis_backend", - "observed": "bootstrap" - } - ], - "exactMatch": false, - "matchingKeyCount": 48 - }, - "program": { - "programId": "pe-us-data-rebuild-v1", - "stageStatuses": { - "cps-construction": "partial", - "entity-export-parity": "partial", - "extended-cps-qrf": "partial", - "family-imputation-parity": "partial", - "puf-ingestion-uprating": "partial", - "source-contracts": "partial", - "targets-and-eval": "close", - "weighting-backend": "close" - }, - "title": "Rebuild PE-US-data in Microplex" - }, - "schemaVersion": 1, - "verdict": { - "candidateBeatsHarnessCompositeParityLoss": true, - "candidateBeatsHarnessMeanAbsRelativeError": true, - "candidateBeatsNativeBroadLoss": false, - "hasRealPolicyEngineComparison": true - } -} diff --git a/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json b/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json deleted file mode 100644 index a96f775b..00000000 --- a/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json +++ /dev/null @@ -1,467 +0,0 @@ -{ - "artifactRoots": [ - "artifacts/live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "artifacts/live_pe_us_data_rebuild_checkpoint_20260408_modelpass", - "artifacts/live_pe_us_data_rebuild_checkpoint_20260409_modelpass" - ], - "bestRuns": [ - { - "artifactPath": "smoke-cps-puf-only-20260409-no-cal-rescale-proper-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": 
"national_irs_other", - "largestRegressingFamilyDelta": 0.196275731648123, - "largestRegressingTarget": null, - "lossDelta": 0.62817110482181, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260409-no-cal-rescaled-calibratedshape-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "national_irs_other", - "largestRegressingFamilyDelta": 0.19963828456608318, - "largestRegressingTarget": null, - "lossDelta": 0.6367445882623051, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260410-countcal-default-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 0.3214476167416313, - "largestRegressingTarget": null, - "lossDelta": 0.8667642407973689, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260410-cpsincome-countcal-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 0.3214476167416313, - "largestRegressingTarget": null, - "lossDelta": 0.8667642407973689, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260407-nocalraw-v1", - "artifactRoot": 
"live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "national_irs_other", - "largestRegressingFamilyDelta": 0.30122078732152846, - "largestRegressingTarget": null, - "lossDelta": 0.9705530107645463, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260407-truenocal-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "national_irs_other", - "largestRegressingFamilyDelta": 0.30122078732152846, - "largestRegressingTarget": null, - "lossDelta": 0.9705530107645463, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260407-countyfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_aca_spending", - "largestRegressingFamilyDelta": 0.3632270641676908, - "largestRegressingTarget": null, - "lossDelta": 1.0540849017736476, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_aca_spending", - "national_irs_other", - "state_aca_enrollment" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260407-hohoverride-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_aca_spending", - "largestRegressingFamilyDelta": 0.3632270641676908, - "largestRegressingTarget": null, - "lossDelta": 1.0540849017736476, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_aca_spending", - "national_irs_other", - "state_aca_enrollment" - ] - }, 
- { - "artifactPath": "smoke-cps-puf-only-20260410-cpsincome-demog-nocal-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_aca_spending", - "largestRegressingFamilyDelta": 0.2923340703099431, - "largestRegressingTarget": null, - "lossDelta": 1.1059974897026443, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_aca_spending", - "national_irs_other", - "state_agi_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260410-pufconds-nocal-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_aca_spending", - "largestRegressingFamilyDelta": 0.2943020646639436, - "largestRegressingTarget": null, - "lossDelta": 1.1096703824516962, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_aca_spending", - "national_irs_other", - "state_agi_distribution" - ] - } - ], - "familyCountsByRoot": { - "live_pe_us_data_rebuild_checkpoint_20260407_modelpass": [ - { - "count": 21, - "family": "national_irs_other" - }, - { - "count": 19, - "family": "state_agi_distribution" - }, - { - "count": 13, - "family": "state_aca_spending" - }, - { - "count": 8, - "family": "state_age_distribution" - }, - { - "count": 2, - "family": "state_aca_enrollment" - } - ], - "live_pe_us_data_rebuild_checkpoint_20260408_modelpass": [ - { - "count": 18, - "family": "national_irs_other" - }, - { - "count": 18, - "family": "state_aca_spending" - }, - { - "count": 17, - "family": "state_agi_distribution" - }, - { - "count": 1, - "family": "state_aca_enrollment" - } - ], - "live_pe_us_data_rebuild_checkpoint_20260409_modelpass": [ - { - "count": 27, - "family": "national_irs_other" - }, - { - "count": 27, - "family": "state_agi_distribution" - }, - { - "count": 23, - "family": "state_aca_spending" - }, - { - "count": 4, - 
"family": "state_age_distribution" - } - ] - }, - "largestFamilyCounts": [ - { - "count": 34, - "family": "state_agi_distribution" - }, - { - "count": 25, - "family": "national_irs_other" - }, - { - "count": 7, - "family": "state_aca_spending" - } - ], - "missingCriticalInputsCounts": [], - "targetCountsFromAudits": [ - { - "count": 2, - "target": "state/NM/adjusted_gross_income/count/500000_inf" - }, - { - "count": 2, - "target": "state/WI/adjusted_gross_income/count/500000_inf" - }, - { - "count": 1, - "target": "nation/irs/aca_spending/mi" - }, - { - "count": 1, - "target": "nation/irs/count/count/AGI in 1m-inf/taxable/Single" - }, - { - "count": 1, - "target": "state/KY/adjusted_gross_income/count/500000_inf" - } - ], - "top3FamilyCounts": [ - { - "family": "national_irs_other", - "rank1Count": 25, - "rank2Count": 41, - "rank3Count": 0, - "top3Count": 66 - }, - { - "family": "state_agi_distribution", - "rank1Count": 34, - "rank2Count": 18, - "rank3Count": 11, - "top3Count": 63 - }, - { - "family": "state_aca_spending", - "rank1Count": 7, - "rank2Count": 7, - "rank3Count": 40, - "top3Count": 54 - }, - { - "family": "state_age_distribution", - "rank1Count": 0, - "rank2Count": 0, - "rank3Count": 12, - "top3Count": 12 - }, - { - "family": "state_aca_enrollment", - "rank1Count": 0, - "rank2Count": 0, - "rank3Count": 3, - "top3Count": 3 - } - ], - "totalAuditedRuns": 7, - "totalScoredRuns": 66, - "worstRuns": [ - { - "artifactPath": "smoke-cps-puf-donors-20260407-pretax-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 10.749089662361879, - "largestRegressingTarget": null, - "lossDelta": 16.049868854019003, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_age_distribution" - ] - }, - { - "artifactPath": 
"smoke-cps-puf-donors-20260407-pufparityfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 10.749089662361879, - "largestRegressingTarget": null, - "lossDelta": 16.049868854019003, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-demographyfix-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "national_irs_other", - "largestRegressingFamilyDelta": 9.30368450058258, - "largestRegressingTarget": null, - "lossDelta": 11.352893897255619, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-demographyfix-v3", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "national_irs_other", - "largestRegressingFamilyDelta": 9.30368450058258, - "largestRegressingTarget": null, - "lossDelta": 11.352893897255619, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-statusfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": true, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 5.826379190259378, - "largestRegressingTarget": "state/WI/adjusted_gross_income/count/500000_inf", - "lossDelta": 10.792989501722793, - 
"missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_aca_spending" - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260408-sparseirs-semantics-smoke-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260408_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 5.877938302384082, - "largestRegressingTarget": null, - "lossDelta": 8.448860731815127, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_aca_spending" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-entityidfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 4.545408699346589, - "largestRegressingTarget": null, - "lossDelta": 7.291624527856844, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_aca_spending" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-hohfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": true, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 4.539702054022589, - "largestRegressingTarget": "state/WI/adjusted_gross_income/count/500000_inf", - "lossDelta": 7.220330647606508, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_aca_spending" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-pesoi-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": true, - "candidateBeatsBaseline": false, - "largestRegressingFamily": 
"national_irs_other", - "largestRegressingFamilyDelta": 4.190878910137539, - "largestRegressingTarget": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "lossDelta": 6.632127832091368, - "missingStoredCriticalInputs": [], - "top3Families": [ - "national_irs_other", - "state_agi_distribution", - "state_age_distribution" - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-pretaxh5-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "auditAvailable": false, - "candidateBeatsBaseline": false, - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingFamilyDelta": 3.936771953338614, - "largestRegressingTarget": null, - "lossDelta": 6.363579076831835, - "missingStoredCriticalInputs": [], - "top3Families": [ - "state_agi_distribution", - "national_irs_other", - "state_aca_spending" - ] - } - ] -} diff --git a/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json b/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json deleted file mode 100644 index 3c743f40..00000000 --- a/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json +++ /dev/null @@ -1,579 +0,0 @@ -{ - "artifactRoots": [ - "artifacts/live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "artifacts/live_pe_us_data_rebuild_checkpoint_20260408_modelpass", - "artifacts/live_pe_us_data_rebuild_checkpoint_20260409_modelpass" - ], - "auditsWhereFamilyLeads": 2, - "auditsWithMatchingTargets": 7, - "family": "national_irs_other", - "leadAudits": [ - { - "artifactPath": "smoke-cps-puf-donors-20260407-modelpass-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "national_irs_other", - "largestRegressingTarget": "nation/irs/aca_spending/mi", - "matchingTargets": [ - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 419.22091234084394 - }, - { - 
"target": "nation/irs/total pension income/count/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 318.56012890551284 - }, - { - "target": "nation/irs/capital gains gross/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 282.09144998889815 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 189.71500648697602 - }, - { - "target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDelta": 145.16270011995235 - } - ], - "topFilingStatusGaps": [ - { - "filingStatus": "SINGLE", - "weightedCountDelta": 21510790.42587143 - }, - { - "filingStatus": "JOINT", - "weightedCountDelta": 11493054.950603567 - }, - { - "filingStatus": "SEPARATE", - "weightedCountDelta": -6525838.501988618 - }, - { - "filingStatus": "SURVIVING_SPOUSE", - "weightedCountDelta": -1742820.5991458818 - }, - { - "filingStatus": "HEAD_OF_HOUSEHOLD", - "weightedCountDelta": 895324.6843228303 - } - ], - "topMFSAgiGaps": [ - { - "agiBin": "100k_to_200k", - "weightedCountDelta": -441713.13540649414 - }, - { - "agiBin": "75k_to_100k", - "weightedCountDelta": -365034.6506958008 - }, - { - "agiBin": "200k_to_500k", - "weightedCountDelta": -98224.08876037598 - }, - { - "agiBin": "500k_plus", - "weightedCountDelta": -37282.45879718475 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-pesoi-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "national_irs_other", - "largestRegressingTarget": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "matchingTargets": [ - { - "target": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "weightedTermDelta": 5086.753544890369 - }, - { - "target": "nation/irs/partnership and s corp income/total/AGI in 40k-50k/taxable/All", - "weightedTermDelta": 1200.3627182464477 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 1m-inf/taxable/All", - "weightedTermDelta": 601.0202309634736 - }, - { - 
"target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDelta": 458.95151799685226 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 408.13469397662254 - }, - { - "target": "nation/irs/count/count/AGI in 1m-2m/taxable/All", - "weightedTermDelta": 395.54238294868156 - }, - { - "target": "nation/irs/partnership and s corp income/count/AGI in 40k-50k/taxable/All", - "weightedTermDelta": 359.4286801357562 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 1m-inf/taxable/Single", - "weightedTermDelta": 320.1043740427233 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 1m-2m/taxable/All", - "weightedTermDelta": 309.35422488688454 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in -inf-inf/taxable/All", - "weightedTermDelta": 287.0264080183984 - } - ], - "topFilingStatusGaps": [ - { - "filingStatus": "HEAD_OF_HOUSEHOLD", - "weightedCountDelta": -18367409.86255217 - }, - { - "filingStatus": "JOINT", - "weightedCountDelta": 15601585.239666067 - }, - { - "filingStatus": "SINGLE", - "weightedCountDelta": 14233834.339933932 - }, - { - "filingStatus": "SEPARATE", - "weightedCountDelta": -2925373.7285511177 - }, - { - "filingStatus": "SURVIVING_SPOUSE", - "weightedCountDelta": -683411.5991458818 - } - ], - "topMFSAgiGaps": [ - { - "agiBin": "200k_to_500k", - "weightedCountDelta": 875668.098739624 - }, - { - "agiBin": "75k_to_100k", - "weightedCountDelta": 490184.2243041992 - }, - { - "agiBin": "100k_to_200k", - "weightedCountDelta": -441713.13540649414 - }, - { - "agiBin": "500k_plus", - "weightedCountDelta": -37282.45879718475 - } - ] - } - ], - "leadFilingStatusGapSummary": [ - { - "count": 2, - "filingStatus": "SINGLE", - "meanAbsWeightedCountDelta": 17872312.38290268, - "negativeCount": 0, - "positiveCount": 2, - "weightedCountDeltaSum": 35744624.76580536 - }, - { - "count": 2, - "filingStatus": "JOINT", - "meanAbsWeightedCountDelta": 
13547320.095134817, - "negativeCount": 0, - "positiveCount": 2, - "weightedCountDeltaSum": 27094640.190269634 - }, - { - "count": 2, - "filingStatus": "HEAD_OF_HOUSEHOLD", - "meanAbsWeightedCountDelta": 9631367.2734375, - "negativeCount": 1, - "positiveCount": 1, - "weightedCountDeltaSum": -17472085.17822934 - }, - { - "count": 2, - "filingStatus": "SEPARATE", - "meanAbsWeightedCountDelta": 4725606.115269868, - "negativeCount": 2, - "positiveCount": 0, - "weightedCountDeltaSum": -9451212.230539735 - }, - { - "count": 2, - "filingStatus": "SURVIVING_SPOUSE", - "meanAbsWeightedCountDelta": 1213116.0991458818, - "negativeCount": 2, - "positiveCount": 0, - "weightedCountDeltaSum": -2426232.1982917637 - } - ], - "leadMFSAgiGapSummary": [ - { - "agiBin": "200k_to_500k", - "count": 2, - "meanAbsWeightedCountDelta": 486946.09375, - "negativeCount": 1, - "positiveCount": 1, - "weightedCountDeltaSum": 777444.009979248 - }, - { - "agiBin": "100k_to_200k", - "count": 2, - "meanAbsWeightedCountDelta": 441713.13540649414, - "negativeCount": 2, - "positiveCount": 0, - "weightedCountDeltaSum": -883426.2708129883 - }, - { - "agiBin": "75k_to_100k", - "count": 2, - "meanAbsWeightedCountDelta": 427609.4375, - "negativeCount": 1, - "positiveCount": 1, - "weightedCountDeltaSum": 125149.57360839844 - }, - { - "agiBin": "500k_plus", - "count": 2, - "meanAbsWeightedCountDelta": 37282.45879718475, - "negativeCount": 2, - "positiveCount": 0, - "weightedCountDeltaSum": -74564.9175943695 - } - ], - "leadTargetCounts": [ - { - "count": 2, - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDeltaMean": 413.67780315873324, - "weightedTermDeltaSum": 827.3556063174665 - }, - { - "count": 2, - "target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDeltaMean": 302.0571090584023, - "weightedTermDeltaSum": 604.1142181168046 - }, - { - "count": 1, - "target": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - 
"weightedTermDeltaMean": 5086.753544890369, - "weightedTermDeltaSum": 5086.753544890369 - }, - { - "count": 1, - "target": "nation/irs/partnership and s corp income/total/AGI in 40k-50k/taxable/All", - "weightedTermDeltaMean": 1200.3627182464477, - "weightedTermDeltaSum": 1200.3627182464477 - }, - { - "count": 1, - "target": "nation/irs/ordinary dividends/total/AGI in 1m-inf/taxable/All", - "weightedTermDeltaMean": 601.0202309634736, - "weightedTermDeltaSum": 601.0202309634736 - }, - { - "count": 1, - "target": "nation/irs/count/count/AGI in 1m-2m/taxable/All", - "weightedTermDeltaMean": 395.54238294868156, - "weightedTermDeltaSum": 395.54238294868156 - }, - { - "count": 1, - "target": "nation/irs/partnership and s corp income/count/AGI in 40k-50k/taxable/All", - "weightedTermDeltaMean": 359.4286801357562, - "weightedTermDeltaSum": 359.4286801357562 - }, - { - "count": 1, - "target": "nation/irs/adjusted gross income/total/AGI in 1m-inf/taxable/Single", - "weightedTermDeltaMean": 320.1043740427233, - "weightedTermDeltaSum": 320.1043740427233 - }, - { - "count": 1, - "target": "nation/irs/total pension income/count/AGI in 500k-1m/taxable/All", - "weightedTermDeltaMean": 318.56012890551284, - "weightedTermDeltaSum": 318.56012890551284 - }, - { - "count": 1, - "target": "nation/irs/adjusted gross income/total/AGI in 1m-2m/taxable/All", - "weightedTermDeltaMean": 309.35422488688454, - "weightedTermDeltaSum": 309.35422488688454 - } - ], - "matchingAudits": [ - { - "artifactPath": "smoke-cps-puf-donors-20260407-modelpass-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "national_irs_other", - "largestRegressingTarget": "nation/irs/aca_spending/mi", - "matchingTargets": [ - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 419.22091234084394 - }, - { - "target": "nation/irs/total pension income/count/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 
318.56012890551284 - }, - { - "target": "nation/irs/capital gains gross/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 282.09144998889815 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 189.71500648697602 - }, - { - "target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDelta": 145.16270011995235 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-pesoi-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "national_irs_other", - "largestRegressingTarget": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "matchingTargets": [ - { - "target": "nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "weightedTermDelta": 5086.753544890369 - }, - { - "target": "nation/irs/partnership and s corp income/total/AGI in 40k-50k/taxable/All", - "weightedTermDelta": 1200.3627182464477 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 1m-inf/taxable/All", - "weightedTermDelta": 601.0202309634736 - }, - { - "target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDelta": 458.95151799685226 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 408.13469397662254 - }, - { - "target": "nation/irs/count/count/AGI in 1m-2m/taxable/All", - "weightedTermDelta": 395.54238294868156 - }, - { - "target": "nation/irs/partnership and s corp income/count/AGI in 40k-50k/taxable/All", - "weightedTermDelta": 359.4286801357562 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 1m-inf/taxable/Single", - "weightedTermDelta": 320.1043740427233 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 1m-2m/taxable/All", - "weightedTermDelta": 309.35422488688454 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in -inf-inf/taxable/All", - "weightedTermDelta": 287.0264080183984 - } - ] - }, - { - 
"artifactPath": "medium-cps-puf-donors-20260407-v2", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingTarget": "state/KY/adjusted_gross_income/count/500000_inf", - "matchingTargets": [ - { - "target": "nation/irs/total pension income/count/AGI in 1m-inf/taxable/All", - "weightedTermDelta": 750.5585096058483 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 200k-500k/taxable/All", - "weightedTermDelta": 250.35660581400418 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 1m-inf/taxable/All", - "weightedTermDelta": 209.32445764672067 - }, - { - "target": "nation/irs/capital gains gross/total/AGI in 200k-500k/taxable/All", - "weightedTermDelta": 197.34019622748488 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 200k-500k/taxable/All", - "weightedTermDelta": 188.28378308831282 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-hohfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingTarget": "state/WI/adjusted_gross_income/count/500000_inf", - "matchingTargets": [ - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 546.9401135923162 - }, - { - "target": "nation/irs/count/count/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDelta": 301.2332536699099 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDelta": 250.6928253148586 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 20k-25k/taxable/All", - "weightedTermDelta": 235.4072424916251 - }, - { - "target": "nation/irs/business net profits/total/AGI in 15k-20k/taxable/All", - "weightedTermDelta": 231.10002301208237 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 500k-1m/taxable/All", - 
"weightedTermDelta": 224.5421823309171 - }, - { - "target": "nation/irs/estate income/count/AGI in -inf-inf/taxable/All", - "weightedTermDelta": 171.5270556028451 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-donors-20260407-statusfix-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260407_modelpass", - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingTarget": "state/WI/adjusted_gross_income/count/500000_inf", - "matchingTargets": [ - { - "target": "nation/irs/count/count/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDelta": 2393.640154631822 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDelta": 1976.8697656996242 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 1364.3587575986378 - }, - { - "target": "nation/irs/capital gains gross/count/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 671.88662693973 - }, - { - "target": "nation/irs/exempt interest/total/AGI in -inf-inf/taxable/All", - "weightedTermDelta": 435.28090824504244 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260408-no-fs-code-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260408_modelpass", - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingTarget": "state/NM/adjusted_gross_income/count/500000_inf", - "matchingTargets": [ - { - "target": "nation/irs/taxable interest income/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 191.07157601007822 - }, - { - "target": "nation/irs/taxable interest income/total/AGI in 30k-40k/taxable/All", - "weightedTermDelta": 78.47103597657797 - } - ] - }, - { - "artifactPath": "smoke-cps-puf-only-20260409-interestonly-v1", - "artifactRoot": "live_pe_us_data_rebuild_checkpoint_20260409_modelpass", - "largestRegressingFamily": "state_agi_distribution", - "largestRegressingTarget": "state/NM/adjusted_gross_income/count/500000_inf", - 
"matchingTargets": [ - { - "target": "nation/irs/count/count/AGI in 20k-25k/taxable/Head of Household", - "weightedTermDelta": 383.73795080903983 - }, - { - "target": "nation/irs/adjusted gross income/total/AGI in 20k-25k/taxable/Head of Household", - "weightedTermDelta": 289.51483080158965 - }, - { - "target": "nation/irs/taxable interest income/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 109.66172682016968 - }, - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDelta": 88.7647064061777 - } - ] - } - ], - "matchingTargetCounts": [ - { - "count": 5, - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "weightedTermDeltaMean": 565.4838367829195, - "weightedTermDeltaSum": 2827.419183914598 - }, - { - "count": 3, - "target": "nation/irs/capital gains gross/count/AGI in 500k-1m/taxable/All", - "weightedTermDeltaMean": 362.04793858587436, - "weightedTermDeltaSum": 1086.143815757623 - }, - { - "count": 2, - "target": "nation/irs/count/count/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDeltaMean": 1347.436704150866, - "weightedTermDeltaSum": 2694.873408301732 - }, - { - "count": 2, - "target": "nation/irs/adjusted gross income/total/AGI in 25k-30k/taxable/Head of Household", - "weightedTermDeltaMean": 1113.7812955072413, - "weightedTermDeltaSum": 2227.5625910144827 - }, - { - "count": 2, - "target": "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - "weightedTermDeltaMean": 302.0571090584023, - "weightedTermDeltaSum": 604.1142181168046 - }, - { - "count": 2, - "target": "nation/irs/ordinary dividends/total/AGI in 200k-500k/taxable/All", - "weightedTermDeltaMean": 234.9033336201827, - "weightedTermDeltaSum": 469.8066672403654 - }, - { - "count": 2, - "target": "nation/irs/taxable interest income/total/AGI in 500k-1m/taxable/All", - "weightedTermDeltaMean": 150.36665141512395, - "weightedTermDeltaSum": 300.7333028302479 - }, - { - "count": 1, - "target": 
"nation/irs/count/count/AGI in 1m-inf/taxable/Single", - "weightedTermDeltaMean": 5086.753544890369, - "weightedTermDeltaSum": 5086.753544890369 - }, - { - "count": 1, - "target": "nation/irs/partnership and s corp income/total/AGI in 40k-50k/taxable/All", - "weightedTermDeltaMean": 1200.3627182464477, - "weightedTermDeltaSum": 1200.3627182464477 - }, - { - "count": 1, - "target": "nation/irs/total pension income/count/AGI in 1m-inf/taxable/All", - "weightedTermDeltaMean": 750.5585096058483, - "weightedTermDeltaSum": 750.5585096058483 - } - ], - "totalAudits": 7 -} diff --git a/dashboard/app.js b/dashboard/app.js deleted file mode 100644 index 668fb228..00000000 --- a/dashboard/app.js +++ /dev/null @@ -1,506 +0,0 @@ -"use strict"; - -const DEFAULT_DATA_URL = "../artifacts/pe_native_target_diagnostics_current.json"; -const TABLE_LIMIT = 500; - -const state = { - data: null, - search: "", - family: "all", - scope: "all", - winner: "all", - dbMatch: "all", - sort: "weighted_term_delta:asc", -}; - -const el = { - dashboard: document.getElementById("dashboard"), - emptyState: document.getElementById("emptyState"), - fileInput: document.getElementById("fileInput"), - loadStatus: document.getElementById("loadStatus"), - kpiTargets: document.getElementById("kpiTargets"), - kpiToWinLabel: document.getElementById("kpiToWinLabel"), - kpiWinRate: document.getElementById("kpiWinRate"), - kpiLossDelta: document.getElementById("kpiLossDelta"), - kpiLossPair: document.getElementById("kpiLossPair"), - kpiDbMatch: document.getElementById("kpiDbMatch"), - kpiDbDetail: document.getElementById("kpiDbDetail"), - scopeSummary: document.getElementById("scopeSummary"), - familySummary: document.getElementById("familySummary"), - topImprovements: document.getElementById("topImprovements"), - topRegressions: document.getElementById("topRegressions"), - tableCount: document.getElementById("tableCount"), - searchInput: document.getElementById("searchInput"), - familyFilter: 
document.getElementById("familyFilter"), - scopeFilter: document.getElementById("scopeFilter"), - winnerFilter: document.getElementById("winnerFilter"), - dbFilter: document.getElementById("dbFilter"), - sortSelect: document.getElementById("sortSelect"), - targetTable: document.getElementById("targetTable"), -}; - -function labels() { - const datasetLabels = state.data?.dataset_labels || {}; - return { - from: datasetLabels.from || "baseline", - to: datasetLabels.to || "candidate", - }; -} - -function numberOrNull(value) { - const numeric = Number(value); - return Number.isFinite(numeric) ? numeric : null; -} - -function formatNumber(value, options = {}) { - const numeric = numberOrNull(value); - if (numeric === null) { - return "-"; - } - const abs = Math.abs(numeric); - if (abs >= 1_000_000 || (abs > 0 && abs < 0.001)) { - return numeric.toExponential(2); - } - return new Intl.NumberFormat("en-US", { - maximumFractionDigits: options.maximumFractionDigits ?? 3, - minimumFractionDigits: options.minimumFractionDigits ?? 0, - }).format(numeric); -} - -function formatCompact(value) { - const numeric = numberOrNull(value); - if (numeric === null) { - return "-"; - } - return new Intl.NumberFormat("en-US", { - notation: "compact", - maximumFractionDigits: 2, - }).format(numeric); -} - -function formatPercent(value) { - const numeric = numberOrNull(value); - if (numeric === null) { - return "-"; - } - return new Intl.NumberFormat("en-US", { - style: "percent", - maximumFractionDigits: 1, - }).format(numeric); -} - -function formatSigned(value) { - const numeric = numberOrNull(value); - if (numeric === null) { - return "-"; - } - const sign = numeric > 0 ? 
"+" : ""; - return `${sign}${formatNumber(numeric, { maximumFractionDigits: 4 })}`; -} - -function formatError(value) { - const numeric = numberOrNull(value); - if (numeric === null) { - return "-"; - } - return `${formatNumber(numeric, { maximumFractionDigits: 2 })}%`; -} - -function classForDelta(value) { - const numeric = numberOrNull(value) || 0; - if (numeric < 0) { - return "good"; - } - if (numeric > 0) { - return "bad"; - } - return ""; -} - -function winnerLabel(winner) { - const currentLabels = labels(); - if (winner === "to") { - return currentLabels.to; - } - if (winner === "from") { - return currentLabels.from; - } - return "tie"; -} - -function dbMatchLabel(row) { - const status = row.policyengine_target_match || "unparsed"; - if (status === "matched") { - return row.policyengine_target_id ? `#${row.policyengine_target_id}` : "matched"; - } - if (status === "legacy_only") { - return "legacy only"; - } - if (status === "db_unavailable") { - return "db unavailable"; - } - return status.replaceAll("_", " "); -} - -function summarizeRows(rows) { - const nTargets = rows.length; - const fromWins = rows.filter((row) => row.winner === "from").length; - const toWins = rows.filter((row) => row.winner === "to").length; - const ties = nTargets - fromWins - toWins; - const fromLoss = mean(rows.map((row) => row.from_weighted_term)); - const toLoss = mean(rows.map((row) => row.to_weighted_term)); - return { - n_targets: nTargets, - from_wins: fromWins, - to_wins: toWins, - ties, - from_win_rate: nTargets ? fromWins / nTargets : null, - to_win_rate: nTargets ? 
toWins / nTargets : null, - from_loss: fromLoss, - to_loss: toLoss, - loss_delta: toLoss - fromLoss, - mean_weighted_term_delta: mean(rows.map((row) => row.weighted_term_delta)), - }; -} - -function mean(values) { - const numbers = values.map(Number).filter(Number.isFinite); - if (!numbers.length) { - return null; - } - return numbers.reduce((sum, value) => sum + value, 0) / numbers.length; -} - -function groupSummary(rows, field) { - const grouped = new Map(); - for (const row of rows) { - const key = row[field] || "other"; - if (!grouped.has(key)) { - grouped.set(key, []); - } - grouped.get(key).push(row); - } - return Array.from(grouped.entries()).map(([key, groupRows]) => ({ - [field]: key, - ...summarizeRows(groupRows), - })); -} - -function normalizePayload(payload) { - const rows = Array.isArray(payload.targets) ? payload.targets : []; - return { - ...payload, - summary: payload.summary || summarizeRows(rows), - family_summaries: Array.isArray(payload.family_summaries) - ? payload.family_summaries - : groupSummary(rows, "target_family"), - scope_summaries: Array.isArray(payload.scope_summaries) - ? payload.scope_summaries - : groupSummary(rows, "target_scope"), - top_improvements: Array.isArray(payload.top_improvements) - ? payload.top_improvements - : [...rows] - .sort((a, b) => Number(a.weighted_term_delta) - Number(b.weighted_term_delta)) - .slice(0, 25), - top_regressions: Array.isArray(payload.top_regressions) - ? 
payload.top_regressions - : [...rows] - .sort((a, b) => Number(b.weighted_term_delta) - Number(a.weighted_term_delta)) - .slice(0, 25), - }; -} - -function setData(payload, sourceLabel) { - state.data = normalizePayload(payload); - el.dashboard.hidden = false; - el.emptyState.hidden = true; - el.loadStatus.textContent = sourceLabel; - populateFilters(); - render(); -} - -function showEmpty(message) { - state.data = null; - el.dashboard.hidden = true; - el.emptyState.hidden = false; - el.loadStatus.textContent = message; -} - -async function loadDefault() { - try { - const response = await fetch(`${DEFAULT_DATA_URL}?v=${Date.now()}`, { - cache: "no-store", - }); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - setData(await response.json(), "Default artifact loaded"); - } catch (_error) { - showEmpty("Default artifact unavailable"); - } -} - -function loadFile(file) { - const reader = new FileReader(); - reader.addEventListener("load", () => { - try { - setData(JSON.parse(String(reader.result)), file.name); - } catch (error) { - showEmpty(`Invalid JSON: ${error.message}`); - } - }); - reader.readAsText(file); -} - -function populateSelect(select, label, values) { - const current = select.value || "all"; - select.replaceChildren(); - const allOption = document.createElement("option"); - allOption.value = "all"; - allOption.textContent = label; - select.append(allOption); - for (const value of values) { - const option = document.createElement("option"); - option.value = value; - option.textContent = value; - select.append(option); - } - select.value = values.includes(current) ? 
current : "all"; -} - -function populateFilters() { - const rows = state.data.targets || []; - const families = [...new Set(rows.map((row) => row.target_family || "other"))].sort(); - const scopes = [...new Set(rows.map((row) => row.target_scope || "other"))].sort(); - const dbStatuses = [...new Set(rows.map((row) => row.policyengine_target_match || "unparsed"))].sort(); - populateSelect(el.familyFilter, "All families", families); - populateSelect(el.scopeFilter, "All scopes", scopes); - populateSelect(el.dbFilter, "All DB statuses", dbStatuses); - - const currentLabels = labels(); - el.winnerFilter.replaceChildren(); - for (const [value, label] of [ - ["all", "All winners"], - ["to", currentLabels.to], - ["from", currentLabels.from], - ["tie", "Ties"], - ]) { - const option = document.createElement("option"); - option.value = value; - option.textContent = label; - el.winnerFilter.append(option); - } -} - -function filteredRows() { - const query = state.search.trim().toLowerCase(); - const rows = state.data?.targets || []; - return rows - .filter((row) => { - if (state.family !== "all" && row.target_family !== state.family) { - return false; - } - if (state.scope !== "all" && row.target_scope !== state.scope) { - return false; - } - if (state.winner !== "all" && row.winner !== state.winner) { - return false; - } - if ( - state.dbMatch !== "all" && - (row.policyengine_target_match || "unparsed") !== state.dbMatch - ) { - return false; - } - if (!query) { - return true; - } - return [ - row.target_name, - row.target_family, - row.target_scope, - row.policyengine_target_match, - row.policyengine_target_id, - row.policyengine_target_source, - row.policyengine_target_domain_variable, - ] - .join(" ") - .toLowerCase() - .includes(query); - }) - .sort((a, b) => { - const [field, direction] = state.sort.split(":"); - const av = Number(a[field]); - const bv = Number(b[field]); - const result = Number.isFinite(av) && Number.isFinite(bv) - ? 
av - bv - : String(a[field] || "").localeCompare(String(b[field] || "")); - return direction === "desc" ? -result : result; - }); -} - -function render() { - if (!state.data) { - return; - } - renderKpis(); - renderSummaries(); - renderTargetList(el.topImprovements, state.data.top_improvements || [], true); - renderTargetList(el.topRegressions, state.data.top_regressions || [], false); - renderTable(filteredRows()); -} - -function renderKpis() { - const currentLabels = labels(); - const summary = state.data.summary || {}; - el.kpiTargets.textContent = formatNumber(summary.n_targets); - el.kpiToWinLabel.textContent = `${currentLabels.to} Wins`; - el.kpiWinRate.textContent = formatPercent(summary.to_win_rate); - el.kpiLossDelta.textContent = formatSigned(summary.loss_delta); - el.kpiLossDelta.className = classForDelta(summary.loss_delta); - el.kpiLossPair.textContent = `${formatNumber(summary.from_loss)} -> ${formatNumber(summary.to_loss)}`; - const dbSummary = state.data.target_db_summary || {}; - el.kpiDbMatch.textContent = dbSummary.match_rate === null || dbSummary.match_rate === undefined - ? 
formatNumber(dbSummary.matched) - : formatPercent(dbSummary.match_rate); - el.kpiDbDetail.textContent = `${formatNumber(dbSummary.matched)} matched / ${formatNumber(dbSummary.legacy_only)} legacy`; -} - -function renderSummaries() { - const familyRows = [...(state.data.family_summaries || [])].sort( - (a, b) => Number(a.loss_delta) - Number(b.loss_delta), - ); - const scopeRows = [...(state.data.scope_summaries || [])].sort( - (a, b) => String(a.target_scope).localeCompare(String(b.target_scope)), - ); - renderSummaryList(el.scopeSummary, scopeRows, "target_scope"); - renderSummaryList(el.familySummary, familyRows, "target_family"); -} - -function renderSummaryList(container, rows, field) { - container.replaceChildren(); - for (const row of rows) { - const wrapper = document.createElement("div"); - wrapper.className = "summary-row"; - - const left = document.createElement("div"); - const name = document.createElement("div"); - name.className = "summary-name"; - name.textContent = row[field] || "other"; - const meta = document.createElement("div"); - meta.className = "summary-meta"; - meta.textContent = `${formatNumber(row.n_targets)} targets - ${formatPercent(row.to_win_rate)} wins`; - left.append(name, meta); - - const value = document.createElement("div"); - value.className = `summary-value ${classForDelta(row.loss_delta)}`; - value.textContent = formatSigned(row.loss_delta); - wrapper.append(left, value); - container.append(wrapper); - } -} - -function renderTargetList(container, rows, improvementList) { - container.replaceChildren(); - const displayRows = rows.slice(0, 12); - for (const row of displayRows) { - const wrapper = document.createElement("div"); - wrapper.className = "target-row"; - wrapper.title = row.target_name || ""; - - const left = document.createElement("div"); - const name = document.createElement("div"); - name.className = "target-name"; - name.textContent = row.target_name || "-"; - const meta = document.createElement("div"); - 
meta.className = "target-meta"; - meta.textContent = `${row.target_family || "other"} - ${winnerLabel(row.winner)} - ${dbMatchLabel(row)}`; - left.append(name, meta); - - const delta = document.createElement("div"); - delta.className = `delta ${classForDelta(row.weighted_term_delta)}`; - delta.textContent = formatSigned(row.weighted_term_delta); - if (improvementList && Number(row.weighted_term_delta) > 0) { - delta.classList.add("bad"); - } - wrapper.append(left, delta); - container.append(wrapper); - } -} - -function renderTable(rows) { - el.targetTable.replaceChildren(); - const visibleRows = rows.slice(0, TABLE_LIMIT); - el.tableCount.textContent = rows.length > TABLE_LIMIT - ? `${formatNumber(TABLE_LIMIT)} of ${formatNumber(rows.length)} rows` - : `${formatNumber(rows.length)} rows`; - - const fragment = document.createDocumentFragment(); - for (const row of visibleRows) { - const tr = document.createElement("tr"); - tr.title = row.target_name || ""; - appendCell(tr, row.target_name || "-"); - appendCell(tr, row.target_family || "other"); - appendCell(tr, row.target_scope || "other"); - appendCell(tr, winnerLabel(row.winner), `winner ${row.winner || "tie"}`); - appendCell(tr, formatSigned(row.weighted_term_delta), `mono ${classForDelta(row.weighted_term_delta)}`); - appendCell(tr, formatError(row.from_abs_pct_error), "mono"); - appendCell(tr, formatError(row.to_abs_pct_error), "mono"); - appendCell(tr, formatCompact(row.target_value), "mono"); - appendCell( - tr, - dbMatchLabel(row), - `db-status ${row.policyengine_target_match || "unparsed"}`, - ); - fragment.append(tr); - } - el.targetTable.append(fragment); -} - -function appendCell(row, text, className = "") { - const cell = document.createElement("td"); - cell.textContent = text; - if (className) { - cell.className = className; - } - row.append(cell); -} - -el.fileInput.addEventListener("change", (event) => { - const [file] = event.target.files || []; - if (file) { - loadFile(file); - } -}); - 
-el.searchInput.addEventListener("input", (event) => { - state.search = event.target.value; - render(); -}); - -el.familyFilter.addEventListener("change", (event) => { - state.family = event.target.value; - render(); -}); - -el.scopeFilter.addEventListener("change", (event) => { - state.scope = event.target.value; - render(); -}); - -el.winnerFilter.addEventListener("change", (event) => { - state.winner = event.target.value; - render(); -}); - -el.dbFilter.addEventListener("change", (event) => { - state.dbMatch = event.target.value; - render(); -}); - -el.sortSelect.addEventListener("change", (event) => { - state.sort = event.target.value; - render(); -}); - -loadDefault(); diff --git a/dashboard/index.html b/dashboard/index.html deleted file mode 100644 index 6b276029..00000000 --- a/dashboard/index.html +++ /dev/null @@ -1,156 +0,0 @@ - - - - - - Microplex US Diagnostics - - - - - - - - -
-
-
-

Microplex US

-

Diagnostics

-
-
- -

Loading default artifact

-
-
- - - - -
- - - - diff --git a/dashboard/policyengine-theme.css b/dashboard/policyengine-theme.css deleted file mode 100644 index 861ae0d8..00000000 --- a/dashboard/policyengine-theme.css +++ /dev/null @@ -1,28 +0,0 @@ -/* Generated from the exported PolicyEngine design tokens. - Source: policyengine.org/packages/config/theme.css - Re-run: python scripts/sync_policyengine_theme.py -*/ -:root { ---color-void: #06070a; - --color-bg: #090b10; - --color-elevated: rgba(18, 19, 26, 0.9); - --color-card: rgba(20, 21, 30, 0.78); - --color-surface: rgba(27, 29, 40, 0.96); - --color-border: rgba(244, 239, 230, 0.13); - --color-border-subtle: rgba(244, 239, 230, 0.08); - - --color-text: #f4efe6; - --color-text-secondary: #cbc3b8; - --color-text-muted: #928a7f; - - --color-cyan: #7ce2cf; - --color-cyan-bright: #b9fff0; - --color-cyan-dim: #4aa391; - --color-cyan-ghost: rgba(124, 226, 207, 0.08); - --color-amber: #d5a565; - --color-green: #b0ef9f; - --color-coral: #ff8f6b; - - --ease-out: cubic-bezier(0.16, 1, 0.3, 1); - --ease-spring: cubic-bezier(0.34, 1.56, 0.64, 1); -} diff --git a/dashboard/styles.css b/dashboard/styles.css deleted file mode 100644 index bdafc4f7..00000000 --- a/dashboard/styles.css +++ /dev/null @@ -1,525 +0,0 @@ -* { - box-sizing: border-box; -} - -:root { - --f-display: var(--font-display), Georgia, serif; - --f-body: var(--font-body), "Helvetica Neue", Arial, sans-serif; - --f-mono: var(--font-mono), "Fira Code", ui-monospace, SFMono-Regular, Menlo, - Consolas, monospace; -} - -html { - min-height: 100%; - background: var(--color-void); - color: var(--color-text); - -webkit-font-smoothing: antialiased; - text-rendering: optimizeLegibility; -} - -body { - min-height: 100vh; - margin: 0; - font-family: var(--f-body); - background: - radial-gradient(circle at top left, rgba(124, 226, 207, 0.1), transparent 28%), - radial-gradient(circle at 88% 8%, rgba(213, 165, 101, 0.1), transparent 24%), - linear-gradient(180deg, #07080d 0%, #090b10 48%, #06070a 100%); -} - 
-button, -input, -select { - font: inherit; -} - -code { - display: block; - width: 100%; - overflow-x: auto; - padding: 14px 16px; - border: 1px solid var(--color-border-subtle); - border-radius: 8px; - color: var(--color-cyan-bright); - background: rgba(6, 7, 10, 0.72); - font-family: var(--f-mono); - font-size: 12px; -} - -.grid-bg, -.noise { - position: fixed; - inset: 0; - pointer-events: none; -} - -.grid-bg { - z-index: 0; - background-image: - linear-gradient(rgba(124, 226, 207, 0.03) 1px, transparent 1px), - linear-gradient(90deg, rgba(124, 226, 207, 0.03) 1px, transparent 1px); - background-size: 78px 78px; - mask-image: radial-gradient( - ellipse 74% 62% at 50% 0%, - black 0%, - transparent 100% - ); -} - -.noise { - z-index: 1; - opacity: 0.018; - background-image: url("data:image/svg+xml,%3Csvg viewBox='0 0 256 256' xmlns='http://www.w3.org/2000/svg'%3E%3Cfilter id='n'%3E%3CfeTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='4' stitchTiles='stitch'/%3E%3C/filter%3E%3Crect width='100%25' height='100%25' filter='url(%23n)'/%3E%3C/svg%3E"); -} - -.shell { - position: relative; - z-index: 2; - width: min(1480px, calc(100% - 32px)); - margin: 0 auto; - padding: 32px 0 48px; -} - -.topbar { - display: flex; - align-items: end; - justify-content: space-between; - gap: 24px; - padding: 0 0 24px; - border-bottom: 1px solid var(--color-border-subtle); -} - -.eyebrow { - margin: 0 0 8px; - color: var(--color-cyan); - font-family: var(--f-mono); - font-size: 11px; - line-height: 1; - text-transform: uppercase; -} - -h1, -h2 { - margin: 0; - font-family: var(--f-display); - font-weight: 400; - letter-spacing: 0; -} - -h1 { - font-size: clamp(38px, 6vw, 82px); - line-height: 0.9; -} - -h2 { - color: var(--color-text); - font-size: 18px; - line-height: 1.2; -} - -.load-control { - display: flex; - align-items: center; - gap: 14px; - min-width: min(100%, 420px); - justify-content: flex-end; -} - -.file-button { - position: relative; - display: 
inline-flex; - min-height: 40px; - align-items: center; - border: 1px solid color-mix(in srgb, var(--color-cyan) 42%, transparent); - border-radius: 8px; - padding: 0 16px; - color: var(--color-cyan-bright); - background: var(--color-cyan-ghost); - cursor: pointer; - transition: - border-color 160ms var(--ease-out), - background 160ms var(--ease-out); -} - -.file-button:hover { - border-color: var(--color-cyan); - background: rgba(124, 226, 207, 0.13); -} - -.file-button input { - position: absolute; - inset: 0; - opacity: 0; - cursor: pointer; -} - -.status-text { - margin: 0; - color: var(--color-text-muted); - font-family: var(--f-mono); - font-size: 11px; - line-height: 1.4; -} - -.empty-state { - margin: 64px 0 0; - padding: 32px; - border: 1px solid var(--color-border); - border-radius: 8px; - background: rgba(18, 19, 26, 0.58); -} - -.empty-state h2 { - max-width: 760px; - margin-bottom: 24px; - font-size: clamp(24px, 4vw, 44px); -} - -.kpi-strip { - display: grid; - grid-template-columns: repeat(5, minmax(0, 1fr)); - margin: 24px 0; - border-block: 1px solid var(--color-border-subtle); -} - -.kpi-strip div { - min-width: 0; - padding: 18px 22px; - border-right: 1px solid var(--color-border-subtle); -} - -.kpi-strip div:last-child { - border-right: 0; -} - -.kpi-strip span { - display: block; - margin-bottom: 8px; - color: var(--color-text-muted); - font-family: var(--f-mono); - font-size: 11px; - text-transform: uppercase; -} - -.kpi-strip strong { - display: block; - overflow-wrap: anywhere; - color: var(--color-text); - font-family: var(--f-mono); - font-size: clamp(20px, 2.7vw, 34px); - font-weight: 500; - line-height: 1.05; -} - -.kpi-strip small { - display: block; - margin-top: 7px; - overflow-wrap: anywhere; - color: var(--color-text-muted); - font-family: var(--f-mono); - font-size: 10px; - line-height: 1.3; -} - -.workspace { - display: grid; - grid-template-columns: minmax(280px, 330px) minmax(0, 1fr); - gap: 24px; -} - -.rail, -.main-pane { - 
display: flex; - min-width: 0; - flex-direction: column; - gap: 24px; -} - -.panel { - min-width: 0; - border: 1px solid var(--color-border-subtle); - border-radius: 8px; - background: rgba(18, 19, 26, 0.58); - backdrop-filter: blur(14px); -} - -.section-head { - padding: 18px 20px 16px; - border-bottom: 1px solid var(--color-border-subtle); -} - -.section-head.row { - display: flex; - align-items: end; - justify-content: space-between; - gap: 16px; -} - -.summary-list { - display: flex; - flex-direction: column; -} - -.summary-row { - display: grid; - grid-template-columns: minmax(0, 1fr) auto; - gap: 14px; - padding: 14px 18px; - border-bottom: 1px solid var(--color-border-subtle); -} - -.summary-row:last-child { - border-bottom: 0; -} - -.summary-row:hover, -tbody tr:hover { - background: rgba(124, 226, 207, 0.055); -} - -.summary-name, -.target-name { - min-width: 0; - overflow: hidden; - color: var(--color-text); - text-overflow: ellipsis; - white-space: nowrap; -} - -.summary-meta, -.target-meta { - color: var(--color-text-muted); - font-family: var(--f-mono); - font-size: 11px; -} - -.summary-value { - color: var(--color-cyan-bright); - font-family: var(--f-mono); - font-size: 13px; - text-align: right; - white-space: nowrap; -} - -.dense .summary-row { - padding-block: 11px; -} - -.split { - display: grid; - grid-template-columns: repeat(2, minmax(0, 1fr)); - gap: 24px; -} - -.target-list { - display: flex; - max-height: 484px; - overflow: auto; - flex-direction: column; -} - -.target-row { - display: grid; - grid-template-columns: minmax(0, 1fr) auto; - gap: 16px; - padding: 14px 18px; - border-bottom: 1px solid var(--color-border-subtle); -} - -.target-row:last-child { - border-bottom: 0; -} - -.target-row:hover { - background: rgba(124, 226, 207, 0.055); -} - -.delta { - font-family: var(--f-mono); - font-size: 13px; - text-align: right; - white-space: nowrap; -} - -.delta.good, -.winner.to { - color: var(--color-green); -} - -.delta.bad, -.winner.from { 
- color: var(--color-coral); -} - -.winner.tie { - color: var(--color-amber); -} - -.table-panel { - overflow: hidden; -} - -.filters { - display: grid; - grid-template-columns: minmax(180px, 1.4fr) repeat(5, minmax(120px, 1fr)); - gap: 10px; - padding: 16px 20px; - border-bottom: 1px solid var(--color-border-subtle); -} - -input, -select { - min-width: 0; - min-height: 38px; - border: 1px solid var(--color-border); - border-radius: 6px; - padding: 0 11px; - color: var(--color-text); - background: rgba(6, 7, 10, 0.58); - outline: none; -} - -input:focus, -select:focus { - border-color: color-mix(in srgb, var(--color-cyan) 58%, var(--color-border)); - box-shadow: 0 0 0 3px rgba(124, 226, 207, 0.08); -} - -.table-wrap { - max-height: 620px; - overflow: auto; -} - -table { - width: 100%; - border-collapse: collapse; - table-layout: fixed; -} - -th, -td { - overflow: hidden; - border-bottom: 1px solid var(--color-border-subtle); - padding: 11px 12px; - text-align: left; - text-overflow: ellipsis; - white-space: nowrap; -} - -th { - position: sticky; - top: 0; - z-index: 1; - color: var(--color-text-muted); - background: rgba(9, 11, 16, 0.98); - font-family: var(--f-mono); - font-size: 10px; - font-weight: 500; - text-transform: uppercase; -} - -td { - color: var(--color-text-secondary); - font-size: 13px; -} - -td.mono { - font-family: var(--f-mono); - font-size: 12px; -} - -td:first-child, -th:first-child { - width: 30%; -} - -td:nth-child(2), -th:nth-child(2) { - width: 14%; -} - -td:nth-child(3), -th:nth-child(3), -td:nth-child(4), -th:nth-child(4) { - width: 8%; -} - -td:nth-child(5), -th:nth-child(5), -td:nth-child(6), -th:nth-child(6), -td:nth-child(7), -th:nth-child(7), -td:nth-child(8), -th:nth-child(8), -td:nth-child(9), -th:nth-child(9) { - width: 8%; -} - -.db-status.matched { - color: var(--color-cyan-bright); -} - -.db-status.legacy_only, -.db-status.ambiguous { - color: var(--color-amber); -} - -.db-status.db_unavailable, -.db-status.unparsed { - color: 
var(--color-text-muted); -} - -@media (max-width: 1080px) { - .workspace, - .split, - .kpi-strip { - grid-template-columns: 1fr; - } - - .kpi-strip div { - border-right: 0; - border-bottom: 1px solid var(--color-border-subtle); - } - - .kpi-strip div:last-child { - border-bottom: 0; - } - - .filters { - grid-template-columns: 1fr 1fr; - } -} - -@media (max-width: 680px) { - .shell { - width: min(100% - 20px, 1480px); - padding-top: 20px; - } - - .topbar, - .load-control { - align-items: stretch; - flex-direction: column; - } - - .load-control { - justify-content: flex-start; - } - - .filters { - grid-template-columns: 1fr; - } - - .section-head.row { - align-items: start; - flex-direction: column; - } - - th, - td { - padding-inline: 10px; - } -} diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index b5ea7c0a..00000000 --- a/docs/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# microplex-us docs - -- [Architecture](./architecture.md) -- [Canonical pipeline stages](./pipeline-stages.md) -- [Stage contracts and manifests](./stage-contracts.md) -- [API reference](./api.md) -- [Source semantics](./source-semantics.md) -- [Imputation conditioning contract](./imputation-conditioning-contract.md) -- [Benchmarking](./benchmarking.md) -- [Methodology ledger](./methodology-ledger.md) -- [PolicyEngine oracle compatibility path](./policyengine-oracle-compatibility.md) -- [PE construction parity](./pe-construction-parity.md) -- [Superseding `policyengine-us-data`](./superseding-policyengine-us-data.md) -- [Hugging Face artifact publishing](./huggingface-artifact-publishing.md) - -This doc set is intentionally technical. It is meant to answer seven questions: - -1. What is the current architecture? -2. How do source semantics and variable semantics drive donor integration? -3. What is structurally required in imputation conditioning, and what is still - experimental? -4. Which construction contracts currently match PE, and which are only - compatible? -5. 
How do we measure progress against `policyengine-us-data` on real targets? -6. What is the actual roadmap for fully superseding `policyengine-us-data`? -7. Which methodological choices are currently canonical, provisional, or open? - -The docs describe the code that exists today. They do not try to freeze a final -paper narrative while the architecture is still moving. diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 957cb37c..00000000 --- a/docs/_config.yml +++ /dev/null @@ -1,18 +0,0 @@ -title: microplex-us -author: PolicyEngine - -execute: - execute_notebooks: "off" - -repository: - url: https://github.com/PolicyEngine/microplex-us - path_to_book: docs - branch: main - -sphinx: - config: - html_theme: sphinx_book_theme - autodoc_member_order: bysource - extra_extensions: - - sphinx.ext.autodoc - - sphinx.ext.napoleon diff --git a/docs/_toc.yml b/docs/_toc.yml deleted file mode 100644 index 9cbd510e..00000000 --- a/docs/_toc.yml +++ /dev/null @@ -1,13 +0,0 @@ -format: jb-book -root: README -chapters: - - file: pipeline-stages - - file: stage-contracts - - file: api - - file: architecture - - file: source-semantics - - file: imputation-conditioning-contract - - file: benchmarking - - file: pe-construction-parity - - file: superseding-policyengine-us-data - - file: huggingface-artifact-publishing diff --git a/docs/aca-ptc-multiplier-source-choice.md b/docs/aca-ptc-multiplier-source-choice.md deleted file mode 100644 index e25d6714..00000000 --- a/docs/aca-ptc-multiplier-source-choice.md +++ /dev/null @@ -1,113 +0,0 @@ -# ACA PTC Multiplier Source Choice - -This records the first Microplex-US reconstruction of -`policyengine-us-data`'s `aca_ptc_multipliers_2022_2024.csv` from Arch -publisher-source consumer facts. 
- -## Recipe - -Inputs: - -- KFF full-year average marketplace effectuated enrollment, 2022 and 2024 -- CMS 2022 OEP state-level average monthly APTC -- CMS 2024 OEP state-level average monthly APTC -- CMS full-year 2022 effectuated-enrollment workbook average monthly APTC - -Source selection: - -- `enroll_2022` and `enroll_2024`: KFF full-year effectuated enrollment -- `aptc_2024`: CMS 2024 OEP average monthly APTC -- `aptc_2022`: CMS 2022 OEP average monthly APTC where published, with CMS - full-year 2022 average monthly APTC as fallback - -Derived columns: - -- `vol_mult = enroll_2024 / enroll_2022` -- `val_mult = aptc_2024 / aptc_2022` -- PE's state `tax_unit_count` factor uses `vol_mult` -- PE's state `aca_ptc` amount factor uses `vol_mult * val_mult` - -## Reproduction - -Build the five Arch source-package suites, then run: - -```bash -uv run microplex-us-build-aca-ptc-multipliers \ - /tmp/mp-aca-ptc-arch-sources/kff-2022/consumer_facts.jsonl \ - /tmp/mp-aca-ptc-arch-sources/kff-2024/consumer_facts.jsonl \ - /tmp/mp-aca-ptc-arch-sources/cms-oep-2022/consumer_facts.jsonl \ - /tmp/mp-aca-ptc-arch-sources/cms-oep-2024/consumer_facts.jsonl \ - /tmp/mp-aca-ptc-arch-sources/cms-effectuated-2022/consumer_facts.jsonl \ - --out /tmp/mp-aca-ptc-arch-sources/aca_ptc_multipliers_2022_2024.csv -``` - -The 2026-05-12 run wrote 51 rows. Compared with PE's incumbent -`policyengine_us_data/storage/aca_ptc_multipliers_2022_2024.csv`: - -- state set matches -- `enroll_2022` matches for all 51 states -- `enroll_2024` matches for all 51 states -- `vol_mult` matches for all 51 states -- `aptc_2024` matches for all 51 states -- `aptc_2022` differs for 22 states -- `val_mult` differs for the same 22 states - -## PE Incumbent Provenance Trace - -The local `policyengine-us-data` history does not contain a generator for the -incumbent CSV. 
`git log --follow` shows the file first appearing at its current -path in `8d2c49fa15a515e2379d1b4b5e2c1856a1d4ebe9` on 2026-02-11: -`Add hierarchical uprating notebook, fix verification, move ACA PTC -multipliers`. The commit adds -`policyengine_us_data/storage/aca_ptc_multipliers_2022_2024.csv` directly, plus -notebooks which document that ACA PTC factors are loaded from the CSV and -described as CMS/KFF enrollment data. Those notebooks do not show row-level -source derivation. - -Spot checks against the raw CMS 2022 OEP state-level source support the -Microplex-US source choice for the mismatching states where OEP publishes a -number. For example, current Arch-selected OEP values are New Jersey `489`, New -Mexico `460`, and Virginia `506`, matching the CMS OEP -`APTC_Cnsmr_Avg_APTC` column. The PE incumbent has `504`, `534`, and `407` for -those states, respectively. Nevada remains the explicit fallback case because -the CMS 2022 OEP state-level file reports no Nevada average monthly APTC fact; -Microplex-US uses the CMS full-year effectuated-enrollment value `429.75`. - -## Reconciliation Queue - -States not listed matched PE's incumbent CSV exactly. For listed states, the -Microplex-US value is the Arch publisher-source value selected by the recipe -above. Nevada is the known CMS full-year fallback case because the CMS 2022 OEP -state-level source package has no Nevada average monthly APTC fact. 
- -| State | PE aptc_2022 | Microplex-US aptc_2022 | PE val_mult | Microplex-US val_mult | -| --- | ---: | ---: | ---: | ---: | -| Nevada | 435 | 429.75 | 1.006896551724138 | 1.019197207678883 | -| New Jersey | 504 | 489 | 1.0337301587301588 | 1.065439672801636 | -| New Mexico | 534 | 460 | 1.0318352059925093 | 1.1978260869565218 | -| New York | 364 | 363 | 1.25 | 1.2534435261707988 | -| North Carolina | 583 | 579 | 0.9571183533447685 | 0.9637305699481865 | -| North Dakota | 436 | 452 | 0.9931192660550459 | 0.9579646017699115 | -| Ohio | 479 | 437 | 1.0396659707724425 | 1.139588100686499 | -| Oklahoma | 577 | 558 | 0.9965337954939342 | 1.0304659498207884 | -| Oregon | 503 | 489 | 1.0417495029821073 | 1.0715746421267893 | -| Pennsylvania | 523 | 501 | 1.0133843212237095 | 1.0578842315369261 | -| Rhode Island | 427 | 403 | 1.063231850117096 | 1.1265508684863523 | -| South Carolina | 566 | 512 | 0.9770318021201413 | 1.080078125 | -| South Dakota | 649 | 640 | 0.9414483821263482 | 0.9546875 | -| Tennessee | 572 | 543 | 1.013986013986014 | 1.0681399631675874 | -| Texas | 539 | 502 | 0.9944341372912802 | 1.0677290836653386 | -| Utah | 385 | 370 | 1.0935064935064935 | 1.1378378378378378 | -| Vermont | 620 | 566 | 1.132258064516129 | 1.2402826855123674 | -| Virginia | 407 | 506 | 0.995085995085995 | 0.8003952569169961 | -| Washington | 438 | 437 | 1.0342465753424657 | 1.036613272311213 | -| West Virginia | 1057 | 1002 | 0.97918637653737 | 1.032934131736527 | -| Wisconsin | 562 | 530 | 1.0177935943060499 | 1.079245283018868 | -| Wyoming | 873 | 812 | 0.9885452462772051 | 1.062807881773399 | - -Open reconciliation decision: - -- Treat the Microplex-US output as the publisher-source reconstruction. -- Treat PE byte parity as a separate legacy-compatibility target. Do not add - overrides unless a row-level legacy source or intentional source-choice table - is supplied. 
diff --git a/docs/api.md b/docs/api.md deleted file mode 100644 index 6f0b66f1..00000000 --- a/docs/api.md +++ /dev/null @@ -1,105 +0,0 @@ -# API reference - -## Stage contracts - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_contracts - :members: - :undoc-members: -``` - -## Stage manifests - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_manifest - :members: - :undoc-members: -``` - -### Stage Manifest Internals - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_manifest_types - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_manifest_builder - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_manifest_io - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_status - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_metrics - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_data_flow - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_policyengine_artifacts - :members: - :undoc-members: - -.. automodule:: microplex_us.pipelines.stage_validation_evidence - :members: - :undoc-members: -``` - -## Stage artifacts - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_artifacts - :members: - :undoc-members: -``` - -## Conditional readiness - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_readiness - :members: - :undoc-members: -``` - -## Stage run writer - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_run - :members: - :undoc-members: -``` - -## Stage runtime writer - -```{eval-rst} -.. automodule:: microplex_us.pipelines.stage_runtime - :members: - :undoc-members: -``` - -## Artifact helpers - -```{eval-rst} -.. automodule:: microplex_us.pipelines.artifacts - :members: - :undoc-members: -``` - -## US pipeline - -```{eval-rst} -.. 
automodule:: microplex_us.pipelines.us - :members: - :undoc-members: -``` - -Generic source, fusion, synthesis, and calibration primitives live in the core -`microplex` package. See the core `microplex` API docs for those library-level -interfaces. diff --git a/docs/arch-target-gap-queue.md b/docs/arch-target-gap-queue.md deleted file mode 100644 index f3332d05..00000000 --- a/docs/arch-target-gap-queue.md +++ /dev/null @@ -1,138 +0,0 @@ -# Arch Target Gap Queue - -The Arch target gap queue is a Microplex-side review tool. It compares a -Microplex target profile to a queryable Arch target DB and emits rows that help -humans or agents decide what Arch source work is missing. - -The queue does not make Arch own Microplex target selection. Profile membership, -source aging, reconciliation, activation, and model-variable aliases remain in -`microplex-us`. - -## Boundary Rules - -- Arch stores publisher/source facts with provenance, constraints, periods, - geography, and source lineage. -- Arch should not duplicate a source fact only because Microplex names a model - variable differently. -- Microplex adapters may map one Arch source fact into simulator-specific target - semantics. For example, Arch - `irs_soi.returns_with_income_tax_after_credits` can satisfy the - PolicyEngine `income_tax_positive` count target because SOI Table 1.1 reports - the count of returns with positive income tax after credits. -- A gap row is an authoring hint, not proof that a source exists. -- Rows marked as source-mapping review or deprioritized must be reviewed before - assigning loader work to agents. - -## Categories - -`gap_category` is the high-level agent-readiness taxonomy: - -| Category | Meaning | Default action | -| --- | --- | --- | -| `covered` | An Arch target record already satisfies the target cell. | No task. | -| `ready_primary_loader` | The expected publisher source and Arch variable shape are known, but the record is missing. | Assign source-loader/spec work. 
| -| `ready_rollup_or_geography` | The Arch variable exists but not at the requested geography. | Add rollup/geography records or review source geography. | -| `adapter_or_constraint_review` | The Arch variable exists at the geography, but filters or adapter matching do not cover the cell. | Review constraints and adapter mapping. | -| `source_mapping_review` | The queue cannot identify a defensible source fact or Arch variable shape. | Human source-mapping review first. | -| `survey_or_model_input_deprioritized` | The cell is currently treated as a survey/model-input proxy rather than a primary administrative source task. | Defer unless a primary source is identified. | - -`loader_status` is the lower-level diagnostic used to derive the category. Use -`gap_category` for agent routing and `loader_status` for debugging why a cell -landed there. - -## Current PolicyEngine Profile Boundary - -`pe_native_broad` keeps the raw PolicyEngine parity surface intact. It includes -all currently tracked broad target cells, including survey/model-input rows and -cells whose publisher-source semantics still need review. - -`pe_native_broad_source_backed` is the Arch-backed calibration/profile boundary. -It excludes only cells with explicit reasons in -`src/microplex_us/policyengine/target_profiles.py`, such as: - -- SOI multi-domain cells that would require joint AGI, filing status, and - positive income-tax-before-credits facts not currently published by the loaded - SOI packages -- survey-heavy or model-input cells such as rent, child support, - non-Part-B medical premium/expense components, SPM capped expenses, and - `ssn_card_type` -- source-near but non-equivalent rows such as `childcare_expenses`, where IRS - credit expenses and W-2 dependent-care benefits are narrower tax concepts -- pregnancy stock by state, where live births are a flow rather than a direct - source fact for the PolicyEngine target - -## Current Local Snapshot - -Snapshot date: 2026-05-22. 
- -Set `$POLICYENGINE_ROOT` to the local checkout directory that contains the -`arch` repository. - -Inputs: - -- `$POLICYENGINE_ROOT/arch/arch/fixtures/consumer_facts.jsonl` -- `$POLICYENGINE_ROOT/arch/macro/targets.db` -- `/tmp/arch-suite-hhs-acf-tanf-caseload-2024/consumer_facts.jsonl` -- `/tmp/arch-suite-soi-historic-table-2-2022/consumer_facts.jsonl` -- `/tmp/arch-suite-hhs-acf-liheap-fy2024-national-profile/consumer_facts.jsonl` -- `/tmp/arch-suite-soi-historic-table-2-state-agi-2022/consumer_facts.jsonl` -- `/tmp/arch-suite-soi-w2-statistics-2020/consumer_facts.jsonl` -- `/tmp/arch-suite-soi-table-1-4-2023/consumer_facts.jsonl` -- `/tmp/arch-suite-federal-reserve-z1-household-net-worth/consumer_facts.jsonl` -- `/tmp/arch-suite-cms-medicare-trustees-report-2025-part-b-premium-income/consumer_facts.jsonl` - -Command: - -```bash -uv run --extra policyengine microplex-us-arch-target-refresh \ - --arch-targets-db "$POLICYENGINE_ROOT/arch/arch/fixtures/consumer_facts.jsonl" \ - --arch-targets-db "$POLICYENGINE_ROOT/arch/macro/targets.db" \ - --arch-targets-db /tmp/arch-suite-hhs-acf-tanf-caseload-2024/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-soi-historic-table-2-2022/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-hhs-acf-liheap-fy2024-national-profile/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-soi-historic-table-2-state-agi-2022/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-soi-w2-statistics-2020/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-soi-table-1-4-2023/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-federal-reserve-z1-household-net-worth/consumer_facts.jsonl \ - --arch-targets-db /tmp/arch-suite-cms-medicare-trustees-report-2025-part-b-premium-income/consumer_facts.jsonl \ - --period 2024 \ - --profile pe_native_broad_source_backed \ - --output-dir artifacts/arch-target-coverage-source-backed -``` - -Coverage: - -- 174 target cells in `pe_native_broad_source_backed` -- 174 
covered -- 0 uncovered -- 100.0% coverage - -The raw `pe_native_broad` profile is at 174 of 189 covered with 15 explicitly -reviewed rows outside the source-backed boundary. Federal Reserve Z.1 household -net worth and CMS Medicare Trustees Report Part B premium income are now -source-backed. - -| Category | Rows | -| --- | ---: | -| `adapter_or_constraint_review` | 3 | -| `source_mapping_review` | 2 | -| `survey_or_model_input_deprioritized` | 10 | - -Generated outputs: - -- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_coverage.json` -- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_gaps.json` -- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_gaps.csv` -- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_summary.md` -- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_coverage.json` -- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_gaps.json` -- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_gaps.csv` -- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_summary.md` - -Remaining work is concentrated in: - -- the raw `pe_native_broad` cells excluded from the source-backed profile, if a - future primary publisher source can support them without changing semantics -- keeping the UK source-backed/raw boundary aligned with the same rule: leave - raw PE target rows visible, and exclude only rows where source equivalence is - not defensible diff --git a/docs/architecture.md b/docs/architecture.md deleted file mode 100644 index 2727a617..00000000 --- a/docs/architecture.md +++ /dev/null @@ -1,92 +0,0 @@ -# Architecture - -`microplex-us` is the US-specific country package built on top of the generic -`microplex` engine. 
- -## Package split - -- `microplex`: generic engine pieces - - source descriptors and observation frames - - fusion planning - - synthesis and calibration - - canonical target spec and provider protocol - - generic geography and entity abstractions -- `microplex-us`: US-specific implementations - - CPS, PUF, and other source providers - - PE-US target import and compilation - - PE-US export and evaluation - - US experiment, registry, and artifact layers - -## Current build flow - -Main entrypoint: - -- `microplex_us.pipelines.USMicroplexPipeline` - -Current broad flow: - -1. Load one or more `SourceProvider`s into `ObservationFrame`s. -2. Build a `FusionPlan` from the source descriptors. -3. Choose a public structured scaffold source. -4. Prepare canonical seed data from the scaffold. -5. Integrate donor-only variables from other sources using source and variable - capability metadata, with donor-block-specific automatic condition selection, - declared condition-entity policy, and native-entity projection when entity - IDs are available. -6. Synthesize a new population. -7. Build PolicyEngine-style entity tables. -8. Materialize PE-derived features needed by targets. -9. Calibrate against PE-US DB targets. -10. Export a PE-ingestable H5 and evaluate against the full active target set. - -Important files: - -- `src/microplex_us/pipelines/us.py` -- `src/microplex_us/policyengine/us.py` -- `src/microplex_us/policyengine/comparison.py` -- `src/microplex_us/pipelines/artifacts.py` -- `src/microplex_us/pipelines/index_db.py` - -## What is already true - -- The package is library-first. The core build, artifact saving, experiment - running, and frontier tracking all live in importable APIs. -- PolicyEngine evaluation uses the real `policyengine-us-data` targets DB as - truth targets. 
-- Saved runs persist: - - artifact bundle - - `policyengine_harness.json` - - `run_registry.jsonl` - - `run_index.duckdb` - -## What is not final yet - -- Broad PE-US parity is not stable yet. -- The current US path is still scaffold-plus-donors rather than a fully - symmetric multientity latent-population model. -- Held-out target evaluation is not the default loop yet. -- Local-area production replacement is still future work. - -## Design direction - -The intended long-run shape is: - -- canonical source metadata -- canonical variable semantics -- multientity fusion -- derived-variable materialization after atomic modeling -- target compilation as a generic feature/filter/aggregation problem - -The current implementation is already moving in that direction: - -- canonical target spec -- source capability registry -- variable semantic registry -- donor block specs with declared match strategies -- donor block specs with declared condition-entity policy -- variable semantics with declared projection aggregation for group-level donor fits -- automatic donor condition selection from source overlap plus data signal -- native-entity donor execution for tax-unit-native blocks when IDs are present -- full-target PE-US harness - -But it is still an actively evolving system, not a finished paper architecture. diff --git a/docs/b2-downstream-validation-v11.md b/docs/b2-downstream-validation-v11.md deleted file mode 100644 index 6e811fc2..00000000 --- a/docs/b2-downstream-validation-v11.md +++ /dev/null @@ -1,49 +0,0 @@ -# B2 downstream validation (v11-per-stage-lambda) - -Run date: 2026-04-22 -Artifact: `artifacts/live_pe_us_data_rebuild_checkpoint_20260421_v11_per_stage_lambda/v11-per-stage-lambda/policyengine_us.h5` -Period: 2024 -Method: `scripts/run_b2_batched.py` with batch_size=50_000 for income_tax, 100_000 for aca_ptc, full-dataset for the rest. -Comparison framework: `microplex_us.validation.downstream.DOWNSTREAM_BENCHMARKS_2024`. 
- -## Results - -| Variable | Computed | Benchmark | Rel error | Source | -|----------|---------:|----------:|---------:|--------| -| income_tax | $2,089.7B | $2,400.0B | −12.9% | IRS SOI 2022 ~$2.22T; CBO 2024 projection ~$2.4T | -| eitc | $64.2B | $64.0B | +0.3% | IRS SOI 2023 (Table 2.5) | -| snap | $101.8B | $100.0B | +1.8% | USDA FNS FY2024 | -| ctc | $151.9B | $115.0B | +32.1% | IRS SOI 2023 (pre-OBBBA $2,000/qc) | -| ssi | $108.2B | $66.0B | +64.0% | SSA SSI Annual Statistical Report 2024 | -| aca_ptc | $14.1B | $60.0B | −76.4% | CMS/IRS ACA PTC 2024 (IRA-enhanced) | - -## Reading - -- **Within ±15%** of benchmark: income_tax (−12.9%), eitc (+0.3%), snap (+1.8%). The tax-mechanics chain and the two largest means-tested programs reconcile to published totals once calibrated weights are applied. -- **Elevated +30% to +65%**: ctc and ssi. ctc = 32% above IRS SOI suggests either more qualifying children per household than IRS counts, or the synthesis pulled CTC-eligible families with higher frequency than the population-level CTC claim rate; ssi at +64% is the cleanest outlier and points to either over-representation of the aged / disabled low-income subpopulation or a missed means-test gate in the synthesis-then-materialize step. -- **Under at −76%**: aca_ptc. The `has_marketplace_health_coverage` flag is in the synthesis target set, but the reconciled PTC depends on a policy-output chain (MAGI, federal poverty line, premium contribution). Either marketplace enrollment is under-represented at the income bands where PTC is largest, or the IRA-enhanced subsidy schedule isn't firing as it does in production IRS data. - -## Interpretation for the paper's B2 section - -Three headline aggregates reconcile within single-digit or low-teens relative error. The three that don't (ctc, ssi, aca_ptc) are individually diagnosable — each points to a specific shortfall in the synthesis step rather than a structural problem in the calibration framework. 
A follow-up calibration pass can add direct targets on these aggregates (CTC disbursed, SSI disbursed, ACA PTC disbursed) to drive them in. - -The income_tax reconciliation at −12.9% is the most important single number: it's the paper's headline claim that the calibrated synthesis produces a PolicyEngine-US-readable frame whose downstream tax-output reconciles to IRS administrative totals within a credible tolerance. - -## Reproduction - -```bash -# All variables except income_tax and aca_ptc fit in the full-dataset path: -for var in ssi snap eitc ctc; do - .venv/bin/python -u scripts/run_b2_validation_single_var.py \ - --dataset
--output --variable "$var" --period 2024 -done - -# income_tax and aca_ptc need batching to avoid 30+ GB peak RSS: -.venv/bin/python -u scripts/run_b2_batched.py \ - --dataset
--output --variable income_tax \ - --period 2024 --batch-size 50000 - -.venv/bin/python -u scripts/run_b2_batched.py \ - --dataset
--output --variable aca_ptc \ - --period 2024 --batch-size 100000 -``` diff --git a/docs/benchmarking.md b/docs/benchmarking.md deleted file mode 100644 index 52da096c..00000000 --- a/docs/benchmarking.md +++ /dev/null @@ -1,143 +0,0 @@ -# Benchmarking - -The benchmark question is: - -> Is Microplex closer to the real target DB than `policyengine-us-data` is? - -## What is truth - -Truth is the active target set loaded from the PE-US targets DB. - -Main provider: - -- `microplex_us.policyengine.PolicyEngineUSDBTargetProvider` - -The baseline dataset is not truth. It is only the incumbent comparator. - -## What PolicyEngine does - -`policyengine-us` is the shared measurement operator. - -Both: - -- the Microplex candidate dataset -- the `policyengine-us-data` baseline dataset - -are run through the same PE-US variable materialization and the same target -compiler before being compared to the same targets. - -So the benchmark shape is: - -`dataset -> policyengine-us -> implied aggregates -> compare to target DB` - -## Current default harness - -Default saved-build evaluation now uses: - -- the full active PE-US target estate -- one `all_targets` slice - -Main files: - -- `src/microplex_us/policyengine/harness.py` -- `src/microplex_us/policyengine/comparison.py` - -## Main metrics - -Per run: - -- `candidate_composite_parity_loss` -- `baseline_composite_parity_loss` -- `candidate_mean_abs_relative_error` -- `baseline_mean_abs_relative_error` -- `target_win_rate` -- `supported_target_rate` - -The frontier metric is currently: - -- `candidate_composite_parity_loss` - -This is a diversity-aware outer loss over the target set rather than a raw -target-count-weighted mean alone. - -## Saved outputs - -Every serious saved run can write: - -- artifact bundle directory -- `policyengine_harness.json` -- `run_registry.jsonl` -- `run_index.duckdb` -- `pe_native_target_diagnostics_current.json` - -These live under the selected artifact root. 
- -## Diagnostics dashboard - -The repo includes a static dashboard at `dashboard/` for inspecting the full -PE-native target diagnostic dataset. It expects the JSON payload written by: - -```bash -microplex-us-pe-native-target-diagnostics \ - --from-dataset /path/to/enhanced_cps_2024.h5 \ - --to-dataset /path/to/policyengine_us.h5 \ - --policyengine-targets-db /path/to/policy_data.db \ - --output-path artifacts/pe_native_target_diagnostics_current.json -``` - -The JSON includes full per-target rows, family summaries, scope summaries, top -improvements, top regressions, and target DB match metadata when a structured -PolicyEngine target DB is available. The dashboard loads that default artifact -when served from the repo root, and can also load an arbitrary diagnostic JSON -from disk. - -## Inspecting runs - -Useful Python APIs: - -- `select_us_microplex_frontier_entry(...)` -- `select_us_microplex_frontier_index_row(...)` -- `list_us_microplex_target_delta_rows(...)` -- `compare_us_microplex_target_delta_rows(...)` - -The last helper is meant for questions like: - -- what changed between two broad runs? -- which targets improved under a source-policy change? -- which target families regressed even when overall loss improved? - -## Current broad reference point - -As of March 27, 2026, the best recorded broad `national + state` `CPS+PUF` -frontier in the main artifact root was: - -- artifact id: `cps_puf_500_native_wages` -- candidate composite parity loss: `0.8906` -- baseline composite parity loss: `4.5412` -- candidate mean absolute relative error: `0.9928` -- baseline mean absolute relative error: `1.1920` - -That does **not** mean Microplex is already better on most targets. The same run -had a low `target_win_rate`, meaning the gain comes from improving the overall -loss surface rather than beating the incumbent on a majority of individual -targets. - -## Important caveats - -- This is parity evaluation, not held-out evaluation. 
-- Calibration and evaluation still overlap unless explicitly separated in build - config. -- A broad win on the composite loss is not the same thing as a majority-target - win. -- Local-area production parity is not finished yet. - -## Repro pattern - -Broad versioned builds use: - -- `build_and_save_versioned_us_microplex(...)` -- `build_and_save_versioned_us_microplex_from_source_provider(...)` -- `build_and_save_versioned_us_microplex_from_source_providers(...)` - -The resulting run can then be inspected through the JSON artifacts or via the -DuckDB index. diff --git a/docs/calibrate-on-synthesizer-result.md b/docs/calibrate-on-synthesizer-result.md deleted file mode 100644 index d5e2dc54..00000000 --- a/docs/calibrate-on-synthesizer-result.md +++ /dev/null @@ -1,68 +0,0 @@ -# Calibrate-on-synthesizer result — does `microcalibrate` rescue weak synthesis? - -*Third robustness check on the stage-1 synthesizer ordering, this time at the weighted-aggregate level instead of PRDC coverage.* - -## Setup - -20,000 rows × 50 columns of real enhanced_cps_2024 (16k train / 4k holdout). For each method: - -1. Fit, generate synthetic records with unit weights. -2. Initial weight rescale so synthetic totals roughly match holdout-scale (drops gradient descent's starting point near the target). -3. Build one `LinearConstraint` per target column requiring weighted synthetic sum to match holdout sum. -4. Run `MicrocalibrateAdapter.fit_transform` with 200 epochs, lr 1e-3. -5. Report mean relative error across target columns before and after calibration. 
- -## Results (post-snap-fix rerun with 500 epochs, 2026-04-17 21:17) - -| Method | Pre-cal mean rel err | Post-cal mean rel err | Max post-cal err | Cal time | -|---|---:|---:|---:|---:| -| **ZI-QRF** | 0.317 | **0.105** | 1.000 | 1.1 s | -| ZI-QDNN | 0.386 | 0.251 | 1.002 | 0.6 s | -| ZI-MAF | 17.51 | 11.86 | 168.3 | 0.6 s | - -Reading: after calibration, ZI-QRF's weighted synthetic aggregates are within 10.5 % of the holdout targets on average. ZI-QDNN is at 25.1 %. ZI-MAF is at **1,186 %** — the synthetic output is so far off target scale that calibration can't pull it back, even with 500 epochs of gradient descent. - -Pre-snap numbers at 200 epochs (archived as `artifacts/calibrate_on_synthesizer.pre-snap.json`) gave ZI-QRF post-cal 0.141, ZI-QDNN 0.327, ZI-MAF 15.08. The bump to 500 epochs + the snap fix both help; ordering and qualitative conclusion are unchanged. - -## What this tells us - -1. **Calibration doesn't rescue a broken synthesizer.** The hope was that `microcalibrate` could compensate for poor synthesis by adjusting weights. For ZI-QRF it cuts the error by about two-thirds (0.317 → 0.105); for ZI-MAF it shaves about a third off a 1,751 % starting error and the final answer (1,186 %) is still uselessly wrong. Calibration works on starting points that are close enough; ZI-MAF isn't. - -2. **ZI-MAF's failure is not about weighting.** An earlier hypothesis was that ZI-MAF's low PRDC coverage might be acceptable if weighted calibration patched the aggregates. Falsified. The synthesizer produces samples so far from target mass that no weight adjustment can make them match aggregates. - -3. **ZI-QRF's synthesis is the right STRUCTURE to calibrate.** Calibration dropping error from 0.317 → 0.105 on ZI-QRF output means the raw samples are structurally close to real; weights just need to shift them. ZI-QDNN's output is roughly in the right ballpark but less clean (0.386 → 0.251). - -4. 
**`max` relative error stays ~1.0 across all three for post-cal.** This is because at least one constraint (typically a rare-cell target like `disabled_ssdi`) is still missed entirely (relative error ≈ 1.0) — the zero-cell problem from stage-1 hasn't been addressed, it just doesn't dominate the *mean*. - -## Calibration convergence note - -200 epochs at lr=1e-3 with default `microcalibrate` settings does not fully converge these problems. The loss trajectory shows steady improvement until the last reported epoch. For a production run, epochs should probably be 500-1000 to reach the calibration's 5 % relative-error bound. - -At production scale (1.5 M records × 1255 constraints), the per-epoch step is cheaper per-record but there are vastly more records to move, so even 500-1000 epochs may leave some constraints unsolved. The `MicrocalibrateAdapterConfig.epochs` default of 32 is too low; the `us.py` wiring uses `max(self.config.calibration_max_iter, 32)` which pulls from the pipeline's `calibration_max_iter=100`. Reasonable starting point; tune up if convergence is still incomplete. - -## Four-way agreement on synthesizer ordering (post-snap-fix) - -Combined evidence with the upstream shared-col noise fix applied: - -| Check | ZI-QRF | ZI-QDNN | ZI-MAF | -|---|---|---|---| -| Raw 50-d PRDC at 40k (snap) | 0.979 (winner) | 0.796 | 0.168 | -| Raw 50-d PRDC at 77k (snap) | 0.928 (winner) | 0.707 | 0.106 | -| Embed 16-d PRDC at 40k (snap) | 0.984 (winner) | 0.819 | 0.201 | -| ZI-MAF tuned (wide+long, 40k, pre-snap) | — | — | 0.033 | -| Calibrate-on-synth post-cal mean err (20k, snap) | 0.105 (winner) | 0.251 | 11.86 | - -Every axis, every scale, every metric: **ZI-QRF > ZI-QDNN > ZI-MAF**. - -## Production implication - -- **G1 cross-section synthesizer default**: ZI-QRF. This is the fourth independent confirmation. -- **Calibration stack**: `MicrocalibrateAdapter` at the default adapter settings is fine for ZI-QRF output (error 0.317 → 0.105 in ~1 s on 16 k records). 
Bump `calibration_max_iter` to 500 or 1000 in the pipeline config for the production run to wring out the last few percent of residual error. -- **Neural synthesizers**: not producing structures that calibration can rescue at the default architectures. They need joint-target and joint-zero-mask modeling before being reconsidered for production. - -## Artifacts - -- `artifacts/calibrate_on_synthesizer.json` — full per-method, per-target pre- and post-cal error breakdown. -- `artifacts/calibrate_on_synthesizer.log` — full run log with calibration loss trajectory per method. - -Reproduction: `uv run python scripts/calibrate_on_synthesizer.py --n-rows 20000 --calibration-epochs 200`. ~3 minutes wall time on a 48 GB M3. diff --git a/docs/calibrator-decision.md b/docs/calibrator-decision.md deleted file mode 100644 index 0ef05904..00000000 --- a/docs/calibrator-decision.md +++ /dev/null @@ -1,175 +0,0 @@ -# Calibrator decision - -*Decided: 2026-04-16. Applies to `spec-based-ecps-rewire` and every microplex-us pipeline that follows.* - -## Context - -Three calibration systems exist in the microplex / PolicyEngine ecosystem: - -| System | Location | Method | Scale notes | -|---|---|---|---| -| `microplex.calibration.Calibrator` | microplex core, ~2011 lines | Classical IPF / chi-square / entropy balancing, with `LinearConstraint` for explicit constraint rows | Entropy backend just killed v6 at 1.5M households | -| `microplex.reweighting.Reweighter` | microplex core, 506 lines | Sparse L0/L1/L2 with scipy and cvxpy backends | Unused in production; designed for geographic-hierarchy reweighting; enforces sparsity by construction | -| `microcalibrate` | PolicyEngine external package | Gradient-descent chi-squared with soft penalties and optional feasibility filtering | Used by PE-US-data for its main calibration; has production track record | - -v6 died inside `Calibrator.fit_transform(..., backend="entropy")` on a 1.5M-household frame. 
The underlying problem is not the Calibrator code — it is that entropy calibration instantiates dense-ish structures at `(n_households × n_constraints)` scale, and with ~1,255 constraints that exceeds what a 48 GB machine can hold once scratch memory is included. - -## Decision - -**Mainline calibrator for all production runs: `microcalibrate` (gradient-descent chi-squared).** - -**Optional sparse deployment selector applied *after* mainline calibration: `microplex.reweighting.Reweighter` with L0/HardConcrete backend**, used only when a deployment artifact (web app, embedded tool) needs a ~50k-record subsample of a national build. - -**Retire for production use: `microplex.calibration.Calibrator` with `backend="entropy"` at scales above ~200k records.** The classical Calibrator's IPF and chi-square backends stay available for small-scale work, diagnostics, and test harnesses where their explicit constraint semantics are convenient. - -## Why `microcalibrate` and not core `Calibrator` - -1. **Identity preservation.** `microcalibrate` adjusts per-record weights via gradient descent without materializing dense constraint Jacobians. Every input record survives to the output with a new weight. The rearchitecture's longitudinal extension (SS-model) requires stable entity identity across years; identity-preservation cannot be negotiable. -2. **Scalability at the target scale.** `microcalibrate` is the calibration stack PE-US-data actually uses for production enhanced-CPS builds at full scale. v6's death at 1.5M is direct evidence the entropy path doesn't scale; `microcalibrate`'s gradient-descent pattern does. -3. **Soft-penalty feasibility handling.** The 2026-03-30 review flagged that v2's calibration dropped 65 % of constraints as infeasible and then scored against the full target set, producing a systematic loss inflation. `microcalibrate` supports soft penalty weights on targets the solver cannot feasibly hit, giving principled rather than binary drop behavior. -4. 
**External track record.** The SS-model methodology doc explicitly names `microcalibrate` as the calibration tool for the longitudinal extension. Picking it now aligns cross-section with the planned longitudinal path. - -## Why `Reweighter` stays as a post-mainline optional stage - -1. **L0 sparsity serves deployment, not accuracy.** The right use of L0 is to produce a small subsample of a well-calibrated national dataset for constrained deployment targets (web app UI, mobile, static hosting). It is the wrong tool for "calibrate to hit targets" because it sacrifices exact match for sparsity. -2. **Apply after, not instead of, the mainline.** The mainline run produces ~1.5M records with adjusted weights. If a deployment needs 50k records, apply `Reweighter` with appropriate L0 λ as a second pass. The mainline artifact remains the ground-truth output for analysis. -3. **`SparseCalibrator` + `HardConcreteCalibrator` analysis on the `codex/core-semantic-guards` paper work showed HardConcrete dominates the sparse-calibration Pareto frontier**, so when the sparse step does run, HardConcrete is the preferred backend. Core already ships this with multi-seed evaluation. - -## Why `Calibrator` is retired at scale - -1. v6 proves `Calibrator(backend="entropy")` OOMs at 1.5M × 1.2k-constraint scale on a 48 GB workstation. v4 proved it at 1.5M × similar scale. -2. No architectural fix is cheap. To make entropy work at that scale we would have to rewrite the backend to use sparse constraint matrices and streaming gradient, which is effectively reimplementing `microcalibrate`. -3. `Calibrator` stays available and useful for small-scale test harnesses. It is still the right tool for `n < ~200k`, for unit tests of the calibration layer, and for explicit-constraint diagnostics (the `LinearConstraint` API is clean). - -## Implementation implication - -The rewired pipeline in `spec-based-ecps-rewire` will import `microcalibrate` as a real dependency (not optional). 
This is a net-new dependency on microplex-us. The audit entry that proposed "retire `microcalibrate` if `Calibrator` covers the scalability requirement" is overruled by v6's evidence. - -## Calibration architecture, in order - -``` -raw seed data ─► donor integration ─► seed_ready - │ - ▼ - synthesize (seed backend = copy) - │ - ▼ - support enforcement - │ - ▼ - policyengine entity tables (households, persons, tax_units, ...) - │ - ▼ - ┌──────────────────┴──────────────────┐ - │ MAINLINE (every run) │ - │ microcalibrate.Calibrator │ - │ - chi-squared distance │ - │ - gradient descent │ - │ - soft penalty for infeasibles │ - │ - preserves all record IDs │ - │ │ - │ Hierarchical in later phases: │ - │ national → state → stratum │ - └───────────────────┬─────────────────┘ - │ - ▼ - calibrated artifact (full scale) - │ - ▼ - ┌───────────────────┴─────────────────┐ - │ OPTIONAL SPARSE DEPLOYMENT STEP │ - │ microplex.reweighting.Reweighter │ - │ - L0 / HardConcrete │ - │ - deployment-scale subsample │ - │ Only when a deployment artifact │ - │ needs to be small. │ - └─────────────────────────────────────┘ -``` - -## Hierarchical calibration — separate decision, deferred - -This decision only picks the calibration *backend*. Hierarchical geographic calibration (national → state → stratum, with spatial smoothness priors, optional Fay-Herriot small-area composites) is a structure layered on top of `microcalibrate` and will be decided in its own doc at the start of the local-area gate (G2). Cross-section gate (G1) calibrates at national scale first. - -## Does this close out the three-way overlap? - -Yes, operationally: - -- Production runs: `microcalibrate`. -- Deployment subsampling: `Reweighter`. -- Tests and small-scale diagnostics: `Calibrator`. -- No single-pipeline run crosses all three. Each tool has a distinct and non-overlapping job. 
- -## Empirical support: sparse selection annihilates rare subpopulations - -The single cleanest empirical argument for this split comes from -`microplex/benchmarks/results/sparse_coverage.csv`. Measuring rare-subpopulation -preservation at varying sparsity levels (lower `coverage_median` = closer to -oracle): - -| Method | `coverage_median` | elderly_selfemp_ratio | young_dividend_ratio | -|---|---:|---:|---:| -| Oracle (full) | 0.009 | 0.94 | 1.11 | -| Generative (10%) | 0.53 | 27.7 | 20.6 | -| Generative (2%) | 0.42 | 22.1 | 32.3 | -| Generative (1%) | 0.25 | 7.2 | 1.7 | -| Weighted (10%) | 0.24 | **0.00** | **0.00** | -| Weighted (2%) | 0.35 | 0.02 | **0.00** | -| Weighted (1%) | 0.65 | **0.00** | **0.00** | - -Sparse L0 weighting drops rare subpopulations to **zero representation** at -every sparsity level tested. Generative synthesis preserves them at 7–30× the -oracle ratio. For policy analysis, where rare subpopulations (elderly -self-employed, young dividend earners, disability recipients, top-1% earners) -drive outsized fiscal and distributional effects, sparse-as-mainline is -non-viable on accuracy grounds alone. - -This empirical pattern reinforces the decision above: L0/sparse selection is a -**post-calibration deployment tool**, not a calibration method. Apply it after -the mainline `microcalibrate` run has produced a fully-covered adjusted-weight -artifact, and only when a downstream consumer needs a small subsample. - -### Scale caveat - -`sparse_coverage.csv` was produced on **10,000-row synthetic data with ~7 -variables**. Production scale is 1.5M rows × 150+ variables on real joint -microdata. We should not assume the 20–30× generative-vs-weighted gap holds at -that scale — the absolute numbers will shift, and rare-subpopulation -preservation may tighten for both methods. What is expected to hold is the -structural pattern: sparse L0 exactly zeros out records, generative synthesis -does not. 
The argument against sparse-as-mainline survives any plausible -scale-up because the failure mode (zero representation of rare cells) is not a -noise issue, it is mathematically baked into L0 selection. - -## What this unblocks - -- Migration step 2 of `docs/core-wiring-audit.md`: "Adopt `Calibrator` end-to-end" is revised to "Adopt `microcalibrate` end-to-end as the production calibrator." That becomes the first real code change in `spec-based-ecps-rewire`. -- The rewired cross-section pipeline can start being written against a concrete calibration contract. - -## Revisit conditions - -Revisit this decision if any of the following becomes true: - -1. A benchmark shows `microcalibrate` produces materially worse loss than a refactored `Calibrator` on representative constraint matrices. (Unlikely — PE uses it successfully.) -2. Licensing / availability of `microcalibrate` becomes a blocker for external consumers of microplex-us. (Mitigate by forking the needed subset into microplex core.) -3. The SS-model longitudinal extension requires a calibration primitive that `microcalibrate` does not provide (e.g., explicit spatial smoothness, per-year temporal regularization). Add the primitive at microplex level rather than swapping backends. - -## Update 2026-06-04: optimizers added since the original decision - -Two optimizer paths landed after this doc was written and were not covered above. Recording them here so the optimizer landscape is legible in one place. - -### `pe_l0` backend (eCPS-parity L0) -`calibration_backend="pe_l0"` selects `microplex_us.pipelines.pe_l0.PolicyEngineL0Calibrator`, a lazy wrapper around PE-US-data's `fit_l0_weights` (the incumbent enhanced-CPS L0 sparse solver). Added to run the MP build under the same calibration the published eCPS uses, for apples-to-apples comparison. 
- -### PE-native-loss refit / scoring — `optimize_pe_native_loss_weights` -`microplex_us.pipelines.pe_native_optimization.optimize_pe_native_loss_weights` refits household weights **directly on the PolicyEngine-native target loss** (used in the eCPS-replacement symmetric-refit comparison and scoring — *not* the dataset build). - -- **Algorithm: projected (proximal) gradient descent** — least-squares PE-native loss, gradient step, project onto the nonnegative budget simplex, Lipschitz step size + backtracking line search, monotone-descent acceptance. -- **Naming caveat:** commit history ("Add robust PE-native loss and APG refit") and earlier notes call this "APG". That is a **misnomer** — there is no Nesterov momentum/extrapolation term, so it is **plain projected GD, not accelerated proximal gradient**. Do not assume acceleration from the name. - -### Current optimizer landscape (one place) -| Job | Optimizer | Method | -|---|---|---| -| Dataset-build weight calibration (mainline) | `microcalibrate` / `HardConcreteCalibrator` | gradient-descent chi-squared + optional L0 (HardConcrete) | -| Dataset-build (small-scale / diagnostics) | `microplex.calibration.Calibrator` | entropy balancing (scipy KL) / IPF (raking) / chi-square | -| Dataset-build (eCPS-parity) | `pe_l0` → `PolicyEngineL0Calibrator` | PE-US-data `fit_l0_weights` (L0 sparse) | -| eCPS-replacement refit / scoring | `optimize_pe_native_loss_weights` | projected (proximal) gradient descent on the PE-native loss ("APG" in commits = misnomer; not accelerated) | diff --git a/docs/core-wiring-audit.md b/docs/core-wiring-audit.md deleted file mode 100644 index 2a10d5dc..00000000 --- a/docs/core-wiring-audit.md +++ /dev/null @@ -1,253 +0,0 @@ -# Core wiring audit - -*Snapshot: 2026-04-16. Audit of `microplex` core against the H+ rearchitecture proposal for `microplex-us`.* - -## TL;DR - -The architectural thinking already happened. 
`microplex` core has ~80% of the primitives the rearchitecture needs — most of them unused. `microplex-us` has grown a parallel set of donor-block, calibration, and entity-table machinery that duplicates what core already provides. - -The project is **wire + complete + deprecate**, not **design + build**: - -1. Wire `microplex-us` pipelines to use existing core primitives. -2. Complete half-baked primitives where "thought went in" but production-readiness did not. -3. Deprecate `microplex-us` duplicates as each replacement lands. - -**Blocker:** `microplex` core is on a stale `codex/core-semantic-guards` branch (last commit 2026-04-02) with ~200 uncommitted/deleted files. Nothing destructive should land in core until that state resolves. - -## What exists in core (status by category) - -Legend: - -- **WIRED** — used by microplex-us today -- **READY** — implemented, untested in production, no obvious gaps -- **PARTIAL** — implemented with gaps or known rough edges -- **PROTOTYPE** — substantial design but probably needs finishing for production -- **UNKNOWN** — needs hands-on testing to classify - -### Spec primitives (`microplex.core`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `Period`, `PeriodType` | `core/periods.py` | READY | Pydantic. DAY/MONTH/QUARTER/YEAR arithmetic and containment. microplex-us does not use. | -| `EntityType` | `core/entities.py` | WIRED | — | -| `SourceArchetype`, `TimeStructure`, `Shareability` | `core/sources.py` | WIRED | Country-agnostic source taxonomy. `LONGITUDINAL_SOCIOECONOMIC`, `PANEL`, `EVENT_HISTORY` values already defined. | -| `SourceProvider`, `SourceQuery`, `ObservationFrame` | `core/sources.py` | WIRED | — | -| `SourceManifest` | `core/source_manifests.py` | WIRED | — | -| `FrameSemanticTransform` | `core/semantics.py` | PARTIAL | Declarative frame transforms with POST_SYNTHESIS / POST_IMPUTATION / POST_DONOR_INTEGRATION / POST_CALIBRATION / POST_EXPORT stages. 
microplex-us imports the module but coverage is unclear. | -| `SourceVariableCapability` | `core/variables.py` | WIRED | — | - -### Transitions (`microplex.transitions`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `Mortality` | `transitions/mortality.py` | PROTOTYPE | Hardcoded SSA 2021 period life tables (male + female qx arrays for ages 0–119). US-specific data in a "generic" module — likely belongs in microplex-us or a country-pack seam. | -| `MarriageTransition`, `DivorceTransition` | `transitions/demographic.py` | PROTOTYPE | Hardcoded rate tables from CPS/ACS. Same US-specificity concern. | -| `DisabilityOnset`, `DisabilityRecovery`, `DisabilityTransitionModel` | `transitions/disability.py` | PROTOTYPE | Hardcoded SSA DI rates. | - -**Decision point:** the hardcoded US rates in `microplex.transitions` violate the core/country split. Either (a) move these to microplex-us and leave core as pure interface, or (b) make the rate tables pluggable with country-specific providers. - -### Neural trajectory models (`microplex.models`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `TrajectoryTransformer` | `models/trajectory_transformer.py` | PROTOTYPE | Autoregressive Transformer for panel synthesis. ZI-QDNN candidate per SS-model docs. | -| `TrajectoryVAE` | `models/trajectory_vae.py` | PROTOTYPE | — | -| `SequenceSynthesizer` | `models/sequence_synthesizer.py` | PROTOTYPE | Variable-length sequence synthesizer. | -| `PanelEvolutionModel` | `models/panel_evolution.py` | PROTOTYPE | **Unified autoregressive replacement** for separate `transitions/*` classes. Docstring explicitly frames it as the replacement: `state[t+1] ~ state[t], state[t-1], ..., X`. | -| `BaseSynthesisModel`, `BaseTrajectoryModel`, `BaseGraphModel` | `models/base.py` | PROTOTYPE | Abstract bases. | - -**Decision point:** `transitions/*` classes and `PanelEvolutionModel` overlap. 
If `PanelEvolutionModel` is the intended canonical form, the separate transitions either become (a) feature-engineering helpers for it, or (b) deleted. Right now both coexist, neither is wired, and microplex-us uses neither. - -### Fusion (`microplex.fusion`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `FusionPlan`, `VariableCoverage` | `fusion/planning.py` | WIRED | microplex-us already uses for planning. Good design: tracks source-by-variable coverage, shareability, time structure. | -| `MaskedMAF` | `fusion/masked_maf.py` | PROTOTYPE | Masked normalizing flow over stacked multi-survey data with per-record observed masks. Country-agnostic. | -| `MultiSourceFusion` | `fusion/multi_source_fusion.py` | PROTOTYPE | Per-source + cross-source + unified three-model pipeline. Direct alternative to microplex-us's donor-block system. | -| `harmonize_surveys`, `stack_surveys`, `COMMON_SCHEMA` | `fusion/harmonize.py` | PROTOTYPE | CPS/PUF-specific mappings baked in — needs generalization before being called "core". | -| `FusionSynthesizer`, `FusionConfig`, `FusionResult` | `fusion/pipeline.py` | PROTOTYPE | High-level convenience over MaskedMAF. | - -**Decision point:** the `harmonize.py` `COMMON_SCHEMA` has US-specific variable names. Either move to microplex-us or make the mappings country-configurable. - -### Calibration (three modules, overlapping) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `Calibrator` (IPF, chi-square, entropy) | `calibration.py` (2011 lines) | WIRED | Core calibration class. Classical survey calibration. | -| `LinearConstraint` | `calibration.py` | WIRED | Explicit linear constraint rows. | -| `Reweighter` (L0/L1/L2 sparse) | `reweighting.py` (506 lines) | PROTOTYPE | Sparse L0/L1/L2 with scipy and cvxpy backends. Geographic hierarchy support. | -| `microcalibrate` (external) | PolicyEngine package | WIRED (via microplex-us callers externally) | PolicyEngine's gradient-descent chi-squared library. 
| - -**Decision point (load-bearing):** three calibrators partly cover the same problem. - -- **Recommendation:** `Calibrator` (classical, identity-preserving) is the mainline for the cross-section pipeline, because it preserves all entity IDs by construction. `Reweighter` is the **optional sparse deployment selector** applied *after* Calibrator to produce a web-app-sized subsample. `microcalibrate` stays as an external dependency only if it offers something `Calibrator` does not (gradient-descent scalability beyond ~1M rows?) — otherwise retire it. -- **Must settle before any wiring commit lands** because migration step 2 depends on choosing the mainline. - -### Hierarchical synthesis (`microplex.hierarchical`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `HouseholdSchema`, hierarchical household→person two-pass | `hierarchical.py` (1155 lines) | PROTOTYPE | Different meaning than "hierarchical calibration." This is two-pass synthesis: household skeleton first, then person attributes conditioned on household context. | -| `TaxUnitOptimizer` | `hierarchical.py` | WIRED | Already used by microplex-us. | - -### Geography (`microplex.geography`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `AtomicGeographyCrosswalk` | `geography.py` | WIRED | — | -| `GeographyProvider`, `StaticGeographyProvider` | `geography.py` | WIRED | — | -| `ProbabilisticAtomicGeographyAssigner` | `geography.py` | WIRED | — | -| `GeographyAssignmentPlan` | `geography.py` | WIRED | — | - -**Note:** US-specific GEOID constants (`STATE_LEN`, `COUNTY_LEN`, `TRACT_LEN`, `BLOCK_LEN`) are in core. Comment says "kept as compatibility constants" — probably deletable after UK port proves the abstraction is truly country-agnostic. - -### Generative building blocks - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `Synthesizer` | `synthesizer.py` (728 lines) | WIRED | Main conditional synthesis class. Uses normalizing flows. 
| -| `ConditionalMAF` | `flows.py` (526 lines) | PROTOTYPE | Conditional MAF normalizing flow primitive. | -| `DGP` learning | `dgp.py`, `dgp_methods.py` | UNKNOWN | Population data-generating-process learning from multiple partial surveys. Distinct from fusion; claims to be "not statistical matching" and "not imputation" but learn true joint. | -| `StatMatchSynthesizer` | `statmatch_backend.py` | PROTOTYPE | Wraps py-statmatch NND hot-deck. Useful for PUMS ↔ CPS graft. | -| `MultiVariableTransformer` | `transforms.py` | WIRED | — | -| `BinaryModel`, `DiscreteModelCollection` | `discrete.py` | WIRED | — | - -### Data sources (`microplex.data_sources`) - -| Source | Location | Country-appropriate? | Notes | -|---|---|---|---| -| `cps`, `cps_mappings`, `cps_transform` | `data_sources/cps.py` et al | **No** (US-specific) | Should move to microplex-us. | -| `puf` | `data_sources/puf.py` | **No** (US-specific) | Should move to microplex-us. | -| `psid` | `data_sources/psid.py` | **No** (US-specific) | Should move to microplex-us. | - -**Cleanup:** these three belong in `microplex-us/src/microplex_us/data_sources/` (where microplex-us already has its own `cps.py`, `puf.py`, etc.). Core has US-specific data loaders sitting in what should be a country-agnostic package. - -### Validation (`microplex.validation`) - -| Primitive | File | Country-appropriate? | Notes | -|---|---|---|---| -| `baseline` | `validation/baseline.py` | Likely generic | Needs review. | -| `soi` | `validation/soi.py` | **No** (US-specific) | Should move to microplex-us. 
| - -### Targets (`microplex.targets`) - -| Primitive | File | Status | Notes | -|---|---|---|---| -| `TargetSpec`, `TargetSet` | `targets/spec.py` | WIRED | — | -| `TargetProvider` protocol | `targets/provider.py` | WIRED | — | -| `TargetQuery` | `targets/provider.py` | WIRED | — | -| `assert_valid_benchmark_artifact_manifest` | `targets/artifacts.py` | WIRED | — | -| `rac_mapping`, `database`, `bundles`, `benchmarking` | `targets/*` | UNKNOWN | Need review. | - -## What microplex-us currently imports from core - -Used (from grep of imports): - -``` -microplex.calibration (Calibrator, LinearConstraint) -microplex.core (EntityType, ObservationFrame, SourceProvider, SourceQuery, - SourceManifest, SourceArchetype, SourceVariableCapability) -microplex.core.semantics (subset of exports) -microplex.fusion (FusionPlan only — not the actual fusion synthesizers) -microplex.geography (subset) -microplex.hierarchical (TaxUnitOptimizer) -microplex.synthesizer (Synthesizer base) -microplex.targets (TargetQuery, TargetSpec, TargetSet, - assert_valid_benchmark_artifact_manifest) -``` - -Unused but implemented in core: - -``` -microplex.transitions (all of it — Mortality, Marriage, Divorce, Disability) -microplex.models (all trajectory / panel evolution models) -microplex.fusion.MaskedMAF (neural fusion synthesizer) -microplex.fusion.MultiSourceFusion -microplex.fusion.harmonize (stack_surveys, harmonize_surveys) -microplex.reweighting (Reweighter — sparse L0) -microplex.statmatch_backend (StatMatchSynthesizer — for PUMS graft) -microplex.hierarchical (HouseholdSchema, hierarchical synthesis pipeline) -microplex.core.periods.Period (period axis) -microplex.data_sources.psid -microplex.dgp (DGP learning) -``` - -## Gaps — what genuinely needs to be built - -Against the H+ proposal, what is NOT already in core (in any form): - -1. **Identity-preserving calibrator protocol.** Concept only exists as a note; `Calibrator` and `Reweighter` are concrete classes with different contracts. 
A shared protocol that declares "output retains all input entity IDs" is missing. -2. **Spatial smoothness regularization** for local-area calibration. Neither `Calibrator` nor `Reweighter` currently penalizes weight differences across adjacent geographies. -3. **Fay-Herriot / composite estimator** for small-area estimation. Not present. -4. **Held-out target evaluation harness.** Calibrate-on vs validate-on split is not a first-class concept in the existing harness. -5. **Forbes backbone integration** for top-income records. PE is adding this upstream; microplex has no equivalent. -6. **`TemporalDonorSpec` unification.** `transitions/*` classes and `PanelEvolutionModel` are two overlapping takes; a reconciled canonical abstraction does not exist. - -Everything else in the H+ proposal is at minimum a PROTOTYPE in core. - -## Three-way calibration overlap — decision required - -``` -microplex.calibration.Calibrator classical: IPF / chi-square / entropy WIRED -microplex.reweighting.Reweighter sparse: L0 / L1 / L2 UNUSED -microcalibrate (external) gradient-descent chi-squared UNUSED -``` - -Recommended resolution: - -- **Mainline:** `Calibrator` (identity-preserving, classical). Used for every production calibration. -- **Optional sparse post-step:** `Reweighter` (L0). Applied after `Calibrator` when a deployment subsample is needed (e.g., 50k-record web app artifact). -- **Retire:** `microcalibrate` external dependency, unless benchmarking shows it does something `Calibrator` does not (e.g., gradient-descent scalability past ~1M rows on realistic constraint matrices). - -This choice is load-bearing for migration step 2. It needs a yes/no before any wiring commits land. 
- -## Migration order - -| # | Swap | Gate | Blocked by | -|---|---|---|---| -| 0 | Resolve `codex/core-semantic-guards` branch state in microplex | microplex core tree clean on main | — | -| 1 | Adopt `microplex.core.periods.Period` in microplex-us | microplex-us compiles with single period type | 0 | -| 2 | Adopt `Calibrator` end-to-end, retire staged solve_now/solve_later | Cross-section beats current checkpoint on PE-native loss | 0, calibrator decision | -| 3 | Adopt `MultiSourceFusion` + `MaskedMAF`; retire donor-block system | Neural fusion parity-evaluated vs block donors | 2 | -| 4 | Adopt `statmatch_backend` for ACS PUMS ↔ CPS graft | PUMA-level local scaffold exists | 3 | -| 5 | Adopt `Reweighter` as optional sparse deployment selector | 50k-record web-app artifact | 4 | -| 6 | Adopt `transitions/*` for Phase 2 trivial forward projection | 1-year forward projection runs | 5 | -| 7 | Consolidate `transitions/*` and `PanelEvolutionModel` into one canonical form | Unified AR model beats separate hazards on PSID validation | 6 | -| 8 | Adopt `TrajectoryTransformer` / `TrajectoryVAE` | Neural trajectory beats interval-specific QRF on age-earnings | 7 | - -Steps 1–2 alone could clear G1 (national cross-section beats ECPS). - -## Prerequisite cleanup (microplex core) - -Before any wiring commits land in core: - -1. **Review `codex/core-semantic-guards` branch** (last commit 2026-04-02). It has useful-looking work (semantic transforms, sparse calibration frontier analysis, referee feedback) but ~200 uncommitted/deleted files. Either: - - Land the useful pieces, or - - Hard-reset to clean origin/main and cherry-pick, or - - Abandon the branch and start fresh. -2. 
**Relocate US-specific code out of core:** - - `microplex/data_sources/cps*`, `puf.py`, `psid.py` → microplex-us - - `microplex/validation/soi.py` → microplex-us - - SSA hardcoded tables in `transitions/*` → microplex-us (or make pluggable) - - GEOID length constants in `geography.py` → microplex-us or private helper -3. **Delete the compatibility shims** in core root (`unified_calibration.py`, `target_registry.py`, `pe_targets.py`, `data.py`, `cps_synthetic.py`, `calibration_harness.py`) once all callers have migrated to microplex-us imports. Right now they stay as shims. - -## Risks - -1. **"Unused" ≠ "ready."** Every PROTOTYPE entry above likely has at least one production-blocking gap. Expect 20–40% of wiring effort to be "finish the core primitive" rather than "integrate." -2. **US-specific rates baked into "generic" core.** `transitions/*` has SSA life tables and CPS rates hardcoded at core level. Wiring microplex-us to those is easy; porting microplex-uk to them is impossible without first decoupling. -3. **Three-way calibrator overlap may hide performance differences.** Before choosing `Calibrator` as mainline, run one apples-to-apples benchmark against `Reweighter` and `microcalibrate` on a representative constraint matrix. -4. **`codex/core-semantic-guards` abandonment.** The stale branch may contain work that materially improves these primitives. Losing it to a hard-reset could waste thought. Reviewing before discarding is cheap insurance. - -## Concrete next actions - -1. Decide the codex branch's fate (land / rebase / abandon). -2. Settle the three-way calibrator question (benchmark or decision document). -3. Write PSID → ObservationFrame adapter in microplex-us data_sources (if not already done — needs check). -4. Prototype migration step 2 on a small slice: CPS + QRF via `MultiSourceFusion` + `Calibrator` → compare to current microplex-us pipeline at 2000-record smoke scale. -5. 
Once smoke passes, land step 1 (Period adoption) as the first wiring commit. - -## Provenance - -This audit reads core as of commit `71f270e` on branch `codex/core-semantic-guards` (microplex core). It does not execute any of the primitives, so READY / PARTIAL / PROTOTYPE classifications are based on interface inspection and file-size heuristics. Each classification needs empirical confirmation before commitment. diff --git a/docs/embedding-prdc-validation.md b/docs/embedding-prdc-validation.md deleted file mode 100644 index 45178ab6..00000000 --- a/docs/embedding-prdc-validation.md +++ /dev/null @@ -1,67 +0,0 @@ -# Embedding-PRDC validation — is the stage-1 ordering real? - -*Settles the open question flagged in `docs/synthesizer-benchmark-scale-up.md`: is PRDC in 50-dim raw feature space too noisy to trust? Answer: the ordering is preserved.* - -## Setup - -40,000 rows × 50 columns of real enhanced_cps_2024. Same setup as stage-1. - -Autoencoder: 50 → 64 → 64 → **16** → 64 → 64 → 50 (2 hidden layers encoder + decoder, ReLU activations). Fit on holdout only (not on synthetic) for 200 epochs, batch 256, lr 1e-3. Final reconstruction MSE loss: 0.054. - -For each method (ZI-QRF / ZI-MAF / ZI-QDNN) at default hyperparameters: fit on 32k train, generate 32k synthetic, compute PRDC on 15k/15k samples (capped) in both the raw 50-dim feature space and the 16-dim latent space. - -## Results (post-snap-fix rerun 2026-04-17 21:12) - -| Method | Raw-50 coverage | Raw-50 precision | Raw-50 density | Emb-16 coverage | Emb-16 precision | Emb-16 density | -|---|---:|---:|---:|---:|---:|---:| -| ZI-QRF | **0.982** | 0.914 | 0.908 | **0.984** | 0.943 | 0.935 | -| ZI-QDNN | 0.791 | 0.847 | 0.763 | 0.819 | 0.905 | 0.802 | -| ZI-MAF | 0.183 | 0.033 | 0.026 | 0.201 | 0.070 | 0.042 | - -**Ordering preserved in both spaces: ZI-QRF > ZI-QDNN > ZI-MAF.** - -### Pre-snap numbers (archived) - -The original run was executed before the shared-col categorical-noise -fix landed upstream. 
Those artifacts are preserved as -`artifacts/embedding_prdc_compare.pre-snap.json` and showed much lower -absolute PRDC coverages (ZI-QRF 0.348 raw / 0.309 embed), because -noise-injected integer conditioning variables reduced PRDC scores -uniformly across all methods. Ordering was preserved in both -pre-snap and post-snap regimes; only the absolute values shift. - -## Observations - -1. **The stage-1 verdict is not a metric artifact.** The concern in the scale-up protocol doc was that raw-feature PRDC in 50 dimensions concentrates distances and becomes noise-dominated. The embedding variant has 16 dimensions with more informative axes (learned from the data), which is where PRDC is known to behave best. The ordering is the same. So the 10× gap between ZI-QRF and ZI-MAF is a real quality gap, not a measurement artifact. - -2. **Precision rises in embedding space for all three methods.** The AE compresses noise: random synthetic variation that looked far from real records in 50-dim now falls near them in 16-dim. This improves precision and, in the post-snap regime, slightly raises coverage too (likely because the smaller latent dimension is easier to cover). - -3. **ZI-QRF's edge is close to the ceiling.** 0.982 raw → 0.984 embed — already near-perfect on holdout. ZI-QDNN rises modestly (0.791 → 0.819). ZI-MAF rises from 0.183 → 0.201. The gap narrows in absolute terms (ZI-QRF / ZI-MAF ratio 5.4× raw, 4.9× embed) but the ordering is invariant. - -4. **ZI-MAF is still structurally behind.** Even in the embedding space, ZI-MAF coverage is 0.201 — about a quarter of ZI-QDNN and a fifth of ZI-QRF. Hyperparameter tuning (see `docs/zi-maf-hyperparameter-search.md`) does not close this at the architectural level. - -## Interpretation - -The ZI-QRF / ZI-QDNN / ZI-MAF ranking is robust across: - -- **Scale**: small synthetic (10 k × 7) → 5 k × 50 real → 40 k × 50 real → 77 k × 50 real. -- **PRDC sample cap**: uncapped (8 k × 32 k) and capped (15 k × 15 k). 
-- **Feature space**: 50 raw features and 16 learned latent dimensions. - -That's three independent robustness checks. The production default for G1 cross-section synthesis is **ZI-QRF**. - -## One thing this does not settle - -Neither raw-50 nor embed-16 PRDC weighs rare cells more than bulk cells. The `sparse_coverage.csv` finding — sparse L0 selection drives rare-cell ratios to 0 — is a different failure mode that neither PRDC variant measures. That finding still drives the calibrator decision (microcalibrate as mainline, not sparse reweighting). Both findings hold independently. - -## Artifact - -`artifacts/embedding_prdc_compare.json` — full per-method raw and embed PRDC dicts. - -Reproduction: - -```bash -uv run python scripts/embedding_prdc_compare.py --n-rows 40000 --output artifacts/embedding_prdc_compare.json -``` - -~5 minutes on a 48 GB M3. diff --git a/docs/huggingface-artifact-publishing.md b/docs/huggingface-artifact-publishing.md deleted file mode 100644 index e536e0fb..00000000 --- a/docs/huggingface-artifact-publishing.md +++ /dev/null @@ -1,153 +0,0 @@ -# Hugging Face artifact publishing - -Microplex-US publishes completed artifact bundles to Hugging Face dataset repos. -The Hugging Face repos are the stable registry consumed by dashboards and -deployment tooling; local `artifacts/` paths are build outputs, not durable -interfaces. - -## Repositories - -Use dataset repos under the `policyengine` organization: - -- `policyengine/microplex-us-diagnostics` -- `policyengine/microplex-us-deployed-datasets` - -The diagnostics repo is the lightweight inspection registry. It stores loss -summaries, per-target diagnostics, audit sidecars, immutable run manifests, and -mutable discovery pointers. - -The deployed-datasets repo stores the heavier PolicyEngine H5 bundle. Uploads -land in a staging path first, then a validated run can be promoted to the repo -root as the current deployed dataset. 
- -## Repository layout - -Diagnostics files: - -```text -runs/<run_id>/manifest.json -runs/<run_id>/policyengine_native_scores.json -runs/<run_id>/pe_us_data_rebuild_native_audit.json -runs/<run_id>/pe_native_target_diagnostics.json -latest.json -run_registry.jsonl -``` - -Dataset files: - -```text -staging/<run_id>/policyengine_us.h5 -staging/<run_id>/manifest.json -policyengine_us.h5 -manifest.json -``` - -The `runs/<run_id>/...` and `staging/<run_id>/...` paths are immutable once a -run is published. Only `latest.json`, `run_registry.jsonl`, and the promoted -dataset root files are expected to change. - -## Dashboard contract - -Dashboards should discover the default diagnostics bundle from: - -```text -policyengine/microplex-us-diagnostics/latest.json -``` - -For reproducibility, dashboards should also support pinned diagnostics runs: - -```text -policyengine/microplex-us-diagnostics/runs/<run_id>/... -``` - -The current deployed PolicyEngine dataset is: - -```text -policyengine/microplex-us-deployed-datasets/policyengine_us.h5 -``` - -A pinned staged dataset is: - -```text -policyengine/microplex-us-deployed-datasets/staging/<run_id>/policyengine_us.h5 -``` - -## Local publishing - -Publish a full bundle after validation: - -```bash -export HUGGING_FACE_TOKEN=... -uv run --extra hf --python 3.13 microplex-us-publish-hf-artifacts \ - artifacts/.../ \ - --run-id <run_id> \ - --publish-dataset \ - --promote-dataset -``` - -Run without uploading: - -```bash -uv run --extra hf --python 3.13 microplex-us-publish-hf-artifacts \ - artifacts/.../ \ - --run-id <run_id> \ - --publish-dataset \ - --promote-dataset \ - --dry-run -``` - -The command writes `hf_publish_manifest.json` into the local artifact directory. -That file records the exact Hugging Face paths and operation counts that would -be committed or were committed. 
- -Smoke-check a published bundle: - -```bash -uv run --extra hf --python 3.13 microplex-us-smoke-hf-artifact \ - --run-id <run_id> -``` - -For staging-only dataset publishes that have not promoted root files yet: - -```bash -uv run --extra hf --python 3.13 microplex-us-smoke-hf-artifact \ - --run-id <run_id> \ - --no-promoted-dataset -``` - -## GitHub Actions publishing - -The `Publish Hugging Face Artifacts` workflow is a manual workflow. It accepts -either: - -- an Actions artifact name containing an unpacked bundle or an archive, or -- an archive URL pointing to a `.zip`, `.tar`, `.tar.gz`, or `.tgz` bundle. - -The workflow extracts the bundle, finds `manifest.json`, runs the focused -publisher tests, and invokes `microplex-us-publish-hf-artifacts`. -For real publishes, it then smoke-checks that the expected remote diagnostics -and dataset files are visible on Hugging Face. - -The workflow defaults to `dry_run: true`. To publish, set `dry_run: false` and -provide a writable Hugging Face token through the repository secret `HF_TOKEN`. - -Promotion is explicit: - -- `publish_dataset: true` uploads the H5 and manifest to - `staging/<run_id>/...`. -- `promote_dataset: true` also writes `policyengine_us.h5` and `manifest.json` - at the dataset repo root. - -## Validation before promotion - -Before promoting a run as current, verify: - -- `policyengine_us.h5` exists and loads with PolicyEngine-US. -- `manifest.json` points to all expected sidecars. -- `policyengine_native_scores.json` is real run output. -- `pe_us_data_rebuild_native_audit.json` is real run output. -- `pe_native_target_diagnostics.json` contains per-target rows. -- H5 weights are national scale for budget scoring. -- A PolicyEngine smoke test can compute representative baseline and reform - aggregates. -- The dashboard can read the diagnostics bundle and, where needed, the H5. 
diff --git a/docs/imputation-conditioning-contract.md b/docs/imputation-conditioning-contract.md deleted file mode 100644 index 9d4c9bdd..00000000 --- a/docs/imputation-conditioning-contract.md +++ /dev/null @@ -1,189 +0,0 @@ -# Imputation Conditioning Contract - -This document states the current execution rule for donor conditioning in -`microplex-us`. - -It is meant to answer three questions: - -1. Which parts of donor conditioning are conceptually required? -2. Which parts are still experimental tuning choices? -3. Which artifact files should we read to evaluate those choices? - -## Core rule - -Keep three layers separate: - -1. Structural contract - - what the donor block is trying to represent - - which entity the block lives on - - which variables are allowed to define support -2. Predictor-surface choice - - which compatible conditioning variables are actually used for one block -3. Downstream evaluation - - how the imputation choice propagates through synthesis, calibration, and - the PolicyEngine oracle - -Those layers interact, but they are not the same decision. - -## Conceptually required structure - -These are not optional shortcuts. They are the current conceptual contract. - -- Donor integration is block-based, not one flat shared-variable imputer. -- Each block has a native entity and an allowed conditioning-entity policy. -- Zero-inflated positive variables should preserve support, not just totals. -- Structural tax-unit roles matter. - - `is_tax_unit_head` - - `is_tax_unit_spouse` - - `is_tax_unit_dependent` - - `tax_unit_is_joint` - - `tax_unit_count_dependents` -- Variable semantics decide whether a quantity is atomic, derived, signed, - zero-inflated, or share-like. - -This is the layer where we should encode ideas like "dependents are a distinct -role in the tax-unit support process" or "dividend components should not be -treated as unrelated continuous totals." 
- -## Current production modes - -The current donor-conditioning modes are: - -- `all_shared` - - use every compatible shared predictor -- `top_correlated` - - score compatible shared predictors and keep the strongest subset -- `pe_prespecified` - - use a PE-style structural predictor backbone declared in variable semantics - - optionally admit a narrow supplemental shared set from the *actual* - compatible overlap -- `pe_plus_puf_native_challenger` - - keep the same PE structural predictor backbone - - for the explicitly marked problematic PUF tax-leaf blocks only, append a - narrow source-native raw-overlap set declared in semantics - - treat this as a non-default challenger lane, not as part of the PE-aligned - contract - -For the current PUF IRS tax-leaf family, PE alignment means the structural-only -path. The local `policyengine-us-data` -`policyengine_us_data/calibration/puf_impute.py` implementation trains the PUF -clone QRF on demographic / tax-unit-role predictors only, and the PUF source -capability policy intentionally blocks derived convenience columns like -`income`, `employment_status`, and synthetic `state_fips` from entering donor -conditioning. - -The important practical point is that `pe_prespecified` is not "use some hard -coded list no matter what." It still depends on what survives source -capabilities, semantic compatibility, entity projection, and prepared condition -surface construction. 
- -## What is structural vs experimental - -Structural: - -- donor block boundaries -- support family and donor match strategy -- native entity -- role-aware PE structural predictors -- semantic transforms/checks that prevent category errors - -Experimental: - -- whether `all_shared`, `top_correlated`, or `pe_prespecified` wins for a given - block family -- whether `pe_plus_puf_native_challenger` is worth keeping after a real - checkpoint comparison -- whether a particular variable should admit a - `supplemental_shared_condition_vars` set -- which compatible shared predictors should be let back into a PE-structured - block -- whether a condition surface should be widened upstream or left narrow - -Usually the failure mode has been treating an experimental choice as if it were -a structural truth, or vice versa. - -## What is not a real fix - -These can still be useful probes, but they should not be confused for upstream -imputation repairs: - -- late export-layer patches -- post-donor clipping/zeroing guards -- calibration-only improvements that hide unrealistic pre-calibration support - -The current working rule is: - -- if a patch improves only after calibration but worsens the pre-calibration - imputation evidence or the mission metric, it is not a clean imputation win - -## Evidence contract - -We read four artifact layers for imputation questions. - -### 1. Block-level conditioning evidence - -- `manifest.json` - - `synthesis.donor_conditioning_diagnostics` -- `python -m microplex_us.pipelines.summarize_donor_conditioning ` - -Use this first when the question is: - -- Which predictors did this donor block actually use? -- Which shared predictors were available but dropped? -- Did the block use a prepared PE-style condition surface? -- Did a requested predictor fail at raw overlap, projection, or prepared - compatibility? - -### 2. 
Pre-calibration imputation evidence - -- `imputation_ablation.json` - -Use this when the question is: - -- Which variant wins support realism? -- Which variant wins weighted MAE? -- Are we trading support realism against MAE? - -### 3. Full checkpoint parity evidence - -- `pe_us_data_rebuild_parity.json` -- `pe_us_data_rebuild_native_audit.json` - -Use these when the question is: - -- Did the candidate beat the incumbent on harness slices? -- Did it beat the incumbent on the native broad loss? -- Which target families regressed? - -### 4. Calibration trajectory evidence - -- `manifest.json` - - `calibration.full_oracle_capped_mean_abs_relative_error` - - `calibration.active_solve_capped_mean_abs_relative_error` - - deferred-stage summaries - -Use this when the question is: - -- Did calibration rescue the candidate? -- Did the change make the solve harder before any rescue happened? - -## Current read as of 2026-04-14 - -- Post-hoc dependent tax-leaf guards are not a satisfactory repair. - - they regress the mission metric -- A narrow PE-structured supplemental shared patch also failed as a real fix. - - the raw-gate diagnostics now show why: the PUF source policy blocks - `income`, `employment_status`, and synthetic `state_fips` from donor - conditioning before they ever reach live overlap for these tax-leaf blocks -- The local `policyengine-us-data` read resolves the PE-alignment question. - - PE's PUF clone QRF uses the structural demographic / tax-unit-role - predictors only for this family -- That means the next question is not "why did compatible overlap lose these - vars?" 
- - the real question is whether we want a challenger path with source-native - PUF predictors that survive source policy, or whether we keep the current - structural-only PE-aligned contract - -This is a better next question than "which post-hoc guard should we try next," -because it targets the actual modeling choice instead of clipping the output -after the fact or chasing a nonexistent overlap bug. diff --git a/docs/methodology-ledger.md b/docs/methodology-ledger.md deleted file mode 100644 index 4948b535..00000000 --- a/docs/methodology-ledger.md +++ /dev/null @@ -1,1862 +0,0 @@ -# Methodology Ledger - -This document is the living methods record for `microplex-us`. - -It is not the paper. It is the shortest place in the repo that should answer: - -- what the current canonical pipeline is -- what PolicyEngine is doing in that pipeline -- which methodological choices are considered canonical today -- which choices are explicitly provisional or challenger-only -- where the evidence for those choices is stored - -## Core framing - -`microplex-us` is not trying to literally recreate `policyengine-us-data`. - -Current framing: - -- `policyengine-us` is the shared measurement operator -- the active PE-US targets DB is the truth surface we score against -- `policyengine-us-data` is the incumbent comparator and interface reference -- `microplex-us` is an independent US data-construction runtime - -That means incumbent-compatibility work exists to improve attribution and -interface confidence, not to define the project as a wrapper around PE-US-data. - -## Claim separation - -We keep four claims separate: - -1. Architecture claim - - `microplex-us` is a cleaner, more modular, more auditable runtime. -2. Oracle-compatibility claim - - where important, Microplex matches or intentionally departs from incumbent - PE-US-data construction behavior. -3. Benchmark claim - - Microplex produces a better PE-ingestable dataset than the incumbent on - the active target estate. -4. 
Paper claim - - a stable narrative about methodology, evidence, and novelty that can be - defended externally. - -The first three live in code and artifacts now. The fourth should be written -from them later, not invented separately. - -## Methods snapshot - -### Snapshot as of 2026-04-10 - -This is the current working methods snapshot, not a claim of finality. - -| Area | Current reading | Status | Main evidence | -| --- | --- | --- | --- | -| Measurement contract | `policyengine-us` plus the active targets DB are the oracle. `policyengine-us-data` is the incumbent comparator. | `Canonical` | [benchmarking.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/benchmarking.md) | -| Runtime boundary | Microplex owns source loading, donor integration, synthesis, entity build, export, artifacts, and experiment tracking. PolicyEngine owns measurement/materialization at eval time. | `Canonical` | [architecture.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/architecture.md) | -| Incumbent-compatibility work | PE-style modes are used where they improve attribution or interface confidence, but they do not define the whole project. | `Canonical` | [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md) | -| Construction parity claim | Some construction layers are close or compatible, but general PE-construction parity is not yet established. | `Canonical` | [pe-construction-parity.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/pe-construction-parity.md) | -| Imputation evaluation | We currently track both support realism and MAE. Neither should be collapsed into a single unqualified "best" method. 
| `Canonical` | [pe_us_data_rebuild_parity.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_parity.json), [pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) | -| Current production imputation reading | `structured_pe_conditioning` is the support winner on the current checkpoint ablation; `top_correlated_qrf` is the MAE winner. | `Provisional` | [pe_us_data_rebuild_parity.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_parity.json), [pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) | -| Broad mission metric | The mission metric is PE-native broad loss frontier, but pre-calibration support evidence is retained so unrealistic imputations do not hide behind later weighting. | `Canonical` | [superseding-policyengine-us-data.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/superseding-policyengine-us-data.md), [pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) | -| Full-oracle loss accounting | `full_oracle_*` metrics now score the entire active targets DB, including explicit penalty mass for unsupported rows. Supported-only diagnostics remain separate. 
| `Canonical` | [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) | -| Calibration target planning | The active targets DB is one catalog, but calibration is staged and support-aware: rows are classified into `solve_now`, `solve_later`, or `audit_only` instead of forcing one flat solve. | `Canonical` | [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) | -| Current deferred calibration policy | Default PE-oracle rebuilds use a dense first pass plus two deferred passes at support `10` and `1`, each capped to 24 constraints, always consider those passes, and narrow them to the top 7 deferred families and top 4 deferred geographies. Within that focus, deferred stages spend capacity by row-level capped error first, then family/geography loss share, and each deferred pass is only kept if it improves capped full-oracle loss. After correcting the upstream EITC-recipient oracle semantics, the support-10 pass improved the matched `2000/2000` large no-donor run from `0.9729` to `0.9498`, the matched donor-inclusive large run from `0.9730` to `0.9502`, and the medium no-donor run from `1.0298` to `1.0291`. With the row-aware selector in place, the support-1 pass further improves the broader donor-inclusive run from `0.8783` to `0.8213`, the matched broader no-donor run from `0.8908` to `0.8362`, and the medium no-donor run from `1.0291` to `1.0029`. 
Widening deferred family focus from 3 to 4 then improves the broader donor-inclusive run again from `0.8213` to `0.7909`, the matched broader no-donor run from `0.8362` to `0.7996`, and the medium no-donor run from `1.0029` to `0.9969`. A fresh broader donor-inclusive checkpoint through the unmodified default entrypoint reproduces that `0.7909` result exactly. Widening deferred geographies from 4 to 8 on the same broader donor run then regresses capped full-oracle loss from `0.7909` to `0.7992`, so the geography focus should stay at `4`. Fixing raw PUF checkpoint sampling to respect `S006` weights then improves the broader donor-inclusive default again from `0.7909` to `0.7682` and the matched broader no-donor default from `0.7996` to `0.7683` without any calibration-policy change. After promoting the earnsplit-only PUF person-expansion default, widening deferred family focus from 4 to 7 improves the broader donor-inclusive run again from `0.7176` to `0.7045`, and the matched donor-free broader run from `0.7171` to `0.7040`, with the same focused family set including `aca_ptc` and `rental_income`. 
| `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_eitc_recipient_oracle_large_nodonors/large-nodonors-eitc-recipient-oracle-v2/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_nodonors/large-nodonors-age-agi-forced-stage2-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_donors/large-donors-age-agi-forced-stage2-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_medium_nodonors/medium-nodonors-age-agi-forced-stage2-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_default_stage2_large_donors/large-donors-default-stage2-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_donors/broader-donors-rowrank-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_nodonors/broader-nodonors-rowrank-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_donors/broader-donors-stage3-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_nodonors/broader-nodonors-stage3-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_nodonors/medium-nodonors-stage3-v1/manifest.json), 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_stage3_nodonors/medium-nodonors-default-stage3-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_donors/broader-donors-stage3-top4family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_nodonors/broader-nodonors-stage3-top4family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_top4family_nodonors/medium-nodonors-stage3-top4family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_top4family_nodonors/medium-nodonors-default-top4family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_default_top4family_donors_rerun/broader-donors-default-top4family-v2/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_geo8_donors/broader-donors-geo8-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_donors/broader-donors-puf-weight-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_nodonors/broader-nodonors-puf-weight-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_nodonors/broader-nodonors-puf-personexpansion-default-v2/manifest.json), 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_nodonors/broader-nodonors-puf-personexpansion-family7-v1/manifest.json) | -| Current checkpoint PUF sampling reading | Checkpoint-scale PUF sampling should respect raw `S006` weights before variable mapping rather than uniformly sampling raw PUF records. This is incumbent-alignment work, not a challenger method: it changes the checkpoint source sample so it better reflects the PUF weighting surface before any Microplex-specific synthesis or calibration logic. | `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_default_top4family_donors_rerun/broader-donors-default-top4family-v2/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_donors/broader-donors-puf-weight-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_nodonors/broader-nodonors-stage3-top4family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_nodonors/broader-nodonors-puf-weight-v1/manifest.json) | -| Current checkpoint CPS age-support sampling reading | Checkpoint-scale CPS sampling should guarantee at least one sampled household per observed `state x 5-year age-band` cell. This is also checkpoint-only incumbent-compatibility work: it does not change the full-data runtime, only the sampled source surface used in checkpoint experiments. 
On the matched broader donor run it improves capped full-oracle loss from `0.7682` to `0.7329`, and on the matched broader no-donor run from `0.7683` to `0.7368`. | `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_donors/broader-donors-puf-weight-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_weight_nodonors/broader-nodonors-puf-weight-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_nodonors/broader-nodonors-cps-stateage1-v1/manifest.json) | -| Current checkpoint donor age-support sampling reading | On donor-inclusive checkpoints, donor survey sampling should also guarantee at least one sampled household per observed `state x 5-year age-band` cell when a donor source exposes both state and age. This stays in the same checkpoint-only incumbent-compatibility bucket as the CPS age floor, but the effect is much smaller: on the matched broader donor run it improves capped full-oracle loss from `0.7329149849` to `0.7327632809` with the same selected-constraint count. | `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_donor_stateage1_donors/broader-donors-donor-stateage1-v1/manifest.json) | -| Current checkpoint CPS income-support sampling reading | Do not promote checkpoint CPS income-support floors yet. 
The household-income analogue clearly regressed the matched broader donor run from `0.7329` to `0.7554`, and the more PE-aligned tax-unit-income analogue was a near miss but still regressed the frontier metric from `0.7329` to `0.7372` even while improving uncapped full-oracle and active-solve loss. The accepted upstream checkpoint support change therefore remains the CPS `state x age-band` floor only. | `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_income_donors/broader-donors-cps-stateage1-income-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_taxunitincome_donors/broader-donors-cps-stateage1-taxunitincome-v1/manifest.json) | -| Current PUF person-expansion reading | Keep PE-style `EARNSPLIT` randomization in the PUF PE-demographics branch, but do not promote PE-style age-bin and spouse/dependent-sex randomization into the default path yet. The winning split-only version improves the matched broader donor checkpoint from `0.7327632809` to `0.7176041064`, while the age/sex-only version regresses it to `0.7463902007`. A later retest of the full age/sex path on top of the stronger family-7 broader donor default still regresses the mission metric from `0.7044626415` to `0.7111876263`, so this remains a rejected lane rather than an unresolved default question. This keeps the upstream income-splitting alignment that helps the frontier metric without forcing the age/sex piece that currently hurts checkpoint performance. 
| `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_ageonly_donors/broader-donors-puf-personexpansion-ageonly-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_earnsplitonly_donors/broader-donors-puf-personexpansion-earnsplitonly-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_donors/broader-donors-puf-personexpansion-default-v2/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_rng_donors/broader-donors-puf-personexpansion-rng-v1/manifest.json), [tmp_puf_source_stage_parity_personexpansion_20260412.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_puf_source_stage_parity_personexpansion_20260412.json) | -| Current post-fix residual reading | After the raw PUF weighting fix, the checkpoint CPS `state x age-band` floor, the earnsplit-only PUF person-expansion default, and the wider deferred family gate, ACA PTC and rental mass drop sharply, but the remaining capped-error mass is now led again by age, person AGI, tax-unit AGI, and EITC child-count families. The worst individual rows are still dominated by ACA amount and ACA-eligibility cells, with a thinner stored-input tail now mostly in tax-exempt interest and a few rental states. That keeps the next upstream lane on age/AGI/EITC structure rather than another broad calibration-policy sweep. 
| `Provisional` | [tmp_broader_puf_personexpansion_family7_donor_drilldown_20260412.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_broader_puf_personexpansion_family7_donor_drilldown_20260412.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_nodonors/broader-nodonors-puf-personexpansion-family7-v1/manifest.json) | -| Current stored-input tail reading | Keep the accepted interest/rental donor-conditioning change, reject the property-cost extension, and reject both export-side rental normalization and direct zero-support-mask propagation in zero-inflated donor rank matching. Both of the latter looked locally plausible, but fresh `2000/2000` large no-donor source checkpoints regressed capped full-oracle loss from `1.3274` to `1.3874` (export-side rental normalization) and to `1.9223` (zero-support-mask propagation), so the default path stays conservative here. 
| `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_oldsemantics/smoke-nodonors-asset-tail-old-semantics-v1/manifest.json), [tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_current_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_current_20260411.json), [tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_old_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_old_20260411.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_rental_export_large_nodonors/large-nodonors-rental-export-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_zero_support_mask_large_nodonors/large-nodonors-zero-support-mask-v1/manifest.json) | -| Current interest-family reading | Do not promote the `interest_income + tax_exempt_interest_share` decomposition into the default path yet. It looked strong on the `400/400` medium no-donor run, but the matched `2000/2000` no-donor confirmation regressed capped full-oracle loss from `1.3274` to `1.3555`, so the default remains separate `taxable_interest_income` and `tax_exempt_interest_income` lanes. 
| `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_medium_nodonors/medium-nodonors-interest-family-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_large_nodonors/large-nodonors-interest-family-v1/manifest.json) | -| Current donor-support sampling reading | Keep donor-support sampling with replacement. Forcing no-replacement support sampling looked cleaner mechanically but made the matched smoke run materially worse on both capped full-oracle and active-solve loss. | `Provisional` | [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1/manifest.json), [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_donor_support_sampling_smoke_nodonors/smoke-nodonors-donor-support-sampling-v1/manifest.json) | -| Current benchmark reading | On the current checkpoint artifact, harness metrics improved versus the incumbent comparator, but native broad loss is still much worse than `enhanced_cps_2024`. 
| `Canonical` | [pe_us_data_rebuild_parity.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_parity.json), [pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) | -| Current cross-run regression reading | Across 66 scored modelpass checkpoint runs, `national_irs_other` appears in the top 3 every time, `state_agi_distribution` in 63/66, and `state_aca_spending` in 54/66. Near-term model work should target those recurring families directly rather than broad tuning. | `Provisional` | [live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json) | -| Current `national_irs_other` drilldown reading | The audited `national_irs_other` failures are concentrated in filing-status-sensitive IRS cells and coincide with large `SINGLE` and `JOINT` overcounts plus `SEPARATE` undercounts. The first remediation step is to preserve source-authoritative filing-status inputs into the PE construction path. | `Provisional` | [live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json) | - -## Canonical pipeline - -The current broad US pipeline is: - -1. Load raw survey/tax sources into canonical observation frames. -2. Apply source semantics and variable semantics. -3. Build donor blocks and donor-condition surfaces. -4. Impute donor-only variables into the scaffold population. -5. Synthesize a candidate population. -6. Build PolicyEngine-ingestable entity tables. -7. Export final H5. -8. Run PolicyEngine materialization and compare implied aggregates to the active - target DB. -9. 
Save artifact bundles, sidecars, and registry/index records. - -This is a fresh Microplex pipeline with a PolicyEngine evaluation boundary, not -an attempt to make PE-US-data the runtime architecture. - -## What is currently canonical - -- Source and variable semantics are declared in Microplex-owned registries and - manifests. -- Final evaluation uses the shared PE-US runtime and active targets DB. -- Artifact discipline is required for serious runs: - - `manifest.json` - - `data_flow_snapshot.json` - - `policyengine_harness.json` when harness evaluation runs - - `policyengine_native_scores.json` when PE-native broad loss runs - - `pe_us_data_rebuild_parity.json` for incumbent-compatibility checkpoints - - `pe_us_data_rebuild_native_audit.json` for target/family/support audit - - `run_registry.jsonl` - - `run_index.duckdb` -- Incumbent-compatibility modes are allowed when they improve attribution. -- Materially different model choices should be explicit challenger variants. - -## What is still provisional - -- The default imputation stack is still under active evaluation. -- Support realism vs MAE tradeoffs are still live methodological questions. -- Full-support candidate construction and selector design are not settled. -- Calibration is still operationally important, but it is not the only or even - always the dominant methodological lever. -- Held-out evaluation is not yet the default outer loop. - -These should not be written up later as if they were settled all along. - -## Current open questions - -- Should runtime imputation selection prioritize support realism, weighted MAE, - or a gated combination of the two? -- How much conditioning structure should be imposed before flexible donor/QRF - prediction begins? -- How much of the remaining broad-loss gap is record construction versus - selection/calibration? 
-- Should deferred calibration eligibility stay at a single scalar trigger - (`full_oracle_capped_mean_abs_relative_error > 2.45`), or should it become - family-aware once larger source runs accumulate? -- Which incumbent-compatible modes are worth keeping as long-run options, and - which should remain diagnostic-only? -- When should held-out evaluation become a required gate rather than an optional - extra? - -## Current methodological evidence surfaces - -Use these surfaces when writing claims down later: - -- [benchmarking.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/benchmarking.md) - for the truth/comparator/operator contract -- [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md) - for incumbent-compatibility rules -- [pe-construction-parity.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/pe-construction-parity.md) - for audited construction-layer matching vs intentional difference -- saved artifact bundles for actual run-level evidence -- tests for the code-enforced contract behind those claims - -For the current checkpoint-style evidence bundle, the most useful files are: - -- [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/manifest.json) -- [data_flow_snapshot.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/data_flow_snapshot.json) -- [policyengine_harness.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/policyengine_harness.json) -- [policyengine_native_scores.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/policyengine_native_scores.json) -- [pe_us_data_rebuild_parity.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_parity.json) -- 
[pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) -- [imputation_ablation.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/imputation_ablation.json) -- [live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json) -- [live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json) - -## Decision log - -### 2026-04-10: Project framing - -- Decision: - - describe `policyengine-us` as the oracle/evaluator and - `policyengine-us-data` as the incumbent comparator -- Why: - - this matches how the system is actually being used - - it avoids understating the novelty of the Microplex runtime - - it keeps incumbent-compatibility work from swallowing the whole project -- Evidence: - - [benchmarking.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/benchmarking.md) - - [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md) - -### 2026-04-10: Imputation evaluation contract - -- Decision: - - keep support realism and MAE as separate evidence channels - - do not summarize imputation quality using post-calibration loss alone -- Why: - - the current checkpoint artifact shows a real tradeoff - - `structured_pe_conditioning` wins support - - `top_correlated_qrf` wins weighted MAE - - collapsing the two too early would hide methodology risk -- Evidence: - - [pe_us_data_rebuild_parity.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_parity.json) - - 
[pe_us_data_rebuild_native_audit.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/pe_us_data_rebuild_native_audit.json) - - [imputation_ablation.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/imputation_ablation.json) - -### 2026-04-10: Artifact contract for headline claims - -- Decision: - - treat sidecars and registry metadata as part of the methodology, not just - engineering exhaust -- Why: - - paper-facing claims will need reproducible evidence with exact configs, - metrics, and comparison slices - - the artifact bundle is now the canonical storage layer for that evidence -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/manifest.json) - - [data_flow_snapshot.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/checkpoints/checkpoint-ablation-real-20260410a/data_flow_snapshot.json) - -### 2026-04-10: Cross-run regression priority - -- Decision: - - prioritize targeted fixes for `national_irs_other`, - `state_agi_distribution`, and then `state_aca_spending` -- Why: - - across recent modelpass checkpoint families, the same regressions recur even - when total loss improves substantially - - `national_irs_other` appears in the top 3 for all 66 scored runs - - `state_agi_distribution` appears in the top 3 for 63/66 runs and is the - largest regressing family in 34 runs - - `state_aca_spending` appears in the top 3 for 54/66 runs but is more often - a secondary or tertiary regression -- Evidence: - - [live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_modelpass_regression_summary_20260410.json) - -### 2026-04-10: First `national_irs_other` remediation target - -- Decision: - - first fix the preservation of source-authoritative filing-status inputs in 
- the PE-oracle rebuild path before attempting more downstream status tuning -- Why: - - audited `national_irs_other` lead runs show repeated IRS target failures in - filing-status-sensitive cells, especially `Single`, `Joint`, and high-AGI - bins - - those same audited runs show large `SINGLE` and `JOINT` count surpluses, - large `SEPARATE` deficits, and missing or distorted MFS support bins - - the saved candidate seed/synthetic/calibrated rows for leading runs retain - `marital_status` but not `filing_status_code`, so the authoritative PUF tax - filing code is disappearing before tax-unit construction -- Evidence: - - [live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_national_irs_other_drilldown_20260410.json) - -### 2026-04-11: Full-oracle accounting means the full DB - -- Decision: - - score `full_oracle_*` metrics over the full active targets DB, not just the - supported subset - - penalize unsupported rows explicitly rather than letting them disappear from - the scalar objective - - keep supported-only summaries as separate diagnostics -- Why: - - "measure everything, optimize the feasible subset" only works if the - measurement metric actually reflects unsupported misses - - otherwise frontier selection and deferred-stage triggers can be gamed by - leaving hard rows unsupported -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) - - [policyengine-oracle-compatibility.md](/Users/maxghenis/PolicyEngine/microplex-us/docs/policyengine-oracle-compatibility.md) - -### 2026-04-11: Full DB measurement, staged calibration execution - -- Decision: - - keep the full active targets DB as one measurement catalog - - classify rows into `solve_now`, `solve_later`, or `audit_only` - - use a dense first 
pass plus at most one deferred pass by default on the - incumbent-compatible PE-oracle rebuild path -- Why: - - one flat broad solve is not numerically credible on thinner artifacts - - the right execution rule is support-aware staging, not shadow target CSVs - or pretending all DB rows belong in the same solve -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_donors/donors-source-corrected-oracle-v1/manifest.json) - -### 2026-04-11: Current deferred-stage default - -- Decision: - - default deferred calibration on the incumbent-compatible PE-oracle rebuild - path uses: - - one deferred pass at support floor `10` - - deferred-pass cap `24` - - trigger threshold `full_oracle_capped_mean_abs_relative_error > 2.45` -- Why: - - tiny-source evidence still benefits from the deferred pass - - medium, donor-inclusive, and larger replayed/source artifacts do not justify - attempting it below that threshold -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_donors/donors-source-corrected-oracle-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_large_donors/large-donors-source-corrected-oracle-v1/manifest.json) - - 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_large_nodonors/large-nodonors-source-corrected-oracle-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json) - - [tmp_corrected_oracle_large_replay_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_corrected_oracle_large_replay_20260411.json) - - [tmp_corrected_oracle_xlarge_replay_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_corrected_oracle_xlarge_replay_20260411.json) - -### 2026-04-11: Support person-to-tax-unit count targets in the PE compiler - -- Decision: - - support `person -> tax_unit/family/spm_unit` boolean target filters in the - PE household-constraint compiler using group-membership `.any()` semantics -- Why: - - broad-oracle runs were carrying an artificial unsupported wall across 11 - whole `tax_unit_count` families such as `dividend_income`, - `taxable_interest_income`, and `unemployment_compensation` - - those targets are defined as `tax_unit_count` with person-entity domain - filters like `dividend_income > 0` plus tax-unit filters like - `tax_unit_is_filer == 1` - - removing that structural limitation dropped unsupported targets on the - large no-donor replay from `572` to `0`, and the fresh source rerun improved - capped full-oracle loss from `2.4329` to `1.3274` -- Evidence: - - [tmp_large_source_cross_entity_fix_replay_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_large_source_cross_entity_fix_replay_20260411.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json) - -### 2026-04-11: Residual oracle work should target age, 
EITC-child-count, AGI, and OR/GA/MO - -- Decision: - - prioritize post-fix model and construction work against the remaining large-run - oracle leaders rather than more deferred-stage tuning -- Why: - - fresh donor and no-donor `2000/2000` source runs now share the same top - full-oracle residual families and geographies - - the largest remaining families are age counts, `tax_unit_count` for - `eitc_child_count`, and AGI count families; the leading geographies are - `state:OR`, `state:GA`, and `state:MO` - - within those geographies, the worst cells are concentrated in ACA PTC, - AGI counts, SALT, rental income, tax-exempt interest income, and - pass-through income -- Evidence: - - [tmp_policyengine_oracle_regressions_cross_entity_fix_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_regressions_cross_entity_fix_20260411.json) - - [tmp_policyengine_oracle_target_drilldown_cross_entity_fix_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_target_drilldown_cross_entity_fix_20260411.json) - -### 2026-04-11: Keep the interest/rental conditioning change, reject the property-cost extension - -- Decision: - - keep the richer interest/rental donor-conditioning semantics - - do not promote the property-cost semantic extension into the default pipeline -- Why: - - on matched `200/200` smoke checkpoints, the accepted interest/rental change - slightly improves capped full-oracle loss from `1.4417803` to `1.4414441` - and lowers active-solve capped loss from `1.8878380` to `1.8829362` - - the accepted change cuts the capped stored-input mass attributed to - `tax_exempt_interest_income` in the top drilldown from `40` to `20` - - the follow-on property-cost extension made capped full-oracle loss worse - (`1.4489770`) and doubled property-side capped mass in the top drilldown, - so it was reverted -- Evidence: - - 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_oldsemantics/smoke-nodonors-asset-tail-old-semantics-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_v2/smoke-nodonors-asset-tail-conditioning-v2/manifest.json) - - [tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_current_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_current_20260411.json) - - [tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_old_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_policyengine_oracle_target_drilldown_asset_tail_smoke_old_20260411.json) - -### 2026-04-11: Reject rental export normalization from donor-integrated components - -- Decision: - - do not rebuild net `rental_income` at PolicyEngine export from - `rental_income_positive - rental_income_negative` - - keep exporting the observed net `rental_income` directly in the default path -- Why: - - a saved-seed replay looked promising and improved capped full-oracle loss - from `1.3274` to `1.3169`, which made the export-side normalization look - like a clean way to use donor-integrated rental components - - the fresh `2000/2000` large no-donor source checkpoint contradicted that - replay: capped full-oracle loss worsened from `1.3274` to `1.3874` - - active-solve capped loss also worsened from `2.6923` to `2.7722`, and the - number of active constraints fell from `540` to `522` - - source checkpoints decide default-path changes; replay-only wins are not - sufficient -- 
Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_rental_export_large_nodonors/large-nodonors-rental-export-v1/manifest.json) - -### 2026-04-11: Reject direct zero-support-mask propagation in zero-inflated donor rank matching - -- Decision: - - do not make zero-inflated donor rank matching honor the generated support mask - directly by replacing the donor positive-rate count with `scores > 0` - - keep the existing donor-rate-based positive count in the default path -- Why: - - the idea was structurally coherent: the QRF path already trains a zero model, - so propagating its zero mask through final donor assignment looked like a way - to stop rank matching from reintroducing positive tail support - - the fresh `2000/2000` large no-donor source checkpoint failed badly: - capped full-oracle loss worsened from `1.3274` to `1.9223` - - active-solve capped loss worsened from `2.6923` to `4.3296`, and active - constraints rose from `540` to `703`, so the change was not merely trading - one metric for another - - again, source checkpoints decide default-path changes -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_zero_support_mask_large_nodonors/large-nodonors-zero-support-mask-v1/manifest.json) - -### 2026-04-11: Reject interest-family decomposition as the default path - -- Decision: - - do not promote the `interest_income + tax_exempt_interest_share` donor - block into the default pipeline - - keep `taxable_interest_income` 
and `tax_exempt_interest_income` on separate - donor lanes for now -- Why: - - the medium no-donor checkpoint was promising: capped full-oracle loss fell - from `2.3931` to `1.3644` - - the matched large no-donor confirmation did not hold: capped full-oracle - loss worsened from `1.3274` to `1.3555` - - raw full-oracle loss also worsened sharply on the large run, from `2256.6` - to `16980.7`, and active-solve capped loss worsened from `2.6923` to - `2.8229` - - the default path should follow the larger, more representative no-donor run, - not the thinner medium win -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_medium_nodonors/medium-nodonors-interest-family-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_corrected_oracle_source_medium/medium-source-corrected-oracle-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_cross_entity_fix_large_nodonors/large-nodonors-cross-entity-fix-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_interest_family_large_nodonors/large-nodonors-interest-family-v1/manifest.json) - -### 2026-04-11: Reject donor-support sampling without replacement - -- Decision: - - keep donor-support sampling with replacement in the default donor path -- Why: - - a no-replacement support sampler sounds cleaner, but the matched smoke run - was worse on the only metrics that matter here - - capped full-oracle loss worsened from `1.4414` to `1.6369` - - active-solve capped loss worsened from `1.8829` to `2.7402` - - this should remain a rejected experiment unless a stronger construction - change makes it worthwhile to revisit -- Evidence: - - 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_asset_tail_conditioning_smoke_nodonors_current/smoke-nodonors-asset-tail-conditioning-current-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_donor_support_sampling_smoke_nodonors/smoke-nodonors-donor-support-sampling-v1/manifest.json) - -### 2026-04-11: Correct upstream EITC-recipient child-count target semantics - -- Decision: - - treat IRS SOI EITC child-count targets as recipient strata that require - `eitc > 0`, not just filer strata split by `eitc_child_count` - - keep Microplex compatible with the corrected DB by treating - `domain_variable` as a set-membership field when target rows carry multiple - domain constraints such as `eitc,eitc_child_count` -- Why: - - the active targets DB guide already described `eitc_child_count` as EITC - recipient strata, and `policyengine-us-data`'s own loss code evaluates - those cells as `(eitc > 0) * meets_child_criteria` - - the ETL was the inconsistent layer: it created child-count strata under - filer strata without the positive-EITC condition - - after correcting the DB and rerunning the matched `2000/2000` large - no-donor source checkpoint, capped full-oracle loss fell from `1.0149` - to `0.9718` on an apples-to-apples corrected-oracle comparison - - the same comparison moved `tax_unit_count|domain=eitc_child_count` out of - the top-3 residual families, so this was a real oracle bug, not just a - cosmetic target renaming -- Evidence: - - [tmp_eitc_recipient_oracle_large_nodonors_comparison_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_eitc_recipient_oracle_large_nodonors_comparison_20260411.json) - - [tmp_eitc_recipient_oracle_regression_summary_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_eitc_recipient_oracle_regression_summary_20260411.json) - - 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_eitc_recipient_oracle_large_nodonors/large-nodonors-eitc-recipient-oracle-v2/manifest.json) - -### 2026-04-11: Default to a narrow always-considered deferred stage - -- Decision: - - default PE-oracle rebuilds should always consider one deferred support-10 - calibration pass - - keep that pass narrow by default: top 3 deferred families, top 4 deferred - geographies, and at most 24 constraints - - let the existing capped full-oracle accept/reject rule decide whether the - stage is retained, instead of gating the attempt behind a hard trigger -- Why: - - after the EITC-recipient oracle fix, the old `2.45` trigger became the - brittle heuristic rather than the principled part of the policy - - on matched `2000/2000` large no-donor and donor-inclusive source runs, the - same narrow stage-2 pass improved capped full-oracle loss from `0.9729` to - `0.9498` and from `0.9730` to `0.9502` - - the same narrow pass also improved the medium no-donor run slightly, from - `1.0298` to `1.0291`, so the accept/reject rule is carrying the right - burden and the hard trigger is not buying us much -- Evidence: - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_nodonors/large-nodonors-age-agi-forced-stage2-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_large_donors/large-donors-age-agi-forced-stage2-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_age_agi_forced_stage2_medium_nodonors/medium-nodonors-age-agi-forced-stage2-v1/manifest.json) - - 
[manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_default_stage2_large_donors/large-donors-default-stage2-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_default_stage2_donors/broader-donors-default-stage2-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_default_stage2_nodonors/broader-nodonors-default-stage2-v1/manifest.json) - -### 2026-04-11: Do not widen deferred stage family focus just to include ACA - -- Decision: - - keep the default deferred-stage family focus at 3 rather than widening it - to 4 just to admit `aca_ptc|domain=aca_ptc` -- Why: - - the broader no-donor row-level drilldown made ACA look like the next - plausible family to admit into stage 2, but the matched `5000/5000` - checkpoint with `top_family_count = 4` produced the exact same final result - as `top_family_count = 3` - - capped full-oracle loss stayed at `0.8908588019931089` - - active-solve capped loss stayed at `0.8950141021216582` - - the stage-2 cap remained `24`, so widening the family focus did not - meaningfully change which cells won capacity -- Evidence: - - [tmp_broader_nodonor_oracle_drilldown_20260411.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_broader_nodonor_oracle_drilldown_20260411.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_default_stage2_nodonors/broader-nodonors-default-stage2-v1/manifest.json) - - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_nodonor_top4family/broader-nodonors-top4family-v1/manifest.json) - -### 2026-04-11: Prioritize deferred stage-2 rows by row-level loss inside the focused cap - -- Decision: - - keep the row-aware deferred 
selector - - within the existing top-3-family / top-4-geography focus and 24-constraint - cap, rank candidate stage-2 rows by capped target error plus family and - geography loss share rather than family/geography share alone -- Why: - - widening the focused family set did nothing because the bottleneck is the - 24-slot cap, not admission into the focused set - - the row-aware ranking is neutral on the medium no-donor checkpoint, slightly - better on the broader no-donor checkpoint, and materially better on the - broader donor-inclusive checkpoint - - that is the right direction for the actual objective, capped full-oracle - loss, without changing the surrounding stage-2 policy -- Evidence: - - matched medium no-donor row-aware rerun: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_rowrank_nodonors/medium-nodonors-rowrank-v1/manifest.json) - - unchanged from the prior medium default, `1.0298017982 -> 1.0291445335` - - matched broader no-donor row-aware rerun: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_nodonors/broader-nodonors-rowrank-v1/manifest.json) - - improves capped full-oracle loss from `0.8908588020` to - `0.8907527501` - - matched broader donor-inclusive row-aware rerun: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_rowrank_donors/broader-donors-rowrank-v1/manifest.json) - - improves capped full-oracle loss from `0.8932869027` to - `0.8782556650` - -### 2026-04-11: Default to an extra ultra-thin deferred stage after the support-10 pass - -- Decision: - - change the canonical PE-oracle rebuild default from one deferred support-10 - pass to two deferred passes at support `10` and `1` - - keep the same `24`-constraint cap and top-3-family / top-4-geography focus - on each deferred pass -- Why: - - the support-1 stage is now 
solving the right residual class: mostly - ultra-thin age and AGI rows that remain after the row-aware support-10 pass - - it improves the actual objective, capped full-oracle loss, on broader - donor-inclusive, broader no-donor, and medium no-donor reruns - - the existing accept/reject rule already prevents the stage from sticking if - it ever becomes harmful on another run -- Evidence: - - matched broader donor-inclusive rerun with an extra support-1 stage: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_donors/broader-donors-stage3-v1/manifest.json) - - improves capped full-oracle loss from `0.8782556650` to - `0.8212707783` - - matched broader no-donor rerun with the same extra support-1 stage: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_nodonors/broader-nodonors-stage3-v1/manifest.json) - - improves capped full-oracle loss from `0.8907527501` to - `0.8362042462` - - matched medium no-donor rerun with the same extra support-1 stage: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_nodonors/medium-nodonors-stage3-v1/manifest.json) - - improves capped full-oracle loss from `1.0291445335` to - `1.0028694956` - - fresh medium no-donor checkpoint through the default entrypoint: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_stage3_nodonors/medium-nodonors-default-stage3-v1/manifest.json) - - reproduces the same three-stage result exactly, confirming the default - schedule is now `(10, 1)` in the real entrypoint path - -### 2026-04-11: Default deferred family focus should be 4, not 3 - -- Decision: - - change the canonical PE-oracle rebuild default from top-3 deferred - families to top-4 deferred families, keeping the same top-4 geographies 
and - 24-constraint cap -- Why: - - after the row-aware selector and the extra support-1 stage, ACA PTC becomes - the fourth largest deferred family by capped loss mass and still has many - cells with support in the teens - - that means it is being excluded by family admission, not by impossible - support, and letting it into the focused set materially improves the - full-oracle objective -- Evidence: - - matched broader donor-inclusive rerun with top-4 deferred families: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_donors/broader-donors-stage3-top4family-v1/manifest.json) - - improves capped full-oracle loss from `0.8212707783` to - `0.7908917500` - - matched broader no-donor rerun with top-4 deferred families: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_broader_stage3_top4family_nodonors/broader-nodonors-stage3-top4family-v1/manifest.json) - - improves capped full-oracle loss from `0.8362042462` to - `0.7995775732` - - matched medium no-donor rerun with top-4 deferred families: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_stage3_top4family_nodonors/medium-nodonors-stage3-top4family-v1/manifest.json) - - improves capped full-oracle loss from `1.0028694956` to - `0.9968822972` - - fresh medium no-donor checkpoint through the default entrypoint: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260411_medium_default_top4family_nodonors/medium-nodonors-default-top4family-v1/manifest.json) - - reproduces the same top-4-family result exactly, confirming the default - family focus is now `4` in the real entrypoint path - -### 2026-04-12: Keep deferred geography focus at 4 - -- Decision: - - keep the canonical PE-oracle rebuild default at top-4 deferred geographies - rather 
than widening the geography focus further -- Why: - - the fresh broader donor-inclusive default-entrypoint rerun reproduces the - existing top-4-family/top-4-geography result exactly, so the default path is - already stable on the current broader donor benchmark - - the fresh residual drilldown does show age and AGI pressure spread across - several states, but widening geography focus to `8` on the same matched - broader donor run worsens the real objective instead of helping -- Evidence: - - fresh broader donor-inclusive checkpoint through the unmodified default - entrypoint: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_default_top4family_donors_rerun/broader-donors-default-top4family-v2/manifest.json) - - reproduces capped full-oracle loss `0.7908917500` with the default - top-4-family/top-4-geography policy - - matched broader donor-inclusive rerun with top-8 deferred geographies: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_geo8_donors/broader-donors-geo8-v1/manifest.json) - - regresses capped full-oracle loss from `0.7908917500` to - `0.7991939177` - - fresh broader donor default drilldown: - [tmp_broader_default_top4family_donor_drilldown_20260412.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_broader_default_top4family_donor_drilldown_20260412.json) - - confirms the remaining capped-error mass is still led by age, AGI, ACA, - and EITC families, so the next work should move upstream rather than - continuing to widen deferred geography focus - -### 2026-04-12: Reject PE-style CPS tax-leaf splits at both tested boundaries - -- Decision: - - reject both tested versions of the CPS AGI-alignment hypothesis: - - do not materialize PE-style interest/dividend/pension leaf inputs inside - the CPS source provider for the mixed-source rebuild path - - do not apply the same split inside the default 
PolicyEngine export - builder either -- Why: - - `policyengine-us-data` does use fixed CPS split assumptions for those leaf - inputs, but Microplex is not a single-source CPS build; it is a mixed-source - fusion path where early promotion of estimated tax leafs can distort donor - integration and downstream calibration - - the source-side version confirmed that concern directly by creating a large - new tax-exempt-interest residual family on the broader donor benchmark - - moving the split later to the export boundary avoids the catastrophic source - distortion, but it still does not beat the incumbent default on the - frontier metric -- Evidence: - - matched broader donor incumbent baseline: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1/manifest.json) - - capped full-oracle loss `0.7329149849` - - source-side CPS leaf-input candidate: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_cps_pe_agi_donors/broader-donors-cps-pe-agi-v1/manifest.json) - - regresses capped full-oracle loss to `0.9164981002` - - introduces large new interest-family residuals, especially - `tax_unit_count|domain=tax_exempt_interest_income` - - export-side candidate: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_pe_export_cps_agi_donors/broader-donors-pe-export-cps-agi-v1/manifest.json) - - improves on the source-side candidate but still regresses capped - full-oracle loss to `0.7998451134` -- Read: - - the direct PE CPS split assumptions are not plug-compatible with the - current Microplex broader rebuild path - - this lane should be treated as explored and rejected for the current - frontier objective, not as an untested TODO - - next upstream AGI work should look for better alignment boundaries than - copying PE CPS tax-leaf 
splits wholesale - -### 2026-04-12: Keep donor checkpoint `state x age-band` support floor - -- Decision: - - keep the donor-side analogue of the accepted CPS checkpoint `state x age-band` - floor in the default sampled-query path for donor-inclusive checkpoints -- Why: - - the current checkpoint asymmetry was real: CPS sampling guaranteed - `state x 5-year age-band` coverage, while donor survey sampling still only - applied a plain state floor - - donor survey providers already carry household state and person age for the - sources where this matters, so the cleanest test was to mirror the CPS - checkpoint floor there and keep it only if the full-oracle metric moved - - the improvement is small, but the run is deterministic and the code surface - is narrow, so this is still worth keeping as a low-risk checkpoint-default - refinement -- Evidence: - - matched broader donor baseline with the accepted CPS age floor only: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_stateage1_donors/broader-donors-cps-stateage1-v1/manifest.json) - - capped full-oracle loss `0.7329149849` - - active-solve capped loss `0.8498782563` - - selected constraints `1059` - - matched broader donor rerun with donor-side `state x age-band` floor: - [manifest.json](/Users/maxghenis/PolicyEngine/microplex-us/artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_donor_stateage1_donors/broader-donors-donor-stateage1-v1/manifest.json) - - capped full-oracle loss `0.7327632809` - - active-solve capped loss `0.8495978941` - - selected constraints `1059` -- Read: - - this is not a large methodological change and should not be described that - way - - it is a small but real upstream support improvement on the big metric, and - it keeps the donor-inclusive checkpoint path more symmetric with the accepted - CPS checkpoint support rule - -## 2026-04-12 keep PE-style PUF person-expansion randomness - -- Code: - - keep 
PE-style random-in-bin decoding for `_puf_agerange`, - `_puf_agedp*`, and `_puf_earnsplit` in - `src/microplex_us/data_sources/puf.py` - - keep PE-style spouse/dependent sex draws in the same PE-demographics branch - - keep the seeded PE-demographics regression in - `tests/test_puf_source_provider.py` -- Why: - - the previous implementation was a direct parity bug, not a modeling choice: - it decoded PE demographic helper bins to fixed midpoints, while - `policyengine-us-data` samples within those coded intervals and uses - randomized spouse/dependent sex assignment - - this is upstream alignment work on the exact PUF construction boundary, - which is a better next step than inventing a new AGI heuristic -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/puf.py tests/test_puf_source_provider.py` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons or sample_tax_units'` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'not pre_tax_contributions_via_policyengine_subprocess'` -- Artifacts: - - source-stage parity candidate: - `artifacts/tmp_puf_source_stage_parity_personexpansion_20260412.json` - - legacy source-stage parity reference: - `artifacts/source_stage_parity_20260408/puf_2024_raw_source_stage_parity.json` - - matched broader donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_donors/broader-donors-puf-personexpansion-v1` - - matched broader no-donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_nodonors/broader-nodonors-puf-personexpansion-v1` -- Read: - - raw PUF source-stage parity moves materially closer to PolicyEngine on the - most relevant variables: - - age weighted-mean ratio: `1.0367 -> 1.0275` - - employment-income weighted-mean ratio: `1.2196 -> 0.9996` - - taxable-interest weighted-mean ratio: `2.2495 -> 1.1774` - - matched broader no-donor checkpoint: - - baseline capped 
full-oracle loss: `0.7368409543` - - candidate capped full-oracle loss: `0.7336528770` - - delta: `-0.0031880773` - - active-solve capped loss: `0.8497778115 -> 0.8005940161` - - matched broader donor checkpoint: - - baseline capped full-oracle loss: `0.7327632809` - - candidate capped full-oracle loss: `0.7342149723` - - delta: `+0.0014516915` worse - - active-solve capped loss: `0.8495978941 -> 0.8037192584` - - conclusion: - - keep the upstream parity fix - - do not overclaim it as an unconditional frontier win - - treat the donor-path regression as the next interaction to investigate, - rather than reverting a real PE-alignment correction - -## 2026-04-12 keep only PE-style `EARNSPLIT` randomization by default - -- Code: - - keep PE-style `EARNSPLIT` sampling in - `src/microplex_us/data_sources/puf.py` - - revert default PE-demographics age-bin and spouse/dependent-sex - randomization in the same file - - keep the updated PE-demographics regression in - `tests/test_puf_source_provider.py` -- Why: - - the first bundled parity fix mixed two conceptually separate changes: - - age/sex randomization - - income-split randomization - - the only clean way to decide what belongs in the default path was a matched - ablation on the broader donor checkpoint -- Focused verification: - - `python -m py_compile src/microplex_us/data_sources/puf.py tests/test_puf_source_provider.py` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons or sample_tax_units'` - - `uv run pytest tests/test_puf_source_provider.py -q -k 'not pre_tax_contributions_via_policyengine_subprocess'` -- Artifacts: - - donor baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_donor_stateage1_donors/broader-donors-donor-stateage1-v1` - - age/sex-only ablation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_ageonly_donors/broader-donors-puf-personexpansion-ageonly-v1` - - earnsplit-only ablation: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_earnsplitonly_donors/broader-donors-puf-personexpansion-earnsplitonly-v1` - - real code-path confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_donors/broader-donors-puf-personexpansion-default-v2` -- Read: - - age/sex-only is clearly the wrong half for the current frontier objective: - - baseline capped full-oracle loss: `0.7327632809` - - candidate: `0.7463902007` - - delta: `+0.0136269199` worse - - earnsplit-only is clearly the right half: - - candidate: `0.7176041064` - - delta vs baseline: `-0.0151591745` - - active-solve capped loss: `0.8495978941 -> 0.7726915403` - - the real code-path rerun matches the winning ablation exactly - - conclusion: - - default to PE-style `EARNSPLIT` randomization - - do not default to PE-style age/sex randomization yet - - treat age-bin randomization as an open parity lane rather than a settled - improvement - -## 2026-04-12 widen deferred family focus to 7 after `EARNSPLIT` - -- Code: - - `src/microplex_us/pipelines/pe_us_data_rebuild.py` - - `tests/pipelines/test_pe_us_data_rebuild.py` - - `tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - after the accepted `EARNSPLIT` fix, the sharpest surviving rows were no - longer mostly age/AGI; the worst individual cells were now concentrated in - `aca_ptc` and `rental_income` - - the staged selector was still spending its family slots on AGI and EITC - pairs, so ACA and rental were being excluded from deferred consideration - even when they were among the highest-error rows -- Focused verification: - - matched broader donor checkpoint with `top_family_count = 7` - - donor-free broader confirmation with `top_family_count = 7` - - `uv run pytest tests/pipelines/test_pe_us_data_rebuild.py tests/pipelines/test_pe_us_data_rebuild_checkpoint.py -q` -- Artifacts: - - 
donor baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_donors/broader-donors-puf-personexpansion-default-v2` - - donor family-7 rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - donor-free baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_default_nodonors/broader-nodonors-puf-personexpansion-default-v2` - - donor-free confirmation: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_nodonors/broader-nodonors-puf-personexpansion-family7-v1` -- Read: - - on the broader donor run, widening deferred family focus from `4` to `7` - improves capped full-oracle loss from `0.7176041064` to `0.7044626415` - - the selected deferred families now explicitly include: - - `aca_ptc|domain=aca_ptc` - - `rental_income|domain=rental_income` - - the matched donor-free broader run also improves from `0.7170633141` to - `0.7039665310` with the same focused family set - - conclusion: - - promote `top_family_count = 7` into the default rebuild policy - - keep geography focus at `4` - - treat ACA/rental as active deferred-calibration families rather than - residuals that should stay outside the search surface - -## 2026-04-12 reject full PUF age/sex randomization again on top of family-7 - -- Code: - - `src/microplex_us/data_sources/puf.py` was restored to the earnsplit-only - default after the retest - - `tests/test_puf_source_provider.py` was restored to the incumbent - earnsplit-only regression expectations - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - revisiting upstream person structure was reasonable, but this specific - PE-style age/sex path had already lost once and needed to beat the current - stronger family-7 default, not the older top-family-4 baseline - - the clean test was a one-axis donor rerun 
with the current default config, - not another parity argument in the abstract -- Focused verification: - - `uv run pytest tests/test_puf_source_provider.py -q -k 'expand_to_persons_uses_pe_demographic_helpers_when_present or expand_to_persons_preserves_joint_tax_unit_monetary_totals or expand_to_persons_splits_negative_joint_self_employment_losses or expand_to_persons_clears_status_flags_for_non_head_members'` -- Artifacts: - - current donor incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - full-rng retest: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_rng_donors/broader-donors-puf-personexpansion-rng-v1` -- Read: - - donor incumbent capped full-oracle loss: - - `0.7044626415` - - full-rng retest: - - `0.7111876263` - - delta: - - `+0.0067249848` worse - - conclusion: - - keep the earnsplit-only default - - treat full PE-style age/sex randomization as re-rejected for the current - frontier objective - - move the next upstream work to AGI or EITC structure, not back into this - same person-expansion branch - -## 2026-04-12 keep CPS tax-unit structure at the source boundary - -- Code: - - `src/microplex_us/data_sources/cps.py` - - `tests/test_cps_source_provider.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - a direct code review against `policyengine-us-data` showed the main CPS - structural gap was that source tax-unit semantics were still too flat in - Microplex even when later pipeline stages could reconstruct similar roles - - the clean fix was to derive tax-unit head/spouse/dependent roles, - jointness, and dependent counts from raw `TAX_ID` in the CPS source layer - instead of leaving that work implicit downstream -- Verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py tests/test_cps_source_provider.py` - - `uv run pytest 
tests/test_cps_source_provider.py -q -k 'derives_tax_unit_roles_from_tax_id or caches_household_geography_on_persons or derives_survivor_and_dependent_social_security or loads_observation_frame or canonical_income_alias'` -- Artifacts: - - donor incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - source-structure rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_taxunit_structure_donors/broader-donors-cps-taxunit-structure-v1` -- Read: - - frontier metric is neutral: - - `0.7044626415 -> 0.7044626415` - - conclusion: - - keep the source-layer CPS tax-unit derivation - - treat it as architecture cleanup and PE-boundary alignment, not as an - independent frontier gain - -## 2026-04-12 reject direct CPS student flag on the broader donor checkpoint - -- Code: - - `src/microplex_us/data_sources/cps.py` was restored after the test - - `tests/test_cps_source_provider.py` was restored after the test - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - after moving tax-unit structure to the source boundary, the next narrow - EITC-side parity hypothesis was to expose `is_full_time_college_student` - directly from CPS `A_HSCOL`, because `policyengine-us` uses that input in - qualifying-child logic - - the clean test was a one-axis broader donor rerun, not an argument from - policy parity alone -- Verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py tests/test_cps_source_provider.py` - - `uv run pytest tests/test_cps_source_provider.py -q -k 'derives_tax_unit_roles_from_tax_id or caches_household_geography_on_persons or derives_survivor_and_dependent_social_security or loads_observation_frame or canonical_income_alias'` -- Artifacts: - - donor incumbent: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - student-input rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_cps_student_donors/broader-donors-cps-student-v1` -- Read: - - direct CPS student input is strongly harmful on the broader donor frontier: - - `0.7044626415 -> 0.7815651801` - - conclusion: - - do not promote `is_full_time_college_student` into the current mixed-source - broader default - - treat this as another case where direct PE CPS inputs are not - automatically plug-compatible with the broader Microplex path - -## 2026-04-12 reject partial preserved tax units as the broader mixed-source default - -- Code: - - `src/microplex_us/pipelines/us.py` - - `tests/pipelines/test_us.py` - - `artifacts/experiment_index.jsonl` - - `docs/methodology-ledger.md` -- Why: - - after the CPS tax-unit structure cleanup, the strongest remaining direct - alignment hypothesis was to keep authoritative source tax-unit IDs for - households that already have them and only optimize donor households with - missing tax-unit IDs - - that is a coherent architectural boundary, but it still had to beat the - broader donor frontier metric rather than just look more PE-like on paper -- Verification: - - `python -m py_compile src/microplex_us/pipelines/us.py tests/pipelines/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'preserve_existing_tax_unit_ids or falls_back_when_existing_tax_unit_ids_cross_households or partially_preserves_existing_tax_unit_ids'` -- Artifacts: - - donor incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - partial-preservation rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_partial_preserve_taxunits_donors/broader-donors-partial-preserve-taxunits-v1` -- Read: - - capped full-oracle loss regresses 
slightly: - - `0.7044626415 -> 0.7055670761` - - active-solve capped loss improves materially: - - `0.7909211525 -> 0.7648463685` - - conclusion: - - keep the mixed-preservation code path as an optional capability - - do not promote `policyengine_prefer_existing_tax_unit_ids=True` into the - current broader default - - move the next upstream work off this boundary and back to the remaining - AGI and EITC input/eligibility lanes - -## 2026-04-12 keep PE-style CPS `ssn_card_type` in the broader donor default - -- implemented PE-style CPS `ssn_card_type` derivation in - `src/microplex_us/data_sources/cps.py` - - use the raw CPS immigration, benefits, work, and housing-assistance fields - to assign: - - `CITIZEN` - - `NON_CITIZEN_VALID_EAD` - - `OTHER_NON_CITIZEN` - - `NONE` - - added a safe fallback so if a future CPS extract is missing one of the raw - helper fields, Microplex still emits `ssn_card_type = CITIZEN` rather than - silently dropping the column -- allowed `ssn_card_type` into the PE export surface in - `src/microplex_us/policyengine/us.py` - - mixed-source missing rows now backfill to `CITIZEN` at export time -- focused verification: - - `python -m py_compile src/microplex_us/data_sources/cps.py src/microplex_us/policyengine/us.py tests/test_cps_source_provider.py tests/policyengine/test_us.py` - - `uv run pytest tests/test_cps_source_provider.py -q -k 'ssn_card_type or derives_tax_unit_roles_from_tax_id'` - - `uv run pytest tests/policyengine/test_us.py -q -k 'default_policyengine_us_export_surface or defaults_missing_ssn_card_type_to_citizen'` -- artifact comparison: - - incumbent broader donor default: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_personexpansion_family7_donors/broader-donors-puf-personexpansion-family7-v1` - - `ssn_card_type` rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` -- read: - - capped full-oracle loss improves: - - 
`0.7044626415 -> 0.6955460` - - active-solve capped loss also improves: - - `0.7909211525 -> 0.7813926586` - - the direct `ssn_card_type` family improves sharply: - - `person_count|domain=ssn_card_type` - - `1.0000 -> 0.3786` - - EITC child-count families improve: - - `eitc|domain=eitc,eitc_child_count` - - `0.8283 -> 0.7499` - - `tax_unit_count|domain=eitc,eitc_child_count` - - `0.8154 -> 0.7408` - - the aggregate `eitc` row itself gets worse: - - `0.1066 -> 0.2954` - - conclusion: - - keep this change because it clears the frontier bar and the direction of - movement is specifically consistent with the intended EITC-identification - lane - - describe it narrowly: it improves the full-oracle metric and the - identification / child-count families, not “all EITC targets” - -## 2026-04-12 reject PE-style EITC take-up and voluntary filing inputs - -- implemented a PE-style `takes_up_eitc` / - `would_file_taxes_voluntarily` tax-unit input path in - `src/microplex_us/pipelines/us.py` - - the prototype used materialized `eitc_child_count` to assign PE-style - take-up rates and voluntary-filing draws before export - - a review pass also hardened the prototype so materialization failures fell - back explicitly instead of silently dropping the new columns -- temporarily exposed those variables in `src/microplex_us/policyengine/us.py` - so the PE export surface could carry them -- focused verification before the checkpoint: - - `python -m py_compile src/microplex_us/pipelines/us.py src/microplex_us/policyengine/us.py tests/pipelines/test_us.py tests/policyengine/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'build_policyengine_entity_tables'` - - `uv run pytest tests/policyengine/test_us.py -q -k 'default_policyengine_us_export_surface or defaults_missing_ssn_card_type_to_citizen'` -- artifact comparison: - - incumbent broader donor default: - 
`artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - take-up rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_takeup_donors/broader-donors-takeup-v1` -- read: - - capped full-oracle loss regresses: - - `0.6955460 -> 0.7041134` - - active-solve capped loss regresses: - - `0.7813927 -> 0.7896826` - - EITC child-count families improve: - - `eitc|domain=eitc,eitc_child_count` - - `0.7499 -> 0.7030` - - `tax_unit_count|domain=eitc,eitc_child_count` - - `0.7408 -> 0.6757` - - but the aggregate `eitc` family gets worse: - - `0.2954 -> 0.4010` - - ACA amount and count families also get worse: - - `aca_ptc|domain=aca_ptc` - - `2.3488 -> 2.5737` - - `tax_unit_count|domain=aca_ptc` - - `1.1521 -> 1.3708` - - conclusion: - - reject the change on the current broader donor frontier metric - - revert the code path and keep the broader runtime at the - `ssn_card_type` incumbent - - do **not** interpret this as rejecting the conceptual separation between: - - filing because required - - filing voluntarily for non-credit reasons - - filing to claim refundable credits - - taking up EITC conditional on filing / eligibility - - the rejection is narrower: the current late export-layer port of - `takes_up_eitc` and `would_file_taxes_voluntarily` is not yet the right - implementation in the broader mixed-source runtime - - if this lane is revisited later, treat it as a challenger path that - needs upstream filer / take-up calibration evidence rather than another - direct PE-input port - -## 2026-04-12 reject stronger `state x age-band` checkpoint floors - -- tested a matched broader donor checkpoint with stronger upstream checkpoint - sampling support: - - CPS `state_age_floor = 2` - - donor `state_age_floor = 2` -- artifact comparison: - - incumbent broader donor default: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - 
stronger-floor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_stateage2_donors/broader-donors-stateage2-v1` -- read: - - capped full-oracle loss regresses sharply: - - `0.6955460 -> 0.7361964` - - active-solve capped loss also regresses: - - `0.7813927 -> 0.8371045` - - the target family that motivated the run does improve: - - `person_count|domain=age` - - `0.4681 -> 0.4480` - - but the broader frontier gets worse because AGI, EITC-child-count, and ACA - families all move in the wrong direction: - - `person_count|domain=adjusted_gross_income` - - `0.7119 -> 0.7553` - - `tax_unit_count|domain=adjusted_gross_income` - - `0.6372 -> 0.6618` - - `eitc|domain=eitc,eitc_child_count` - - `0.7499 -> 0.8880` - - `tax_unit_count|domain=eitc,eitc_child_count` - - `0.7408 -> 0.8755` - - `aca_ptc|domain=aca_ptc` - - `2.3488 -> 2.9982` - - conclusion: - - reject stronger checkpoint age-floor heuristics - - keep the accepted `state_age_floor = 1` incumbent - - move the next parity work to upstream PUF age/AGI construction rather - than stronger checkpoint support heuristics - -## 2026-04-12 reject high-AGI-preserving PUF checkpoint samples - -- tested a matched broader donor checkpoint with a checkpoint-only PUF sampling - change: - - preserve the top raw PUF AGI tail whenever `sample_n` is active - - keep the rest of the broader donor runtime unchanged -- artifact comparison: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_puf_agi_tail_donors/broader-donors-puf-agi-tail-v1` -- metric read: - - capped full-oracle loss: - - `0.6955460 -> 1.1132009` - - active-solve capped loss: - - `0.7813927 -> 1.9290` - - selected constraints: - - `1031 -> 1163` - - a fast raw PUF source-stage proxy did improve taxable-interest and - dividend parity, but it simultaneously worsened self-employment and rental 
- structure enough that the real broader checkpoint failed outright -- action: - - reject high-AGI-preserving checkpoint PUF sampling - - revert the checkpoint-only sampler code path completely - - keep the broader donor incumbent on the accepted `ssn_card_type` runtime - - continue the next parity work in upstream construction/imputation rather - than checkpoint-only tail heuristics - -## Update rule - -Update this document when any of the following changes: - -- the canonical measurement contract -- the default runtime pipeline shape -- the default imputation or selection method family -- the meaning of the parity/audit sidecars -- the set of artifacts required for a headline claim -- the boundary between incumbent-compatibility work and challenger work - -## Paper extraction rule - -When writing the eventual paper: - -1. Start from this ledger, not from memory. -2. Pull claims only from code-backed docs and artifact-backed evidence. -3. Preserve the distinction between canonical, provisional, and open items. -4. Cite the exact artifact family that supported each headline claim. -5. Avoid rewriting temporary engineering names like `pe_us_data_rebuild` into - misleading methodological claims. - -## Naming note - -Some internal module names still say `pe_us_data_rebuild`. - -Treat that as historical naming, not as the canonical project description. 
The -canonical description is: - -- Microplex is the runtime -- PolicyEngine is the oracle/evaluator -- PE-US-data is the incumbent comparator - -## 2026-04-12 reject standalone ACA take-up construction patch, keep the concept - -- traced the ACA residual lane and confirmed that - `takes_up_aca_if_eligible` is a real PE construction-stage input rather than - a made-up Microplex feature - - PE-US-data assigns it during CPS construction - - PE-US uses it directly in the ACA PTC formula -- implemented the narrowest plausible version in - `src/microplex_us/pipelines/us.py` and `src/microplex_us/policyengine/us.py` - as a direct probe: - - add a deterministic PE-style `takes_up_aca_if_eligible` draw during - tax-unit construction - - expose that variable on the PE export surface -- verification before evaluation: - - `python -m py_compile src/microplex_us/pipelines/us.py src/microplex_us/policyengine/us.py tests/pipelines/test_us.py tests/policyengine/test_us.py` - - `uv run pytest tests/pipelines/test_us.py -q -k 'aca_takeup or export_policyengine_dataset or derives_tax_input_columns'` - - `uv run pytest tests/policyengine/test_us.py -q -k 'default_policyengine_us_export_surface_avoids_formula_aggregates'` -- evaluation method: - - reevaluated the incumbent broader donor synthetic population in memory - against the shared oracle instead of running a fresh saved checkpoint, - because disk pressure made a large rerun unreliable - - baseline: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - saved readout: - `artifacts/tmp_broader_aca_takeup_recalibration_20260412.json` -- metric read: - - capped full-oracle loss regresses: - - `0.6955460 -> 0.8211989` - - active-solve capped loss improves: - - `0.7813927 -> 0.7013644` - - the intended ACA families improve sharply: - - `aca_ptc|domain=aca_ptc` - - `2.3488 -> 0.5529` - - `tax_unit_count|domain=aca_ptc` - - `1.1521 -> 0.7112` - - 
`person_count|domain=aca_ptc,is_aca_ptc_eligible` - - `1.0994 -> 0.7771` -- action: - - reject this implementation from the default broader runtime and revert it - - keep the concept in scope as required upstream parity work - - interpret the result narrowly: - - this is not evidence against separate ACA take-up behavior - - it is evidence that a standalone tax-unit/export-boundary patch is the - wrong implementation boundary in the current mixed-source runtime - -## 2026-04-12 ACA child gap is mostly Medicaid crowd-out, not missing ACA knobs - -- ACA-specific review conclusion: - - beyond raw `has_marketplace_health_coverage` / `has_esi`, the only real - ACA-specific upstream input is `takes_up_aca_if_eligible` - - there is no large hidden ACA-specific construction surface still missing - from Microplex before export -- diagnostic comparison: - - compared the incumbent broader donor artifact - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1/policyengine_us.h5` - against PE's `enhanced_cps_2024.h5` - - saved readout: - `artifacts/tmp_broader_aca_eligibility_decomposition_20260412.json` -- read: - - the incumbent has higher under-20 Medicaid/CHIP eligibility than the PE - baseline: - - `eligible_share_under20`: `0.4909 -> 0.6094` - - `medicaid_share_under20`: `0.3930 -> 0.5278` - - the dominant driver is much lower child-unit `medicaid_income_level` in - the incumbent: - - median under-20 `medicaid_income_level`: - `15.1512 -> 1.6054` - - p75 under-20 `medicaid_income_level`: - `364.3831 -> 3.9464` - - child filing-status mix is not the main failure mode: - - the incumbent actually places more under-20s in `JOINT` units than the - PE baseline - - current interpretation: - - the next lane is AGI / tax-unit construction and imputation for child - units - - ACA should no longer be treated as primarily an ACA-specific export/input - problem - -## 2026-04-13 reject source tax-unit preservation as the 
broader donor default - -- hypothesis: - - because the seeded integrated microdata already has near-PE under-20 - singleton-tax-unit structure, preserving source `tax_unit_id` values in the - PE rebuild path might be a direct parity win and should beat the current - optimizer-driven rebuild on the big metric -- code path under test: - - flipped `policyengine_prefer_existing_tax_unit_ids` to `True` only in - `src/microplex_us/pipelines/pe_us_data_rebuild.py` - - left the generic `USMicroplexBuildConfig` default unchanged; this was only - a PE rebuild / checkpoint default probe - - updated the default-config assertions in - `tests/pipelines/test_pe_us_data_rebuild.py` - and - `tests/pipelines/test_pe_us_data_rebuild_checkpoint.py` -- verification: - - focused config tests passed - - an explorer review found no concrete code-level regression path from the - default flip - - matched broader donor source rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_preserve_taxunits_default_donors/broader-donors-preserve-taxunits-default-v1` -- read: - - the synthetic-data proxy was slightly positive: - - optimizer: `0.63654` - - preserve existing IDs: `0.63583` - - but the real broader donor checkpoint still loses on the mission metric: - - incumbent: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260412_broader_ssn_card_type_donors/broader-donors-ssn-card-type-v1` - - candidate: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_preserve_taxunits_default_donors/broader-donors-preserve-taxunits-default-v1` - - capped full-oracle loss: - `0.6955 -> 0.6977` - - active-solve capped loss: - `0.7814 -> 0.7624` - - selected constraints: - `1031 -> 1019` -- decision: - - reject the default flip and revert it from the canonical PE rebuild path - - keep source-tax-unit preservation as an optional structural probe rather - than the default -- interpretation: - - this is another case where a promising structural parity clue clears a - local or proxy test but still 
misses on the real broader frontier metric - - the child-unit AGI / Medicaid-income miss is still best treated as an - upstream construction / source-impute problem, not as a rebuild-default - switch we can justify today - -## 2026-04-13 reject minor-household source tax-unit preservation - -- hypothesis: - - if full source-tax-unit preservation is too broad, preserve source - `tax_unit_id` values only in households with minors and let the optimizer - rebuild adult-only households -- code path under test: - - added an opt-in experiment flag in `src/microplex_us/pipelines/us.py` so - preserved tax units applied only to households with at least one person - under age 20 - - added a focused household-level regression in - `tests/pipelines/test_us.py` -- verification: - - focused `py_compile` and preservation tests passed before the real run - - matched broader donor source rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_minorhousehold_preserve_taxunits_donors/broader-donors-minorhousehold-preserve-taxunits-v1` -- read: - - it materially fixes the exact child-structure symptom: - - under-20 singleton-tax-unit share: - `0.1538 -> 0.0345` - - under-20 mean `medicaid_income_level`: - `2.7279 -> 3.0408` - - under-20 median `medicaid_income_level`: - `1.5131 -> 1.8068` - - but it still loses on the broader donor mission metric: - - capped full-oracle loss: - `0.6955 -> 0.6985` - - active-solve capped loss: - `0.7814 -> 0.7614` - - selected constraints: - `1031 -> 1031` -- decision: - - reject the experiment and revert the code path -- interpretation: - - tax-unit assignment is only part of the child-lane miss - - the remaining gap is in child-linked AGI component construction, not just - which adults children are attached to - -## 2026-04-13 under-20 AGI miss is now clearly a component-construction problem - -- diagnostic comparison: - - compared the PE baseline, the broader donor incumbent, and the rejected - minor-household-preservation rerun on 
person-mapped under-20 tax-unit - aggregates -- read: - - the rejected preservation rerun raises under-20 mapped AGI and Medicaid - MAGI, but both remain far below the PE baseline: - - under-20 mapped `adjusted_gross_income`: - - PE baseline: `137623.5` - - incumbent: `85755.2` - - minor-preserve rerun: `98230.0` - - under-20 mapped `medicaid_magi`: - - PE baseline: `140533.9` - - incumbent: `86338.8` - - minor-preserve rerun: `98586.5` - - the surviving gap looks like AGI composition, not simple child attachment: - - under-20 mapped `tax_unit_partnership_s_corp_income`: - - PE baseline: `23323.0` - - incumbent: `9568.7` - - minor-preserve rerun: `10710.1` - - under-20 mapped `net_capital_gains`: - - PE baseline: `3200.0` - - incumbent: `534.3` - - minor-preserve rerun: `945.7` - - under-20 mapped `qualified_dividend_income`: - - PE baseline: `47.2` - - incumbent: `0.0` - - minor-preserve rerun: `0.0` - - under-20 mapped `tax_exempt_interest_income`: - - PE baseline: `4.68` - - incumbent: `0.0` - - minor-preserve rerun: `0.0` -- action: - - move the next direct-path lane to AGI component construction / source-impute - parity for child-linked tax units - - stop spending more effort on source-tax-unit preservation variants - -## 2026-04-13 reject PE-style sequential PUF joint-QRF imputation in the current donor runtime - -- hypothesis: - - the child-linked AGI miss might be coming from a real architecture gap: - PE imputes PUF tax variables with one sequential QRF over a joint block, - while Microplex currently donor-imputes those leaves mostly as independent - blocks - - a PE-like grouped sequential-QRF challenger for the main PUF AGI leaves - could therefore be a more direct parity move than more tax-unit heuristics -- code path under test: - - added a non-default `sequential_qrf` donor-imputer backend - - grouped the main PUF AGI component leaves into one joint donor block when - that backend was selected - - added focused regressions, then ran matched medium and 
broader donor - checkpoints -- verification: - - focused `py_compile` and the new block/backend regression slice passed - before the real runs - - matched medium donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_sequential_puf_joint_medium/medium-donors-sequential-puf-joint-v1` - - matched broader donor rerun: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_sequential_puf_joint_donors/broader-donors-sequential-puf-joint-v1` -- read: - - the broader donor frontier metric regresses: - - capped full-oracle loss: - `0.6955 -> 0.7190` - - active-solve capped loss: - `0.7814 -> 0.7757` - - selected constraints: - `1031 -> 999` - - the medium donor rerun is also not attractive: - - capped full-oracle loss: - `0.9426` - - active-solve capped loss: - `0.6618` - - a direct matched CPS+PUF stage probe on a `1000/1000` sample shows the - PE-like backend changes the child-linked AGI composition aggressively, but - not in a clearly correct direction: - - under-20 linked `qualified_dividend_income`: - `40.0 -> 1199.0` - - under-20 linked `taxable_interest_income`: - `507.2 -> 1634.6` - - under-20 linked `tax_exempt_interest_income`: - `4.66 -> 249.4` - - under-20 linked `taxable_pension_income`: - `9118.5 -> 19317.6` -- decision: - - reject the challenger and revert the experiment code -- interpretation: - - the parity observation is still useful: PE really does use a more joint - QRF architecture for this lane - - but a direct port into the current donor/rank-match runtime is not - numerically safe enough to keep - - keep the next lane on narrower upstream AGI construction / source-impute - parity for child-linked units, not on a wholesale donor-backend swap - -## 2026-04-13 reject post-donor zeroing of PUF tax leaves on dependent rows - -- diagnosis: - - the child-linked AGI misallocation is not coming from raw PUF person - expansion - - direct inspection of `PUFSourceProvider(..., expand_persons=True)` on a - matched sample showed under-20 
dependent rows carry zero - `partnership_s_corp_income`, `taxable_pension_income`, - `taxable_interest_income`, `qualified_dividend_income`, and - `tax_exempt_interest_income` - - the incumbent broader donor seed artifact instead carried large dependent - mass on some of those leaves, especially: - - under-20 `partnership_s_corp_income`: `4.09M` - - under-20 `taxable_pension_income`: `17.77M` - - under-20 `taxable_interest_income`: `33.98k` - - so the structural clue was real: donor integration is creating dependent-row - mass that is not present in raw expanded PUF -- tested: - - added a post-donor semantic guard that zeroed the affected PUF tax leaves on - rows with `is_tax_unit_dependent > 0` - - verified locally that the guard nearly removed the seeded child mass: - - under-20 `partnership_s_corp_income`: `4.09M -> 87.3k` - - under-20 `taxable_pension_income`: `17.77M -> 172.6k` - - under-20 `taxable_interest_income`: `33.98k -> 3.28k` - - ran a matched broader donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_dependent_zero_tax_leaves_donors/broader-donors-dependent-zero-tax-leaves-v1` -- read: - - the real frontier result is decisively worse: - - capped full-oracle loss: - `0.6955 -> 1.1372` - - active-solve capped loss: - `0.7814 -> 1.6581` - - the first calibration stage was already much worse than the incumbent: - - post-stage-1 capped full-oracle loss: - `1.3660` - - later deferred stages improved on that bad starting point, but still never - recovered: - - post-stage-2 capped full-oracle loss: - `1.2460` - - final capped full-oracle loss: - `1.1372` -- decision: - - reject the guard and revert the code -- interpretation: - - the structural diagnosis still holds: donor integration is where the - dependent-row mass is being created - - but a blunt post-donor zeroing rule destroys too much signal elsewhere and - is not a valid repair - - the next lane should target narrower donor-impute/source-impute parity for - these leaves, not 
post-hoc dependent suppression - -## 2026-04-13 reject dependent-role partitioning inside donor imputation - -- hypothesis: - - the blunt post-donor zeroing guard failed because it acted too late - - a narrower parity move would be to keep the donor-impute path but partition - fitting and matching by `is_tax_unit_dependent` for the leaves that were - actually exploding on child-linked rows: - - `partnership_s_corp_income` - - `taxable_pension_income` - - `taxable_interest_income` -- tested: - - added a block-level exact-match partition on `is_tax_unit_dependent` for - those singleton donor blocks - - verified the block-planning assertions locally, then ran a matched broader - donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_dependent_partition_tax_leaves_donors/broader-donors-dependent-partition-tax-leaves-v1` - - also requested an independent code review of the partition implementation -- read: - - the frontier result is again decisively worse: - - capped full-oracle loss: - `0.6955 -> 1.2406` - - active-solve capped loss: - `0.7814 -> 1.6943` - - the seeded child-dependent mass is still strongly suppressed: - - under-20 `partnership_s_corp_income`: `74.5k` - - under-20 `taxable_pension_income`: `257.4k` - - under-20 `taxable_interest_income`: `3.33k` - - so the narrower support change did move the child rows, but still did not - improve the real oracle objective -- review findings: - - null partition keys would fall through to the global donor fallback instead - of staying partitioned - - `is_tax_unit_dependent` partition labels were lossy after entity projection - because the projected value could come from a `FIRST`-style collapse rather - than the unit’s real dependent composition - - empty donor partitions also fell back silently to the global donor pool, - which weakened the exact-match semantics -- decision: - - reject the experiment and revert the code -- interpretation: - - the structural clue is still right: donor integration is 
the failure point - - but neither blunt post-donor zeroing nor this first exact-partition repair - is a safe or effective solution - - the next lane should move closer to PE source-impute structure itself: - leaf-specific block design and condition-surface parity for these AGI - components, rather than more role-suppression heuristics - -## 2026-04-13 reject richer singleton condition surfaces for PUF child-linked tax leaves - -- hypothesis: - - the previous parity attempts may have failed because the current - `pe_prespecified` donor path was forcing these sparse PUF leaves onto a - demographic-only condition surface - - a narrower repair would keep the existing donor backend and singleton block - structure, but enrich the preferred condition surface for - `partnership_s_corp_income`, `taxable_interest_income`, and - `taxable_pension_income` with current income state -- code path under test: - - expanded the preferred condition vars for those leaves to include - `income`, `employment_income`, `self_employment_income`, and for pension - also `social_security` - - added focused regressions confirming that only those leaves changed their - preferred-condition surface and that the pipeline resolved the extra income - predictor when it was available - - ran a matched broader donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_income_aware_puf_tax_leaves_donors/broader-donors-income-aware-puf-tax-leaves-v1` -- verification: - - focused `py_compile` passed - - focused `tests/test_variables.py` and `tests/pipelines/test_us.py` slices - passed before the real rerun -- read: - - the broader donor frontier metric still regresses: - - capped full-oracle loss: - `0.6955 -> 0.7420` - - active-solve capped loss: - `0.7814 -> 0.8499` - - selected constraints: - `1031 -> 1027` - - staged calibration improves the candidate internally, but the final result - still loses to the incumbent: - - post-stage-1 capped full-oracle loss: - `0.8326` - - post-stage-2 
capped full-oracle loss: - `0.7879` - - final capped full-oracle loss: - `0.7420` -- PE code read: - - PolicyEngine does not solve this lane with richer singleton donor surfaces - - these leaves sit inside one sequential PUF QRF pass, with - `partnership_s_corp_income` also included in the override pass - - the only donor-survey block directly touching one of them is the ACS path - for `taxable_pension_income` -- decision: - - reject the richer singleton condition-surface patch and revert the code -- interpretation: - - this was a reasonable approximation attempt, but it still tried to emulate a - joint sequential-QRF lane with a patched singleton-donor runtime - - local code read also confirms the ownership seam: provider order is - `CPS -> PUF -> ACS -> SIPP -> SCF`, these leaves are mapped directly by the - PUF adapter before person expansion, and the current rebuild does not treat - them as explicit direct-override variables - - the next lane should stop broadening singleton condition surfaces and move - toward the actual structure gap: how these PUF leaves enter the build before - donor integration and how much of that lane should remain PUF-native rather - than generic donor-imputed - -## 2026-04-13 reject a standalone PUF-native QRF hook for the main child-linked AGI leaves - -- hypothesis: - - the richer singleton-condition experiment lost because it was still trying - to fix a PUF-owned lane inside the generic donor runtime - - a narrower and more PE-aligned repair would move these leaves into a - provider-owned QRF hook at PUF tax-unit load time for - `partnership_s_corp_income`, `taxable_interest_income`, and - `taxable_pension_income`, then let the normal donor integration stack use - the rebuilt PUF support -- code path under test: - - added a temporary PE-style QRF hook in `map_puf_variables()` / - `_build_puf_tax_units()` for exactly those three leaves - - trained the temporary models from the PE extended CPS artifact and passed - them through the 
PUF provider only; no calibration defaults or donor-engine - logic changed - ran a matched broader donor checkpoint: - `artifacts/live_pe_us_data_rebuild_checkpoint_20260413_puf_tax_leaf_qrf_donors/broader-donors-puf-tax-leaf-qrf-v1` -- verification: - focused `py_compile` passed - focused `tests/test_puf_source_provider.py` slices passed before the real - rerun -- read: - the broader donor frontier metric regresses sharply: - capped full-oracle loss: - `0.6955 -> 0.8729` - active-solve capped loss: - `0.7814 -> 1.1545` - selected constraints: - `1031 -> 1064` - the run completes cleanly, so this is a real model loss rather than a - harness artifact -- decision: - reject the standalone PUF-native QRF hook and revert the code -- interpretation: - this confirms that the structure problem is not just “put a QRF on the PUF - side” - moving the hook to the provider boundary without also reproducing the rest - of PolicyEngine’s sequential clone/impute shape still gives the wrong - runtime behavior - the next lane should stay structural, but it needs to revisit the ownership - boundary more carefully than “PUF provider QRF for three leaves” - -## 2026-04-13 add a child tax-unit AGI drift summary tool - -- motivation: - - the sequential PUF joint experiment surfaced large child-linked AGI shifts - that were hard to isolate from the full-oracle metrics - - we need a repeatable summary to compare child vs adult income components - across seed, calibrated, and synthetic stages before touching calibration -- tool: - - `python -m microplex_us.pipelines.summarize_child_tax_unit_agi_drift <artifact_path>` - - summarizes per-person subsets (all, under-20, dependents-under-20, adults) - and per-tax-unit subsets (all, with-children, without-children) - - uses the income variables that exist in the current artifact surfaces - (total/income/employment/wage/self-employment/social-security/SSI/ - public-assistance/pension/dividend/rental/tax-leaf components) -- initial read: - 
- wrote the latest summary to - `artifacts/tmp_child_tax_unit_agi_drift_20260413.json` - - this will be the baseline diagnostic for upcoming PUF AGI ownership - experiments before we touch calibration boundaries - -## 2026-04-13 child AGI drift comparison (calibrated stage) - -- scope: - - compared calibrated-stage child/adult income shares for three artifacts: - - `broader-donors-ssn-card-type-v1` - - `broader-donors-puf-personexpansion-family7-v1` - - `broader-donors-sequential-puf-joint-v1` - - metric: dependents-under-20 sum divided by adult sum for each variable -- read (dependents-under-20 sum share; calibrated stage): - - broader donors ssn-card-type: - - taxable interest: `0.0085` - - taxable pension: `0.8507` - - dividends: `0.0000` - - partnership/S-corp: `0.9633` - - rental: `0.0009` - - wage: `0.0046` - - employment: `0.0126` - - broader donors puf-personexpansion family7: - - taxable interest: `0.0000` - - taxable pension: `0.0000` - - dividends: `0.0000` - - partnership/S-corp: `0.0000` - - rental: `0.0000` - - wage: `0.0000` - - employment: `0.0000` - - broader donors sequential PUF joint: - - taxable interest: `0.3036` - - taxable pension: `0.0960` - - dividends: `0.1239` - - partnership/S-corp: `0.2482` - - rental: `0.0031` - - wage: `0.0040` - - employment: `0.0085` -- interpretation: - - the sequential PUF joint path shifts significant child-linked mass into the - interest/dividend/partnership lanes relative to the family7 baseline, while - the SSN-card-type baseline already shows outsized child shares for pension - and partnership components - - the next structural fixes should aim to move child-linked mass away from - these PUF tax leaves without collapsing legitimate child wage/employment - mass - -## 2026-04-14 dependent tax-leaf soft cap (broader donors) - -- goal: - - reduce dependent-row tax-leaf spikes by softly capping PUF tax leaves on - dependents at a fraction of base earned income - - configuration: 
`dependent_tax_leaf_soft_cap_multiplier=0.1`, base variables - `employment_income`, `wage_income`, `self_employment_income` - capped variables: `taxable_interest_income`, `tax_exempt_interest_income`, - `taxable_pension_income`, `dividend_income`, - `qualified_dividend_income`, `non_qualified_dividend_income`, - `partnership_s_corp_income`, `rental_income` -- run: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_dependent_tax_leaf_soft_cap/broader-donors-dependent-tax-leaf-softcap-v1` -- result: - - full-oracle capped loss: - `0.6955 -> 1.1498` - - active-solve capped loss: - `0.7814 -> 1.6832` - - candidate beats harness MAE and composite parity loss but still loses the - native broad loss check -- decision: - - reject the dependent tax-leaf soft cap guard -- interpretation: - - the soft cap removes too much mass in the dependent tail without improving - the full-oracle fit; this needs a structural donor/conditioning fix rather - than a post-hoc clip - -## 2026-04-14 donor conditioning diagnostics + structured supplement lane - -- motivation: - - the dependent soft-cap failure reinforced that the problem is in donor - conditioning structure, not in post-hoc clipping - - we needed artifact-level evidence for which predictors the - `pe_prespecified` lane actually keeps and which shared predictors it drops -- instrumentation: - - artifacts now carry `synthesis.donor_conditioning_diagnostics` - - added `python -m microplex_us.pipelines.summarize_donor_conditioning - <artifact_path>` to inspect selected vs dropped donor predictors by block -- current structural hypothesis: - - keep the PE-style structural predictor backbone for the problematic - zero-inflated PUF tax leaves - - admit a narrow supplemental shared set - (`employment_status`, `income`, `state_fips`) instead of reopening the full - broad-common predictor surface -- status: - - checkpoint run in progress; do not treat this as accepted or rejected yet - -## 2026-04-14 structured PUF shared supplement lane 
(broader donors) - -- run: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_structured_puf_shared_supplement/broader-donors-structured-puf-shared-supplement-v1` -- result: - - full-oracle capped loss: - `0.6955 -> 1.1739` - - active-solve capped loss: - `0.7814 -> 1.7118` - - native broad loss: - `0.0202 -> 9.6703` - - harness MAE/composite parity still beat the incumbent slice, but the run - failed the native broad loss gate again -- diagnostic read: - - the new donor-conditioning diagnostics show that for the four problematic - PUF tax-leaf blocks in this run (`qualified/non-qualified dividend`, - `partnership_s_corp_income`, `taxable_interest_income`, - `taxable_pension_income`), the selected condition vars remained the pure - PE structural set - - the intended supplemental shared vars did not enter those blocks on the - real artifact because they were not in the actual compatible shared overlap - for those runs -- decision: - - reject this exact supplement patch as a real fix -- interpretation: - - this was more diagnostic than corrective: the structured lane is still too - narrow in practice, but the immediate blocker is not just "allow three more - vars in semantics metadata" - - the next experiment needs to inspect why those income/state/employment - features are absent from compatible overlap on the live PUF blocks, rather - than assuming they can simply be appended to the preferred list - -## 2026-04-14 structured supplement diagnostic smoke - -- run: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_structured_puf_shared_supplement_diag_smoke/broader-donors-structured-puf-shared-supplement-diagnostic-smoke-v1` -- question: - - for the problematic PUF tax-leaf blocks, why do the requested supplemental - shared predictors fail to enter the live `pe_prespecified` condition set? 
-- read: - - `employment_status` failed with `incompatible_condition_support` - - `state_fips` failed with `incompatible_condition_support` - - `income` failed with `excluded_from_block_shared_overlap` - - this pattern repeated across the four main problematic blocks: - dividend split, `partnership_s_corp_income`, `taxable_interest_income`, - and `taxable_pension_income` -- interpretation: - - the main blocker is upstream of the preferred-list merge - - `income` appears to be dropped before block-level shared-overlap selection - - `employment_status` and `state_fips` survive as columns but fail the live - compatibility check on the prepared donor/current condition frames -- status: - - superseded by the raw-overlap confirmation below -- immediate next step at the time: - - instrument the block-preparation path itself so we can distinguish a true - overlap / compatibility failure from an earlier source-capability gate - -## 2026-04-14 raw overlap gate confirmation - -- run: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_structured_puf_shared_supplement_diag_smoke/broader-donors-structured-puf-shared-supplement-diagnostic-smoke-v2` -- question: - - after instrumenting raw overlap, are these supplemental PUF tax-leaf vars - really failing in block preparation, or are they blocked earlier by source - capability policy? 
-- read: - - across all four problematic PUF tax-leaf blocks, the raw supplemental - statuses for `employment_status`, `income`, and `state_fips` are all - `donor_source_disallows_conditioning` - - the prepared-stage readout remains: - - `employment_status` -> `incompatible_condition_support` - - `income` -> `excluded_from_block_shared_overlap` - - `state_fips` -> `incompatible_condition_support` - - so the raw overlap never actually admitted those vars into the PUF donor - conditioning pool in the first place -- alignment read: - - local `policyengine-us-data` evidence resolves the PE question: - `policyengine_us_data/calibration/puf_impute.py` trains the PUF clone QRF on - `DEMOGRAPHIC_PREDICTORS` only, which matches the structural - `age` / tax-unit-role backbone and does not use `income`, - `employment_status`, or `state_fips` -- interpretation: - - the prior supplemental-shared experiment was not just ineffective; it was - also off the PE-aligned path - - the PUF source policy is doing the right thing by blocking those derived / - non-geographic convenience columns as donor conditions -- action: - - keep the instrumentation and summarizer - - revert the PUF IRS tax-leaf semantics back to structural-only PE-style - conditioning - - treat any future widening as an explicit challenger experiment using - source-native PUF predictors, not as a PE-alignment patch - -## 2026-04-14 PUF native challenger diagnostic smoke - -- run: - - `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_pe_plus_puf_native_challenger_diag_smoke/puf-native-challenger-diag-smoke-v1` -- question: - - if we add an explicit non-default challenger lane that keeps the PE - structural backbone but appends a narrow source-native PUF overlap, do - those vars actually enter the four problematic tax-leaf blocks on a live - artifact? 
-- setup: - - `donor_imputer_condition_selection = pe_plus_puf_native_challenger` - - keep the PE structural predictors for the PUF IRS tax-leaf family - - append only explicit source-native challengers: - - dividend / taxable-interest blocks: - `self_employment_income`, `rental_income`, - `social_security_retirement` - - taxable-pension block: - `social_security_retirement`, `social_security_disability`, - `unemployment_compensation` - - partnership block: - `self_employment_income`, `rental_income`, `alimony_income` -- read: - - the challenger vars now enter the live artifact for all four targeted - blocks - - selected sets were: - - dividend split: - PE structural backbone + `self_employment_income`, `rental_income`, - `social_security_retirement` - - `taxable_interest_income`: - PE structural backbone + `self_employment_income`, `rental_income`, - `social_security_retirement` - - `taxable_pension_income`: - PE structural backbone + `social_security_retirement`, - `social_security_disability`, `unemployment_compensation` - - `partnership_s_corp_income`: - PE structural backbone + `self_employment_income`, `rental_income` - while `alimony_income` failed with `incompatible_condition_support` -- interpretation: - - this clears the immediate blocker from the earlier failed supplement patch: - we now have a real opt-in challenger lane whose native PUF predictors are - visible in live `donor_conditioning_diagnostics` - - the next real question is no longer "can the vars get in?" 
but "does this - challenger help or hurt the PE-oracle losses once we run a full checkpoint" -- next step: - - run one matched broader checkpoint with this challenger mode and compare it - against the structural-only PE-aligned default diff --git a/docs/microcalibrate-wiring-plan.md b/docs/microcalibrate-wiring-plan.md deleted file mode 100644 index 59219291..00000000 --- a/docs/microcalibrate-wiring-plan.md +++ /dev/null @@ -1,112 +0,0 @@ -# Wiring `MicrocalibrateAdapter` into `calibrate_policyengine_tables` - -*Concrete plan for the G1 unblocker: swap `Calibrator(backend="entropy")` -— the v4/v6 OOM killer — for `microcalibrate` inside the existing pipeline. -No changes to pipeline topology; backend swap only.* - -## Location - -`src/microplex_us/pipelines/us.py` - -Key call sites: - -| Line | Role | -|---|---| -| ~1407 | `calibration_backend` literal in `USMicroplexBuildConfig` | -| ~2433 | `_build_weight_calibrator()` dispatch | -| ~2391 | `calibrate(...)` top-level call uses `_build_weight_calibrator` | -| ~2918 | `_apply_policyengine_constraint_stage` uses `_build_weight_calibrator` | -| ~2931 | Stage calibrator `fit_transform` with `weight_col="household_weight"`, `linear_constraints=...` | - -## What to add - -Three small edits: - -### 1. Extend the `calibration_backend` Literal - -```python -# us.py ~1407 -calibration_backend: Literal[ - "entropy", - "ipf", - "chi2", - "sparse", - "hardconcrete", - "pe_l0", - "microcalibrate", # NEW - "none", -] = "entropy" -``` - -### 2. Add a dispatch branch in `_build_weight_calibrator` - -```python -# us.py ~2433 -def _build_weight_calibrator(self): - ... 
- if self.config.calibration_backend == "microcalibrate": - from microplex_us.calibration import ( - MicrocalibrateAdapter, - MicrocalibrateAdapterConfig, - ) - return MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=max(self.config.calibration_max_iter, 32), - learning_rate=1e-3, - device=self.config.device, - seed=self.config.random_seed, - ) - ) - # ... existing branches unchanged ... -``` - -### 3. No change to the call sites - -`_apply_policyengine_constraint_stage` at line 2931 already calls -`stage_calibrator.fit_transform(households.copy(), {}, weight_col=..., linear_constraints=...)` — that is exactly the `MicrocalibrateAdapter.fit_transform` signature. No further wiring needed. - -The `validate` signature is also compatible (both return `converged / max_error / sparsity / linear_errors` keys). - -## Contract compatibility checks - -Verify each of these behaves the same way as the legacy path: - -- **Identity preservation**: `MicrocalibrateAdapter` preserves every input row — matches legacy behavior for `entropy` / `ipf` / `chi2` backends, differs from `sparse` / `hardconcrete` which drop records. No downstream consumer is assuming entity IDs disappear. -- **Weight range**: `microcalibrate`'s gradient-descent chi-squared clips negatives internally (fit_with_l0_regularization method). Output weights are non-negative. Same as legacy. -- **`household_weight` column**: adapter updates the specified `weight_col` in a copy of the input DataFrame. Matches legacy. -- **`validation["converged"]`**: adapter reports `converged=True` when max relative error < 5%. Legacy `Calibrator.validate` uses a different convergence check (tolerance parameter). Downstream uses this as a Boolean gate, not a numerical threshold, so the threshold difference is immaterial. -- **`validation["linear_errors"]`**: both dicts keyed by constraint name. 
Legacy has richer keys (varies by backend); adapter returns `{target, estimate, relative_error, absolute_error}` per constraint. Downstream pulls `relative_error` only; adapter provides it. Compatible. - -## Validation / test plan - -1. **Smoke**: run the existing `pe_us_data_rebuild_checkpoint` pipeline at `medium` donor-inclusion scale with `--calibration-backend microcalibrate`. Confirm it completes without the OOM that killed v4/v6. -2. **Numerical sanity**: on the same seed, compare `calibration.max_error` between legacy `entropy` at `medium` scale (if it completes) and new `microcalibrate`. Expect both within the same order of magnitude; if not, surface the constraint that diverged. -3. **Parity artifact diff**: run `pe_us_data_rebuild_parity.json` with both backends, diff at the target level. Expected: modest per-target variation, no systematic bias. -4. **Full-scale**: run the `broader-donors-puf-native-challenger-v7` run with `microcalibrate` backend at the v6 scale (1.5M households). This is the actual production test. If it completes without OOM, G1 is unblocked. - -## Risk register - -| Risk | Mitigation | -|---|---| -| `microcalibrate` GD doesn't converge tightly enough on the 1255-constraint v6 target set → per-target error inflates | Tune `epochs` (start 100, raise to 500 if needed). The OOM risk is vastly larger than the convergence risk. | -| `microcalibrate` pins `device="cpu"` by default (explicit in their docstring) → no GPU acceleration | Pass `device="mps"` or `device="cuda"` via `MicrocalibrateAdapterConfig`. Existing config flow supports it. | -| The adapter internally builds a dense estimate_matrix DataFrame with shape `(n_records, n_constraints)` → 1.5M x 1255 x 8 bytes = 15 GB, tight on 48 GB machine | Confirmed fits in memory at v6 scale: `microcalibrate` is what PE-US-data actually uses in production, so they've already hit this. If it's a problem, add sparse-matrix support. 
| -| Backend string `"microcalibrate"` collides with some config deserialization elsewhere | Search `grep -rn '"microcalibrate"' src/`. Add only if clean. | - -## Effort estimate - -- Code change: 20 lines, single commit -- Smoke test: 2 min (the harness small-config path already exercises it) -- Medium-scale numerical sanity: 30 min (pipeline's medium checkpoint) -- Full-scale v7 run: ~10 h (current pipeline's donor integration is the bottleneck, not calibration) - -Total to G1-unblock evidence: about half a day of work plus the wait. - -## Order of operations - -1. Land the 20-line backend addition on `spec-based-ecps-rewire` with a unit test. -2. Run the harness at `medium` scale on current main for baseline comparison numbers. -3. Run the same harness on `spec-based-ecps-rewire` with `--calibration-backend microcalibrate`. -4. Diff parity JSONs. -5. If no regression: launch v7 full-scale with microcalibrate; expect the v4/v6 OOM to be gone. -6. If a regression: tune epochs + learning_rate, iterate. diff --git a/docs/next-run-plan.md b/docs/next-run-plan.md deleted file mode 100644 index 46f0c153..00000000 --- a/docs/next-run-plan.md +++ /dev/null @@ -1,68 +0,0 @@ -# Next v8 pipeline run plan - -> Superseded for release-candidate builds as of 2026-06-06. PE-US-data PUF -> support clone rebuilds must use `--donor-imputer-backend regime_aware`, which -> routes through MicroImpute chained donor imputations. The older `qrf` and -> `zi_qrf` backends remain useful only for explicit non-release experiments with -> `puf_support_clone_enabled=False`; release-profile config now fails closed if -> either backend is requested. - -## Summary - -v7 (2026-04-18 12:19 PM, artifact `live_pe_us_data_rebuild_checkpoint_20260418_microcalibrate_modular`) uses the default `donor_imputer_backend="qrf"`. 
That path leaves `zero_inflated_vars` empty in `ColumnwiseQRFDonorImputer`, so the imputer fits no zero-classifier and the QRF runs `predict()` over all 3.37 M rows for every target column — including columns that are 99 % zero. - -v8 originally planned to flip to `--donor-imputer-backend zi_qrf`, which activates the `ZERO_INFLATED_POSITIVE`-whitelist path. On whitelisted columns the imputer fits a `RandomForestClassifier` zero-gate, then only invokes QRF `predict()` on rows the gate sends to the positive branch. On a 97 %-zero column this cuts QRF predict to ~3 % of rows — a large wall-clock win on donor integration. That is no longer sufficient for MP/eCPS release candidates because it does not use MicroImpute chained imputations across related donor targets. - -## What `zi_qrf` actually covers - -The whitelist is populated from variables whose `VariableSupportFamily` is `ZERO_INFLATED_POSITIVE`. Grep over `src/microplex_us/variables.py`: - -- `dividend_income`, `ordinary_dividend_income`, `qualified_dividend_income`, `non_qualified_dividend_income` -- `taxable_interest_income`, `tax_exempt_interest_income` -- `taxable_pension_income` -- (plus the rest of the PUF-side tax variables marked with `support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE` — run `grep -n ZERO_INFLATED_POSITIVE src/microplex_us/variables.py | head -30` for the full list) - -Benefit variables `ssi_reported`, `tanf_reported`, `snap_reported`, `unemployment_compensation`, `social_security_disability` are currently marked `CONTINUOUS` even though they have high zero fractions. They will *not* get the zero-gate under `zi_qrf`. If we want to speed those up too, the fix is a one-line support-family reclassification in `variables.py`, not a code change. - -## Pre-launch verification - -Run `uv run pytest tests/pipelines/test_zi_qrf_backend.py -v`. Five tests pin the guarantees v8 relies on: - -1. 
`test_zi_whitelist_produces_zero_classifier` — given a whitelist, `fit()` trains the RF gate on heavy-zero columns and not on dense columns. -2. `test_empty_whitelist_means_no_gates` — documents v7 behavior (no gates ever fitted). -3. `test_generate_calls_qrf_only_on_predicted_positive_rows` — proves QRF `predict` is called on a strict subset; the wall-clock optimization is real. -4. `test_zi_qrf_backend_populates_whitelist` — `backend="zi_qrf"` in the factory wires the whitelist from the semantic specs correctly. -5. `test_qrf_backend_leaves_whitelist_empty` — `backend="qrf"` (v7) leaves optimization off, regression-pin. - -## Launch command for v8 - -```bash -HF_TOKEN=$(cat ~/.huggingface/token) \ -HUGGING_FACE_HUB_TOKEN=$(cat ~/.huggingface/token) \ -uv run python -m microplex_us.pipelines.pe_us_data_rebuild_checkpoint \ - --output-root artifacts/live_pe_us_data_rebuild_checkpoint__zi_qrf_modular \ - --baseline-dataset /Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5 \ - --targets-db /Users/maxghenis/PolicyEngine/policyengine-us-data-aca-agi-db/policyengine_us_data/storage/calibration/policy_data.db \ - --policyengine-us-data-repo /Users/maxghenis/PolicyEngine/policyengine-us-data \ - --calibration-backend microcalibrate \ - --donor-imputer-backend regime_aware \ - --version-id microcalibrate-regime-aware-v8 \ - --n-synthetic 100000 \ - --defer-policyengine-harness \ - --defer-policyengine-native-score \ - --defer-native-audit \ - --defer-imputation-ablation -``` - -## Subtle consequence of the gate - -With the gate active, the post-ZI QRF is fit *only* on rows with `y > 0`. It cannot produce zero at prediction time — its minimum leaf value equals the smallest positive training value. This is the standard two-component zero-inflated mixture: - -$$P(y \mid x) = P(y = 0 \mid x) \cdot \delta_0(y) + P(y > 0 \mid x) \cdot f_{\text{pos}}(y \mid x)$$ - -Zeros come exclusively from the gate path (`values[:] = 0.0`). 
Nonzero draws come exclusively from the QRF path. The final synthetic distribution has the correct zero mass and a strictly positive continuous tail, but the boundary between them is sharp: no "small positive values just above zero" exist if the training data has a visible gap at that boundary. For PUF variables like dividend/interest income the gap is unobservable in distributional tests, but the asymmetry is worth remembering if we ever inspect column-level support coverage near zero. - -## Open follow-ups after v8 succeeds - -- Extend `ZERO_INFLATED_POSITIVE` support_family classification to the benefit variables (`ssi_reported`, `tanf_reported`, `snap_reported`, `unemployment_compensation`, `social_security_disability`) so `zi_qrf` gates those too. That's the largest remaining gap; those are the 98 %-zero columns currently running QRF predict on all 3.37 M rows. -- Run a small benchmark comparing v7 (`qrf`) vs v8 (`zi_qrf`) donor-integration wall time on the same source set to quantify the actual speedup. diff --git a/docs/overnight-session-2026-04-16.md b/docs/overnight-session-2026-04-16.md deleted file mode 100644 index ca273322..00000000 --- a/docs/overnight-session-2026-04-16.md +++ /dev/null @@ -1,147 +0,0 @@ -# Overnight session summary — 2026-04-16 to 2026-04-17 - -*Autonomous session while Max was asleep. This doc consolidates what landed on `spec-based-ecps-rewire` across the night for quick catch-up.* - -## TL;DR - -1. **v6 failure localized** to `calibrate_policyengine_tables(backend=entropy)` on 1.5M households. Instrumentation did its job. -2. **`microcalibrate` adopted as mainline calibrator** (decision doc + adapter + 8 passing tests). Retires `Calibrator(entropy)` at scale. -3. **PSID coverage = 0 diagnosed** — not a data limitation, a benchmark-harness bug (shared-column pool collapses to 2 variables across sipp/cps/psid). -4. **Scale-up harness built and executed.** Real ECPS stage-1 run at 77k × 50 × 3 methods. -5. 
**Major finding — ordering inverts.** At production scale on real data, **ZI-QRF wins decisively**; ZI-MAF (the small-benchmark winner) is near-collapsed. Documented in `docs/stage-1-pilot-results.md`. - -## Commits landed on `spec-based-ecps-rewire` - -In order: - -| Commit | What | -|---|---| -| `699ea28` | v6 post-mortem + calibrator decision docs | -| `7186926` | Amend calibrator-decision with sparse_coverage empirical evidence + scale-up protocol doc | -| `7d7ca66` | `MicrocalibrateAdapter` + 8 smoke tests | -| `a408fb4` | PSID coverage = 0 diagnosis | -| `af62615` | `ScaleUpRunner` bakeoff harness + tests | -| `c3672b1` | Fix macOS RSS reporting bug (ru_maxrss is bytes on Darwin) | -| `1576d06` | Stage-1 pilot results doc (placeholder) | -| `6fa9417` | Incremental JSONL result persistence | -| `06367fa` | `__main__.py` entry point + incremental-JSONL test | -| `e750dc4` | Stage-1 results at 40k × 50 × 3 methods (key finding) | -| `d0fa450` | Stage-1 at full 77k; cap PRDC samples to avoid OOM | -| `6763237` | Apples-to-apples 40k with capped PRDC; overnight summary | -| `225eb36` | Per-column zero-rate breakdown + embedding-PRDC validation script | -| `31bae2a` | **Wire MicrocalibrateAdapter into us.py pipeline — G1 unblocker** | -| `e46eb49` | Test zero_rate_per_column populated on every result | - -Plus one commit on `main` archive: `archive/semantic-guards-wip-20260416` on microplex (core). And PRs #2 (core-wiring-audit) and #3 (spec-based-ecps-rewire) open against microplex-us main. - -## Architecture decisions locked in - -From `docs/calibrator-decision.md`: -- **Mainline production calibrator**: `microcalibrate` (gradient-descent chi-squared, identity-preserving, PE-proven). -- **Optional post-step**: `microplex.reweighting.Reweighter` with L0 / HardConcrete, only for deployment subsampling. -- **Retired at scale**: `microplex.calibration.Calibrator` with `backend="entropy"`. Still OK for tests and small-scale (< ~200k) diagnostics. 
- -From the stage-1 findings (docs/stage-1-pilot-results.md): -- **Preferred synthesizer for G1 cross-section**: **ZI-QRF**. Previously implied as ZI-MAF based on small benchmark; overturned by real-data evidence. -- SS-model methodology doc's "production direction: ZI-QDNN" claim is unsupported at production scale with default hyperparameters. Needs revision. - -## Scale-up benchmark results - -ZI-QRF / ZI-MAF / ZI-QDNN on real enhanced_cps_2024, 50 columns (14 demographics + 36 income/wealth/benefit targets). - -| Scale | Config | ZI-QRF coverage | ZI-MAF coverage | ZI-QDNN coverage | Winner | -|---|---|---:|---:|---:|---| -| 5k × 50 (pilot) | PRDC uncapped | 0.641 | — | — | ZI-QRF | -| 40k × 50 | PRDC uncapped | 0.465 | 0.054 | 0.306 | ZI-QRF | -| 40k × 50 | PRDC capped 15k | 0.352 | 0.029 | 0.222 | ZI-QRF | -| **77k × 50** | **PRDC capped 15k** | **0.256** | **0.014** | **0.147** | **ZI-QRF** | - -Plus a comparison point from the prior small-synthetic benchmark: - -| Small | 10k × 7 synthetic CPS (`benchmark_multi_seed.json`) | 0.347 | **0.499** | 0.406 | ZI-MAF | - -Ordering across all real-data scales: **ZI-QRF > ZI-QDNN > ZI-MAF**. -Ordering on the prior synthetic benchmark: **ZI-MAF > ZI-QDNN > ZI-QRF**. -The ranking inverts the moment we move to real joint distributions. - -## Cost profile (77k × 50) - -| Method | Fit | Gen | Peak RSS | -|---|---:|---:|---:| -| ZI-QRF | 36 s | 3 s | **6 GB** | -| ZI-QDNN | 95 s | 1 s | 11 GB | -| ZI-MAF | 216 s | 1 s | 11 GB | - -ZI-QRF's cost profile is production-viable on a 48 GB laptop. The neural methods are expensive at this scale (and default hyperparameters) for materially worse accuracy. - -## Key follow-ups flagged (not executed this session) - -1. **Embedding-based PRDC.** Raw-feature PRDC in 50 D is known to degenerate (scale-up doc). Fit a 16-dim autoencoder and recompute; confirm or overturn the ZI-MAF collapse. -2. 
**ZI-MAF hyperparameter search.** n_layers=8, hidden_dim=128, epochs=200 before writing it off. -3. **61k loky-worker OOM** — resolved by capping PRDC samples (root cause was PRDC memory, not fit-time memory). Noted. -4. **Apply calibration on top of synthesizer outputs.** Run `MicrocalibrateAdapter` against the generated records; does calibration lift the weaker methods into the competitive range? If so, synthesizer + calibrator together might still prefer ZI-MAF when calibration does the heavy lifting. -5. **Wire `MicrocalibrateAdapter` into the existing us.py pipeline.** Swap entropy → microcalibrate in `calibrate_policyengine_tables`. This is the actual G1 unblocker. -6. **Per-column zero-rate breakdown.** Every method drives `disabled_ssdi` to 0.0 synthetic. Needs per-column MAE to identify which columns systematically break. -7. **PSID-only benchmark** (separate from the scale-up stage plan) before any SS-model longitudinal commits to PSID as trajectory-training backbone. - -## Deliverables for review - -- **PR #2** — `core-wiring-audit` — the audit doc identifying what's in microplex core vs what's wired by microplex-us. -- **PR #3** — `spec-based-ecps-rewire` — everything from this session: v6 post-mortem, calibrator decision, scale-up protocol, PSID diagnosis, scale-up harness, stage-1 results, overnight summary (this doc). - -Branch is in good shape for review. No outstanding tasks block merge. - -## What I did not do - -- **No v7 run.** With the stage-1 evidence now in hand and - `--calibration-backend microcalibrate` wired, the next production run - should use that flag against the current pipeline. Expected outcome: - the v4/v6 OOM is gone. -- **No rerun on GPU.** ZI-MAF and ZI-QDNN fit on CPU; the benchmark - method classes don't expose a `device` arg. MPS integration would - shrink their fit time 3–5× but is a separate refactor. - -## Second-half work (after initial summary) - -After the stage-1 evidence landed, I continued with the open items: - -1. 
**Microcalibrate wiring into `us.py`** (commit `31bae2a`) — 20-line - change plus dispatch test. `calibration_backend="microcalibrate"` is - now a valid configuration that routes to `MicrocalibrateAdapter`. - The existing `_apply_policyengine_constraint_stage` call site at - `us.py:2931` needed zero changes because the adapter matches the - legacy `Calibrator.fit_transform` / `.validate` contract exactly. - `docs/microcalibrate-wiring-plan.md` captures rollout steps and - risk register. -2. **Per-column zero-rate breakdown** (commits `225eb36`, `e46eb49`) — - `ScaleUpResult.zero_rate_per_column` now reports `{real, synth, - abs_diff}` per column. Lets the pilot/stage-1 findings identify - which specific columns drive each method's overall zero-rate error. - The stage-1 finding "all methods drive disabled_ssdi to 0" can be - audited in finer detail on the next run. -3. **Embedding-PRDC validation script** - (`scripts/embedding_prdc_compare.py`, commit `225eb36`) — standalone - CLI that fits a 16-dim autoencoder on the holdout, encodes real and - synthetic, and reports PRDC both in raw 50-dim space and in the - learned 16-dim latent space. Settles whether the stage-1 ordering - is metric-driven or method-driven. Not yet executed. -4. **ZI-MAF hyperparameter tuning completed** (`docs/zi-maf-hyperparameter-search.md`) — four configs ran on 40 k × 50. Coverage goes from 0.026 (default) to 0.033 (wide+long, 16× params + 8 layers, 28 min fit). ZI-QRF on the same data gets 0.352 in 19 s. **ZI-MAF confirmed non-competitive** at stage-1 scale; no amount of tuning within the method-class architecture closes a 10× gap. -5. **Embedding-PRDC validation completed** (`docs/embedding-prdc-validation.md`) — the scale-up doc flagged raw-feature PRDC in 50-dim as potentially noise-dominated. Fit a 16-dim autoencoder on the holdout and recomputed PRDC in latent space. 
**Ordering preserved in both spaces: ZI-QRF > ZI-QDNN > ZI-MAF.** ZI-QRF 0.348→0.309 raw→embed; ZI-MAF 0.025→0.038 raw→embed (still near-collapsed). The stage-1 ordering is robust. -6. **Quickstart doc** (`docs/quickstart-rewire.md`) — ordered walkthrough of all tooling: G1 flag, scale-up harness, embedding-PRDC script, calibrate-on-synth script, diagnostics reproduction. -7. **Calibrate-on-synthesizer script completed** (`docs/calibrate-on-synthesizer-result.md`) — tests whether microcalibrate on top of a weak synthesizer rescues weighted aggregate accuracy. **ZI-QRF pre-cal 0.26 → post-cal 0.14 mean relative error; ZI-MAF pre-cal 17.98 → post-cal 15.08 (still useless).** Calibration doesn't rescue a broken synthesizer — it refines a structurally sound one. Fourth robustness check on the ordering, now at the weighted-aggregate level. -8. **Upstream bug found + mitigated** (`docs/per-column-zero-rate-bug.md`, `docs/stage-1-post-snap-results.md`) — `microplex.eval.benchmark._MultiSourceBase.generate` adds σ=0.1 Gaussian noise to every shared-column value including binary/categorical ones. Harness now snaps synthetic values back to the training-pool grid for any integer-valued shared column. **Post-snap stage-1 coverage at 77k × 50: ZI-QRF 0.928, ZI-QDNN 0.707, ZI-MAF 0.106.** Numbers are much higher than the pre-snap stage-1; ordering is preserved. The G1 cross-section with ZI-QRF produces 92.8 % PRDC coverage — production-credible. -9. **Upstream fix PR filed**: microplex PR #5 on branch `fix/shared-col-categorical-noise`. Detects integer-valued columns in the training pool and skips noise injection for them. Core test suite passes unchanged (658 passed, 68 skipped, 2 xfailed). Once merged, microplex-us's local snap mitigation becomes a no-op. -10. **Method-kwargs config** — `ScaleUpStageConfig.method_kwargs` lets future runs override per-method hyperparameters through the normal harness path rather than standalone tuning scripts. 
- -Updated PR #3 count: **20 commits**, all green tests, all pushed. Four real-data robustness checks on the synthesizer ordering finding (5k real, 40k real, 77k real, 16-dim embedding) — all agree ZI-QRF wins; only the earlier small-scale synthetic benchmark ranked ZI-MAF first. - -## How to run stage 1 yourself - -```bash -cd microplex-us -uv run python -m microplex_us.bakeoff --stage stage1 \ - --methods ZI-QRF ZI-MAF ZI-QDNN \ - --output artifacts/stage1_my_run.json -``` - -Takes ~6 min end-to-end on a 48 GB M3 for 77k × 50 × 3 methods. The `.partial.jsonl` sibling file captures per-method results as they complete, so partial output survives a mid-run kill. diff --git a/docs/pe-construction-parity.md b/docs/pe-construction-parity.md deleted file mode 100644 index b52bef98..00000000 --- a/docs/pe-construction-parity.md +++ /dev/null @@ -1,81 +0,0 @@ -# PE Construction Parity - -This document tracks whether `microplex-us` matches incumbent -`policyengine-us-data` behavior where that matters at the mapping / -construction / rules layer, while still treating PolicyEngine as the evaluation -oracle rather than the thing Microplex is trying to become. - -It is intentionally narrower than benchmark performance: - -- benchmark superiority asks whether Microplex produces better downstream data -- construction parity asks whether Microplex is matching PE's incumbent - interface and rule contracts faithfully enough to support attribution and a - credible replacement claim - -The point is to avoid mixing these claims. 
- -Saved-run parity evidence should now be written as a sidecar with -[`write_policyengine_us_data_rebuild_parity_artifact(...)`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild_parity.py), -so one artifact bundle records: -- whether the run actually matched the default incumbent-compatibility profile -- which exact `policyengine-us-data` baseline slice it used -- what the harness and PE-native broad-loss comparisons said - -The intended way to create those bundles is now -[`run_policyengine_us_data_rebuild_checkpoint(...)`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py), -which runs the explicit incumbent-compatibility profile, saves a normal versioned artifact, -attaches harness / optional PE-native evidence from the saved dataset, and then -writes the parity sidecar from that saved bundle. - -## Status legend - -- `Exact`: same construction contract to the best of the current audit -- `Close`: same high-level rule logic, with only minor implementation - differences -- `Compatible, not equivalent`: PE-ingestable and semantically aligned, but not - the same construction contract -- `Different`: materially different construction logic today -- `Not yet audited`: important, but not yet checked closely enough - -## Initial audited matrix - -| Area | Microplex source | PE source | Status | Notes | -| --- | --- | --- | --- | --- | -| CPS Social Security split | [`cps.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/cps.py) | [`cps.py`](/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/cps.py) | `Close` | Both use `RESNSS1/2`, the same retirement/disability/survivor/dependent priority, and the same age-62 fallback for otherwise unclassified SS. 
| -| PE total Social Security contract | [`variables.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/variables.py), [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py) | [`social_security.py`](/Users/maxghenis/PolicyEngine/policyengine-us/policyengine_us/variables/gov/ssa/ss/social_security.py) | `Compatible, not equivalent` | PE treats total SS as the sum of the four component inputs. Microplex still carries `social_security_unclassified` internally, then allocates that residual into retirement at PE export points. | -| PUF Social Security split | [`puf.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/puf.py), [`share_imputation.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/share_imputation.py), [`pe_us_data_rebuild.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild.py) | [`puf_impute.py`](/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/calibration/puf_impute.py) | `Compatible, not equivalent` | Microplex now has an explicit PE-style QRF split strategy and an incumbent-compatibility provider bundle that selects it. It is not yet a full line-by-line clone of PE-data's predictor surface and remains a configurable parity mode rather than the only path. 
| -| Donor-survey source-impute predictor contract | [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py), [`pe_us_data_rebuild.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild.py), [`donor_surveys.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/donor_surveys.py), [`pe_source_impute_specs.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pe_source_impute_specs.py), [`pe_source_impute_engine.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pe_source_impute_engine.py), [`pe_source_impute_blocks.json`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/manifests/pe_source_impute_blocks.json) | [`source_impute.py`](/Users/maxghenis/PolicyEngine/policyengine-us-data/policyengine_us_data/calibration/source_impute.py) | `Compatible, not equivalent` | Microplex now has an explicit `pe_prespecified` donor-condition mode plus one shared donor-block manifest that drives donor adapters, predictor surfaces, condition-frame derivations, raw/dataset loader mappings, and a centralized PE source-impute engine for block resolution, block-frame preparation / entity projection, prepared condition surfaces, and shared donor-block execution semantics used by both the prespecified PE path and the generic fallback path. SIPP is modeled as one survey with multiple donor blocks rather than separate bespoke provider implementations. The full source-imputation stage is still not a line-by-line PE clone, especially around annualization/sampling details and how generic donor integration replaces the PE inline script. 
| -| Dividend atomic basis | [`variables.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/variables.py), [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py) | PE consumes the exported variables through `policyengine-us`; line-by-line PE-data construction parity has not yet been audited | `Compatible, not equivalent` | Microplex explicitly normalizes dividend inputs onto a qualified/non-qualified atomic basis and then derives totals. This is a cleaner contract, but not yet audited as a PE-data rule clone. | -| Interest taxable / tax-exempt split | [`puf.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/puf.py), [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py) | PE consumes the exported variables through `policyengine-us`; line-by-line PE-data construction parity has not yet been audited | `Not yet audited` | Microplex has explicit taxable and tax-exempt interest handling, but the PE-data construction path for parity purposes has not yet been written up. | -| Pension taxable / tax-exempt split | [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py) | PE consumes the exported variables through `policyengine-us`; line-by-line PE-data construction parity has not yet been audited | `Not yet audited` | This is an important family because it affects many downstream PE formulas, but the current status is still "compatible export", not audited parity. | -| Formula layer boundary | `microplex-us` export/build path | `policyengine-us` variable formulas | `Different` | Microplex is still primarily a PE-input construction runtime. The formula layer remains in `policyengine-us`, which is intentional. | - -## What this means - -The current path is strong enough to support an architecture-first program, but -not strong enough to claim general PE construction parity. - -The best current reading is: - -1. 
`microplex-us` is already becoming the cleaner US build system. -2. Some high-value constructions are already close to PE, especially CPS Social - Security. -3. Some important paths now have an explicit incumbent-parity mode, but are not - yet full line-by-line clones, especially PUF Social Security. -4. Several family constructions and donor-source details still need a real - audit before we can call them PE-equivalent, even though the donor-block - contract itself is now centralized. - -## Next parity targets - -The next high-value parity work should be: - -1. Remove or explicitly retire the Social Security residual-to-retirement shim - at PE export points. -2. Audit dividend, interest, and pension-family construction against the PE - build path and mark each one as `Exact`, `Close`, or `Different` - (intentional), matching the status legend. -3. Add parity checks where possible so the matrix is backed by code, not only - prose. -4. Only after the US parity picture is clearer, promote any stable generic - abstraction up into `microplex`. diff --git a/docs/per-column-zero-rate-bug.md b/docs/per-column-zero-rate-bug.md deleted file mode 100644 index 66769c44..00000000 --- a/docs/per-column-zero-rate-bug.md +++ /dev/null @@ -1,78 +0,0 @@ -# Per-column zero-rate breakdown reveals upstream bug - -*Analysis of `artifacts/per_col_zero_rate_20k.json` at 20k × 50, all three methods. 
The top-10 "most broken" columns across every method are **conditioning** variables, which the synthesizer is supposed to preserve, not generate as targets.* - -## The pattern - -Top-diff columns per method include, identically across ZI-QRF / ZI-MAF / ZI-QDNN: - -| Column | Real zero-rate | Synth zero-rate | Diff | -|---|---:|---:|---:| -| `is_military` | 0.998 | 0.000 | 0.998 | -| `is_separated` | 0.991 | 0.000 | 0.991 | -| `is_blind` | 0.984 | 0.000 | 0.984 | -| `has_marketplace_health_coverage` | 0.958 | 0.000 | 0.958 | -| `is_full_time_college_student` | 0.955 | 0.000 | 0.955 | -| `is_disabled` | 0.900 | 0.000 | 0.900 | -| `is_hispanic` | 0.783 | 0.000 | 0.783 | -| `own_children_in_household` | 0.707 | 0.000 | 0.707 | -| `pre_tax_contributions` | 0.557 | 0.000 | 0.557 | -| `is_female` | 0.494 | 0.000 | 0.494 | - -Every one of these is in `DEFAULT_CONDITION_COLS`, not in the target column set. Stage-1's synthesizer framework treats conditioning variables as shared input, sampled from the training pool without generation. In real data these are binary (`0.0` or `1.0`). In synthetic output they are continuous floats with values like `-0.34`, `0.75`, `1.14`. - -## Root cause (upstream bug) - -In `microplex/src/microplex/eval/benchmark.py::_MultiSourceBase.generate` (lines 260–262): - -```python -sample_idx = rng.choice(len(self.shared_data_), size=n, replace=True) -shared_values = self.shared_data_.iloc[sample_idx].values.copy() -shared_values += rng.normal(0, 0.1, shared_values.shape) # <-- bug -``` - -Gaussian noise with a constant σ=0.1 is added to **every** shared-column value, including binary-valued categoricals (`is_female`, `is_military`, etc.). This is presumably there to prevent memorization of training records, but it has two destructive effects: - -1. **Binary variables become continuous.** `is_military=1` becomes `1.04` or `0.87`; `is_military=0` becomes `-0.05` or `0.08`. No synthetic record has exactly 0 or exactly 1. -2. 
**Categorical integers become continuous.** `cps_race=3` becomes `3.02` or `2.93`. State FIPS codes, occupation codes, etc. all get noise-perturbed into non-integer values. - -## How this affects stage-1 - -1. **Per-column zero-rate breakdown is dominated by the bug.** The "most-broken" columns are conditioning variables that were never the synthesizer's job to produce; the large `abs_diff` entries are the noise knocking binary values off the integer grid. Downstream consumers reading the per-column zero-rate report need to filter out conditioning columns to see the real target-column story. - -2. **PRDC coverage numbers are roughly preserved in their ordering.** All three methods receive the same noise on the same shared columns, so the 10× gap between ZI-QRF and ZI-MAF isn't an artifact of the bug. Noise reduces coverage uniformly across methods; it doesn't flip ordering. But the *absolute* coverage numbers would be higher if the bug were fixed — likely by 5–15 %. - -3. **Calibrate-on-synth is unaffected under current defaults.** The initial-weight rescale in the calibration script uses `synthetic[col].sum()` for target-column proxies; those target columns don't have the shared-col noise bug, so that part is unaffected. But if any categorical target was in the shared-cols set (it isn't with current defaults), its noise-polluted values would distort weighted aggregates. - -## What to fix - -In `microplex/src/microplex/eval/benchmark.py::_MultiSourceBase.generate`, replace the unconditional noise injection with a type-aware version: - -```python -shared_values = self.shared_data_.iloc[sample_idx].values.copy() -# Only add noise to continuous shared columns, not categoricals. 
-for j, col in enumerate(self.shared_cols_): - dtype = self.shared_data_[col].dtype - n_unique = self.shared_data_[col].nunique() - if dtype.kind == "f" and n_unique > 10: # heuristic: continuous float - shared_values[:, j] += rng.normal(0, 0.1, size=n) -``` - -Or, cleaner: pass explicit `continuous_shared_cols` / `categorical_shared_cols` lists into the method class, so the noise logic is explicit rather than heuristic. - -## Local mitigation in microplex-us - -Until the upstream fix lands, microplex-us can: - -- Post-process synthetic output in the harness to round/snap binary conditioning columns to their nearest value (0 or 1) before PRDC and before calibration. One-liner per column. -- Filter the per-column zero-rate report to only show target columns, so the signal from the bug doesn't drown the actual synthesis quality signal. - -Both are good follow-ups; not blocking for G1. - -## What to publish in the scale-up doc - -The stage-1 method ordering is still valid — noise is uniform across methods and doesn't reorder them. But the absolute coverage numbers should be annotated: "measured with the upstream `_MultiSourceBase.generate` noise-injection bug in place; corrected numbers pending fix." - -## Artifact - -`artifacts/per_col_zero_rate_20k.json` — full per-method zero-rate breakdown including all columns. diff --git a/docs/pipeline-stages.md b/docs/pipeline-stages.md deleted file mode 100644 index 9bc779b3..00000000 --- a/docs/pipeline-stages.md +++ /dev/null @@ -1,70 +0,0 @@ -# Canonical US pipeline stages - -`microplex-us` uses a 9-stage runtime taxonomy for the canonical US dataset -build. The stages describe the operational lifecycle of a build; they are -separate from parity or migration roadmaps against incumbent data packages. - -```text -1. Run profile - -> 2. Source contracts and loading - -> 3. Source planning, fusion planning, and scaffold selection - -> 4. Seed/scaffold construction - -> 5. 
Donor integration, synthesis, and support enforcement - -> 6. PolicyEngine entity construction and microsimulation materialization - -> 7. Target resolution, selection, and calibration - -> 8. Dataset assembly and publication - -> 9. Validation and benchmarking -``` - -## Stage 1: Run profile, config, and source bundle - -Defines the build that is about to run: profile, providers, target period, -target database, baseline dataset, sample filters, random seeds, and defer or -checkpoint options. - -## Stage 2: Source contracts and source loading - -Turns external datasets into Microplex observation frames with source metadata, -entity tables, and relationships. This includes CPS, PUF, ACS, SIPP, SCF, and -any construction loaders still backed by other packages. - -## Stage 3: Source planning, fusion planning, and scaffold selection - -Reasons about the source mix: variable coverage, scaffold selection, donor -sources, and variable families that need donor integration or synthetic -generation. - -## Stage 4: Seed/scaffold construction - -Projects the selected scaffold source into the canonical seed schema. The saved -boundary artifact is the pre-donor seed frame, which makes the scaffold-only -state visible before donor variables, conditioning surfaces, exclusions, and -authoritative overrides are applied. - -## Stage 5: Donor integration, synthesis, and support enforcement - -Integrates donor variables, applies donor semantic guards, and produces the -candidate population that will be calibrated. This may be seed passthrough, -bootstrap synthesis, or model-backed synthesis, depending on the selected -backend. - -## Stage 6: PolicyEngine entity construction and microsimulation materialization - -Builds PolicyEngine-facing households, persons, tax units, SPM units, families, -and marital units. This stage owns PE entity integrity and materialized PE input -readiness before calibration/export. 
- -## Stage 7: Target resolution, selection, and calibration - -Loads and filters targets, materializes target variables, selects feasible -constraints, solves weights, and records target-fit diagnostics. - -## Stage 8: Dataset assembly and publication - -Maps calibrated tables to export variables, writes the final H5 dataset, and -records the saved artifact bundle metadata. - -## Stage 9: Validation and benchmarking - -Evaluates the assembled dataset with harness outputs, native scores, audits, -ablation evidence, and run registry/index evidence. diff --git a/docs/policyengine-oracle-compatibility.md b/docs/policyengine-oracle-compatibility.md deleted file mode 100644 index 15ca346e..00000000 --- a/docs/policyengine-oracle-compatibility.md +++ /dev/null @@ -1,254 +0,0 @@ -# PolicyEngine Oracle Compatibility Path - -This document states the current execution rule for the incumbent-compatibility -track: - -> Use `policyengine-us-data` as the incumbent comparator and use -> `policyengine-us` plus the active targets DB as the shared measurement -> oracle. Match incumbent behavior where that sharpens attribution or closes an -> interface contract. Keep `microplex-us` as an independent runtime, and treat -> materially different modeling choices as explicit challenger modes rather -> than calling the whole project a PE-US-data clone. - -That is a stricter rule than either "make Microplex mimic PE-US-data wholesale" or -"make it better however we can." 
- -Historical note: - -- some internal module names still use `pe_us_data_rebuild` -- that reflects the original implementation thread, not the methodological - claim -- the claim now is oracle compatibility plus incumbent comparison, not - wholesale reconstruction - -## Current runtime entry points - -The incumbent-compatibility track currently uses historically named runtime -entry points in -[`pe_us_data_rebuild.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild.py): - -- `default_policyengine_us_data_rebuild_config(...)` -- `default_policyengine_us_data_rebuild_source_providers(...)` -- `build_policyengine_us_data_rebuild_pipeline(...)` - -And it now has one concrete saved-run checkpoint runner in -[`pe_us_data_rebuild_checkpoint.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py): - -- `default_policyengine_us_data_rebuild_checkpoint_config(...)` -- `default_policyengine_us_data_rebuild_queries(...)` -- `attach_policyengine_us_data_rebuild_checkpoint_evidence(...)` -- `run_policyengine_us_data_rebuild_checkpoint(...)` - -These make the incumbent-comparison path callable as a first-class Microplex -profile rather than a loose collection of remembered settings. 
- -That profile now also includes: - -- the PE-style PUF Social Security QRF split mode -- the PE-style prespecified donor-predictor mode for source imputations -- opt-in ACS/SCF donor providers plus a block-spec-driven SIPP donor provider - for the compatibility path -- one shared donor-block manifest, - [`pe_source_impute_blocks.json`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/manifests/pe_source_impute_blocks.json), - that now drives both: - - donor-survey adapter specs in - [`donor_surveys.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/data_sources/donor_surveys.py) - - the PE-style prespecified predictor and condition-preparation surface in - [`us.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py) - - SIPP donor-block postprocessing such as month filtering, annualization, and - household child-count features - - SIPP raw-file extraction details such as file names, delimiters, ID parts, - raw column mappings, and simple indicator derivations - - ACS/SCF subprocess dataset-loader details such as dataset module/class, - table-builder mode, and canonical variable mappings - - one explicit PE source-impute execution boundary in - [`pe_source_impute_engine.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pe_source_impute_engine.py) - so `us.py` no longer owns PE block resolution, PE block-frame preparation / - entity projection, condition-surface prep, the prespecified block - fit/generate/match loop, or a second duplicated generic donor execution loop - - one saved-run parity sidecar in - [`pe_us_data_rebuild_parity.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild_parity.py) - that records profile conformance, the exact incumbent baseline slice, and - the harness / PE-native verdicts for one artifact bundle - - one saved-run native audit sidecar in - 
[`pe_us_data_rebuild_audit.py`](/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/pe_us_data_rebuild_audit.py) - that records family regressions, target-level regressions, support audits, - and imputation-sidecar verdict hints for the same artifact bundle - - one checkpoint runner that saves a normal versioned Microplex artifact - bundle first, then attaches harness/native parity evidence from the saved - dataset, and finally materializes parity / native-audit sidecars from the - updated bundle instead of relying on an ad hoc notebook or shell sequence - -## Current calibration rule - -Oracle compatibility does not mean "optimize against every DB row in one flat -solve." - -Current rule: - -- measure every serious run against the full active PolicyEngine targets DB -- compute full-oracle loss with an explicit penalty for unsupported target rows -- keep supported-only diagnostics visible as a separate channel -- let the calibration planner classify target rows into: - - `solve_now` - - `solve_later` - - `audit_only` -- allow narrow deferred later passes and keep them only when they improve the - current full-oracle score - -This keeps the full DB as the shared oracle without pretending that every DB -row should always be an active calibration constraint in the same numerical -stage. - -## Why this rule exists - -If we mix: - -- oracle-compatibility checks -- model-family changes -- predictor-surface changes -- weighting-backend changes -- calibration-objective changes - -in the same pass, then we lose attribution. 
- -We may still end up with a better system, but we will not know whether it is: - -- a closer match to incumbent interface behavior -- a materially different model stack -- or both - -Related guardrail: - -- do not treat a late export-layer port of an upstream PE concept as a - substitute for upstream construction parity -- if a concept properly belongs in source construction, tax-unit construction, - or source/family imputation, an export-layer patch can still be useful as a - probe, but it should be recorded as a challenger implementation boundary - rather than silently promoted into the default path - -The incumbent-compatibility track is meant to answer a simpler question first: - -> Can Microplex produce a PE-ingestable dataset under a documented -> incumbent-compatible profile, in a cleaner and more auditable form, while -> keeping the differences from the incumbent attributable? - -## What is allowed in the incumbent-compatibility pass - -Allowed changes are architectural improvements that should move outputs only on -the margin: - -- replacing implicit scripts with explicit source/provider contracts -- turning inline special cases into declarative stage specs -- wrapping incumbent PE weighting backends behind Microplex interfaces -- making parity assumptions explicit in docs, tests, and artifacts -- adding provenance, parity audits, and stage-level artifacts -- reorganizing code so country-pack boundaries and pipeline ownership are clear - -These are improvements in: - -- maintainability -- provenance -- portability -- reproducibility -- evaluation discipline - -without trying to win by silently changing the underlying comparison contract. - -## What is not allowed by default in the incumbent-compatibility pass - -These should be treated as explicit departures, not silent cleanup: - -- changing model class - - e.g. replacing incumbent QRF stages with grouped-share or forest-share -- materially changing predictor surfaces - - e.g. 
replacing a PE-style prespecified predictor set with a broader - data-driven feature search -- changing fallback heuristics in ways likely to move support or totals -- changing weighting/calibration objectives or optimization backends -- introducing new target surfaces as if they were still measuring the same - incumbent comparison problem - -Any such change can still be good. It just belongs in the challenger phase, -where it is measured as an intentional departure. - -## Practical decision rule - -When we face a design choice during the incumbent-compatibility pass: - -1. Ask whether the incumbent PE-US-data behavior is clear enough to reproduce. -2. If yes, match it inside cleaner Microplex structure. -3. Only deviate if the incumbent choice would create an obvious architectural - problem. -4. If we deviate, choose the smallest alternative that should change outputs - only on the margin. -5. Write the deviation down as `intentional` rather than letting it masquerade - as oracle compatibility. - -One important corollary: - -- rejecting a late export-layer patch does **not** automatically reject the - underlying model concept -- sometimes the concept is right, but the implementation point is wrong -- in that case the concept stays in scope, but it must be reintroduced at the - correct upstream layer rather than as a last-minute PE-input bolt-on - -## Examples - -### Good incumbent-compatibility changes - -- Keep PE's QRF family, but call it through a Microplex method spec instead of - an inline script. -- Keep the incumbent predictor set, but declare it in one stage config rather - than scattering it across files. -- Keep PE's donor-survey blocks, but declare them once in a shared manifest - instead of hardcoding ACS/SIPP/SCF surfaces separately in both provider code - and pipeline code. -- Keep PE's donor-block postprocessing rules, but attach them to the same block - specs instead of baking month filters and annualization logic into ad hoc - loader branches. 
-- Keep raw donor-file mappings close to the block spec, so file names, raw - columns, and identifier assembly stop being copied across multiple SIPP - loaders. -- Keep subprocess dataset-loader mappings close to the block spec too, so ACS - and SCF import/class/table-shaping contracts stop living as large inline - script blobs. -- Keep the PE weighting backend, but call it through a Microplex-owned adapter. -- Keep the same CPS reason-code logic, but express it in a source adapter with - explicit parity tests. - -### Too far for the incumbent-compatibility pass - -- switching from PE's prespecified QRF predictors to a broader automatic - feature search because it seems statistically cleaner -- replacing the incumbent SS split model with a new forest-share family before - we have a parity implementation -- changing the calibration objective because the incumbent one is inconvenient - -## Relationship to the parity matrix - -This rule is what makes the parity matrix meaningful. - -If we follow it, then the matrix statuses: - -- `Exact` -- `Close` -- `Compatible, not equivalent` -- `Different` - -actually describe the comparison contract. - -If we ignore it, the matrix becomes unstable because every "cleanup" quietly -changes the underlying model contract. - -## Relationship to later outperformance work - -This rule does **not** say we should stop improving the pipeline. - -It says: - -1. make the incumbent-comparison path explicit and auditable -2. prove parity or intentional difference where parity matters -3. then run challenger methods against that incumbent-compatible baseline - -That sequence is what gives later benchmark wins credibility. 
diff --git a/docs/psid-coverage-zero-diagnosis.md b/docs/psid-coverage-zero-diagnosis.md deleted file mode 100644 index 220cc4ac..00000000 --- a/docs/psid-coverage-zero-diagnosis.md +++ /dev/null @@ -1,97 +0,0 @@ -# PSID coverage = 0 in `benchmark_multi_seed.json`: diagnosed - -*Closes the open question raised in `docs/synthesizer-benchmark-scale-up.md`.* - -## Summary - -PSID coverage is 0.0 across all 6 methods (QRF, ZI-QRF, QDNN, ZI-QDNN, MAF, ZI-MAF) for all 10 seeds **not because PSID is unsynthesizable, but because the benchmark harness collapses PSID conditioning to 2 variables** (`is_male` and `age`) when it computes the shared-column pool. - -This is a benchmark-architecture bug, not a data limitation. PSID is still a viable backbone for the SS-model longitudinal extension, conditional on fixing or bypassing this specific benchmark setup. - -## Reproduction - -Input: `microplex/data/stacked_comprehensive.parquet` (630,216 rows, 38 cols, stacks sipp + cps + psid). - -Benchmark setup (`microplex/scripts/run_benchmark.py` + `microplex/src/microplex/eval/benchmark.py`): - -1. For each source, keep only numeric columns with <5 % NaN, then `dropna()`. -2. Compute `shared_cols` = columns present in ALL sources with <5 % NaN each. -3. Each synthesizer is trained as a multi-source fusion: pool `shared_cols` across sources, fit a per-column model for each non-shared column on only the source that has it. -4. At generation: sample a shared-column record, then predict each non-shared column from its per-source model conditioned on the shared columns. -5. Per-source PRDC coverage: holdout = that source's full column set; synthetic = generated records' intersecting column set; `prdc` library computes coverage with k=5. 
- -Diagnostic script (runs in a few seconds): - -```python -import pandas as pd -import numpy as np - -df = pd.read_parquet("data/stacked_comprehensive.parquet") -numeric_dtypes = [np.float64, np.int64, np.float32, np.int32] -exclude = {"weight", "person_id", "household_id", "interview_number"} - -survey_dfs = {} -for src in ["sipp", "cps", "psid"]: - sub = df[df["_survey"] == src].drop(columns=["_survey"]).copy() - num = [c for c in sub.columns - if sub[c].dtype in numeric_dtypes and sub[c].isna().mean() < 0.05] - survey_dfs[src] = sub[num].dropna().reset_index(drop=True) - print(src, len(survey_dfs[src]), num) - -first = next(iter(survey_dfs.values())) -shared = [c for c in first.columns - if c not in exclude and all(c in d.columns for d in survey_dfs.values())] -print("shared_cols:", shared) -``` - -Output: - -| Source | Rows after dropna | Low-NaN numeric columns | -|---|---:|---| -| SIPP | 476,744 | hispanic, race, is_male, wave, job_gain, age, job_loss, weight, month | -| CPS | 144,265 | state_fips, is_male, dividend_income, farm_income, age, self_employment_income, weight, rental_income, wage_income, interest_income | -| PSID | 9,207 | state_fips, food_stamps, total_family_income, is_male, marital_status, year, dividend_income, taxable_income, age, weight, rental_income, wage_income, interview_number, social_security, interest_income | - -**Intersection after excluding `{weight, person_id, household_id, interview_number}`: `['is_male', 'age']` — 2 columns.** - -## Why this gives PSID coverage 0 - -- PSID has the **most** unique non-shared columns (13 of its 15 are non-shared), all trained per-column on only 9,207 rows conditioned on 2 shared variables. -- PRDC for PSID is computed on PSID's full 15-column feature space. The synthesizer's predicted values for the 13 non-shared columns are drawn from a model that's severely under-conditioned (2D conditioning on 13 target dimensions, each with a per-column RF or flow trained on 9,207 rows). 
-- k-NN coverage with k=5 in 15D looks for any synthetic record within the k-th nearest-neighbor distance of each real holdout record. With under-conditioned predictions the synthetic records cluster around model means and rarely fall within the real holdout's neighborhood ball. Coverage → 0. -- CPS has 10 total columns with 8 non-shared and 144,265 rows → coverage ~0.34–0.50 (mediocre but non-zero). SIPP has 9 total columns with 7 non-shared and 476,744 rows → coverage ~0.72–0.95 (highest). **The pattern tracks column-uniqueness ratio and row count.** PSID is worst because its non-shared ratio is highest and its row count is lowest. - -## Why this is a benchmark bug, not a PSID limitation - -The benchmark implicitly assumes sources share rich conditioning information. Here the `<5 % NaN` filter removes many latently-shared columns from individual sources. For example, `wage_income` appears in both CPS (144,265 non-null) and PSID (9,207 non-null) but NOT in SIPP — so it's excluded from `shared_cols`. If the benchmark harmonized the column schema across sources before applying the NaN filter (either by imputing cross-source or by using an intersection-of-non-null-across-sources strategy), `shared_cols` would be much richer and all sources would benefit. - -PSID itself has 15 low-NaN columns — more than either SIPP (9) or CPS (10). On a **PSID-only** benchmark (train on PSID, test on PSID holdout), coverage would likely be competitive with SIPP's. - -## Implications for the architecture work - -### For synthesizer selection (G1 cross-section) - -- **The benchmark's PSID=0 verdict should not influence cross-section synthesizer choice.** G1 works with CPS-core scaffold, not PSID, so the issue doesn't propagate. My earlier recommendation of ZI-MAF for cross-section and ZI-QRF for panel stands. 
- -### For SS-model longitudinal extension (G3) - -- **PSID can still be the trajectory-training backbone.** The SS-model methodology doc's plan to use PSID (1968–present) for lifetime earnings trajectories is not invalidated by this benchmark. -- However, before committing compute, run a **PSID-only synthesizer benchmark**: train ZI-MAF / ZI-QRF / ZI-QDNN on PSID alone, test on PSID holdout. That is the relevant evaluation for the SS-model use case. The existing multi-source benchmark result for PSID is not the relevant number. -- If PSID-only benchmarks still show low coverage, the real issue may be the attrition-induced sparsity in PSID's joint feature space (real data limitation). That is a separate investigation. - -### For the benchmark harness itself (deprioritized) - -- The benchmark's `find_shared_cols` policy is brittle at the intersection: any single source that lacks a column, or whose NaN rate on it is 5 % or higher, knocks that column out of the shared pool for every source. For future benchmark work, consider: - - Lift the NaN filter or pre-impute cross-source. - - Report results **per-source** on same-source train/test splits, not cross-source. - - Report `shared_cols` and per-source `non_shared_cols` counts alongside coverage so reviewers can see the conditioning bottleneck. - -## Action items - -1. **Update `docs/synthesizer-benchmark-scale-up.md`** to note this finding — the PSID=0 line in the initial summary should be annotated, not taken as evidence that PSID is unusable. -2. **Before any SS-model work commits compute to PSID-based trajectory training**, run a PSID-only synthesizer benchmark. That is roughly a day of work in `experiments/` with existing method classes. -3. **No change to G1 plan.** Cross-section proceeds with CPS-scaffold as planned; PSID is not on the G1 critical path. - -## What was reliable in the original PSID=0 signal - -- It is true that the specific multi-source fusion benchmark here cannot cover PSID well. 
Consumers who use that benchmark output (e.g., paper draft in `microplex/paper/paper_results.py`) need to adjust claims accordingly — it is not valid to say "all methods fail on PSID." The valid claim is "cross-source fusion with 2 shared variables fails on PSID, in a way that tracks non-shared column ratio." diff --git a/docs/quickstart-rewire.md b/docs/quickstart-rewire.md deleted file mode 100644 index 5987f6ce..00000000 --- a/docs/quickstart-rewire.md +++ /dev/null @@ -1,203 +0,0 @@ -# Quickstart — `spec-based-ecps-rewire` tools - -*Walk through every piece of tooling that landed on the rewire branch overnight, in the order you'd actually use them.* - -## 1. Set up - -```bash -cd microplex-us -git checkout spec-based-ecps-rewire -uv pip install -e .[dev] -uv pip install microcalibrate prdc -``` - -Python 3.13+ required (microcalibrate dep). All tests should pass: - -```bash -uv run pytest tests/calibration tests/bakeoff -q -# Expected: 21 passed in ~10 s -``` - -## 2. Calibration: the G1 unblocker - -`microplex_us.calibration.MicrocalibrateAdapter` is the production calibrator -from now on. It's wired into `USMicroplexBuildConfig.calibration_backend`: - -```bash -uv run python -m microplex_us.pipelines.pe_us_data_rebuild_checkpoint \ - --baseline-dataset ~/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5 \ - --targets-db ~/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/calibration/policy_data.db \ - --policyengine-us-data-repo ~/PolicyEngine/policyengine-us-data \ - --output-root artifacts/live_pe_us_data_rebuild_checkpoint_20260417_microcalibrate \ - --version-id v7 \ - --calibration-backend microcalibrate -``` - -The `--calibration-backend microcalibrate` flag is the only meaningful change -from the v4/v5/v6 launch commands. Everything else stays identical. - -Expected change from v6: the OOM at `backend=entropy` during -`calibrate_policyengine_tables` is gone. 
Pipeline should complete and write -`pe_us_data_rebuild_parity.json`. - -### Verify dispatch without running the whole pipeline - -```python -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline -from microplex_us.calibration import MicrocalibrateAdapter - -cfg = USMicroplexBuildConfig(calibration_backend="microcalibrate") -pipeline = USMicroplexPipeline(cfg) -calibrator = pipeline._build_weight_calibrator() -assert isinstance(calibrator, MicrocalibrateAdapter) -``` - -Covered by `tests/calibration/test_us_pipeline_dispatch.py`. - -## 3. Synthesizer scale-up benchmark - -```bash -# Defaults: ZI-QRF + ZI-MAF + ZI-QDNN, all 77k rows × 50 columns -uv run python -m microplex_us.bakeoff \ - --stage stage1 \ - --methods ZI-QRF ZI-MAF ZI-QDNN \ - --output artifacts/scale_up_stage1.json - -# Completes in ~6 minutes on a 48 GB M3. -# Per-method results land in artifacts/scale_up_stage1.json.partial.jsonl -# as soon as each method finishes. -``` - -### Run a single method at a smaller scale - -```python -from pathlib import Path -from microplex_us.bakeoff import ScaleUpRunner, ScaleUpStageConfig, stage1_config - -base = stage1_config() -cfg = ScaleUpStageConfig( - stage="quick_zi_qrf", - n_rows=20_000, - methods=("ZI-QRF",), - condition_cols=base.condition_cols, - target_cols=base.target_cols, - holdout_frac=0.2, - seed=42, - k=5, - n_generate=16_000, - data_path=base.data_path, - year=base.year, - rare_cell_checks=base.rare_cell_checks, - prdc_max_samples=15_000, -) -results = ScaleUpRunner(cfg).run(incremental_path=Path("artifacts/quick.jsonl")) -for r in results: - print(r.method, r.coverage, r.fit_wall_seconds) -``` - -### Tune per-method hyperparameters - -```python -cfg = ScaleUpStageConfig( - # ... other fields ... - method_kwargs={ - "ZI-MAF": {"n_layers": 8, "hidden_dim": 128, "epochs": 200, "lr": 5e-4}, - }, -) -``` - -Every field in the method class's `__init__` signature can be overridden. 
- -### Interpret the result - -`ScaleUpResult` fields: - -- `coverage` — PRDC coverage (fraction of real records with a synthetic neighbor within k-NN). Higher is better. Sample-size sensitive (see the PRDC cap note below). -- `precision`, `density` — other PRDC metrics. -- `fit_wall_seconds`, `generate_wall_seconds` — timing. -- `peak_rss_gb_during_fit` — process RSS (on macOS, corrected for the bytes-vs-KB units bug). -- `zero_rate_mae` — scalar mean absolute error in per-column zero-rate. -- `zero_rate_per_column` — per-column `{real, synth, abs_diff}`. Identifies which specific columns drive the error. -- `rare_cell_ratios` — synth-count / real-count for designated rare subpopulations (elderly self-employed, young dividend, disabled SSDI, top-1 % employment). - -### Known quirks - -- **PRDC sample size matters.** Coverage drops as real sample grows (tighter k-NN radius). Compare across stages only when `prdc_max_samples` is the same. -- **ZI-MAF / ZI-QDNN at default settings are not competitive** on real ECPS. Stage-1 result: ZI-QRF 0.256 >> ZI-QDNN 0.147 >> ZI-MAF 0.014 at 77k × 50. Hyperparameter tuning is an open investigation (see `docs/stage-1-pilot-results.md`). - -## 4. Embedding-PRDC validation (optional) - -Standalone script that settles whether stage-1's ordering is a metric artifact from 50-dim PRDC: - -```bash -uv run python scripts/embedding_prdc_compare.py \ - --n-rows 40000 \ - --output artifacts/embedding_prdc_compare.json -``` - -Trains a 16-dim autoencoder on the holdout, then computes PRDC in both raw and latent space. Takes ~5 min. - -If ordering is preserved in latent space: stage-1 finding is robust. If it changes: raw PRDC in 50-dim was noise and the stage-1 winners need re-examination in a less dimensionality-sensitive metric. - -## 5. 
Diagnostics - -### PSID coverage = 0 reproduction - -```python -import pandas as pd -import numpy as np - -df = pd.read_parquet("~/PolicyEngine/microplex/data/stacked_comprehensive.parquet") -exclude = {"weight", "person_id", "household_id", "interview_number"} - -survey_dfs = {} -for src in ["sipp", "cps", "psid"]: - sub = df[df["_survey"] == src].drop(columns=["_survey"]).copy() - num = [c for c in sub.columns - if sub[c].dtype.kind in "fiu" and sub[c].isna().mean() < 0.05] - survey_dfs[src] = sub[num].dropna().reset_index(drop=True) - -first = next(iter(survey_dfs.values())) -shared = [c for c in first.columns - if c not in exclude and all(c in d.columns for d in survey_dfs.values())] -print("shared_cols:", shared) # ['is_male', 'age'] — 2 variables -``` - -Full diagnosis in `docs/psid-coverage-zero-diagnosis.md`. - -## 6. What to look at for planning the next step - -Read these in order: - -1. `docs/v6-postmortem.md` — what killed v6 and why -2. `docs/calibrator-decision.md` — why microcalibrate is mainline -3. `docs/core-wiring-audit.md` — what's in microplex core, what's wired, what to swap -4. `docs/synthesizer-benchmark-scale-up.md` — how to think about scale-up -5. `docs/stage-1-pilot-results.md` — the actual numbers and what they mean -6. `docs/microcalibrate-wiring-plan.md` — rollout of the G1 unblocker -7. `docs/overnight-session-2026-04-16.md` — full session audit trail -8. `docs/psid-coverage-zero-diagnosis.md` — the PSID = 0 finding - -## 7. Production next steps - -Ordered by expected value: - -1. Launch a v7 run with `--calibration-backend microcalibrate`. Expected outcome: pipeline completes and writes parity artifact. If it OOMs, the OOM is in a *different* stage than calibration, which is a new finding. -2. After v7 completes: parse the parity artifact and compare against `broader-donors-ssn-card-type-v1` (baseline 0.6955 full-oracle capped loss). If v7 lands below that, G1 is cleared. -3. 
While v7 runs: execute stage-2 scale-up (1M rows × 50 cols) on the rewire branch. Requires a larger data source than ECPS (77k limit); the natural candidate is a clone-and-assign of ECPS to 1M, matching PE-US-data's local-area pattern. -4. If ZI-MAF tuning recovered it (see `artifacts/zi_maf_tuning.json` once the overnight run completes): lock in the best config as the new `ZI-MAF` default in `method_kwargs`. - -## 8. Cleanup tasks from the session - -These are tracked as follow-ups and do not block G1: - -- `disabled_ssdi` zero-rate diverges to 0.0 on all methods. Investigate per-column breakdown (now exposed) to find which other columns break. -- ZI-QRF OOM at the loky-worker level above 61k×50. Already worked around (PRDC cap). Root-cause fix would be switching `n_jobs=-1` to a bounded pool or a worker-recycling wrapper. -- MPS / CUDA for ZI-MAF + ZI-QDNN in the benchmark method classes. Would shrink fit time 3–5× but is a separate refactor of `microplex.eval.benchmark`. -- Per-method benchmark at v6 scale (1.5 M household entity table) once the v7 pipeline gives us that artifact to measure against. - -## 9. Don't do - -- Don't launch another v6-style run with `backend=entropy`. Known-OOM. Use `microcalibrate`. -- Don't take the small-benchmark (10k × 7 synthetic) ordering at face value for G1 defaults. Stage-1 evidence overturned it. -- Don't trust raw PRDC coverage in 50 dimensions as an absolute number across stages. Ordering across methods at the same stage/config is fine; absolute numbers across stages need the same PRDC cap. diff --git a/docs/source-semantics.md b/docs/source-semantics.md deleted file mode 100644 index 933cbb09..00000000 --- a/docs/source-semantics.md +++ /dev/null @@ -1,213 +0,0 @@ -# Source semantics - -`microplex-us` is moving away from pipeline-level source special cases and -toward declarative source and variable semantics. - -## Core idea - -There are two different questions for any variable in any source: - -1. 
Is this source authoritative for that variable? -2. Is that variable safe to use as a donor conditioning feature? - -Those are not the same question. - -Examples: - -- A source can be authoritative for a variable while still being a bad shared - conditioning feature. -- A variable can exist in two sources but be semantically incompatible as a - conditioning feature because one side is derived or placeholder-filled. - -## Source-level capabilities - -Core type: - -- `microplex.core.SourceVariableCapability` - -Attached through: - -- `microplex.core.SourceDescriptor.variable_capabilities` - -Current public helpers on `SourceDescriptor`: - -- `capability_for(variable_name)` -- `is_authoritative_for(variable_name)` -- `allows_conditioning_on(variable_name)` - -## US registry layer - -Country-specific policy lives in: - -- `src/microplex_us/source_registry.py` - -Main types: - -- `SourceVariablePolicy` -- `SourceVariablePolicySpec` -- `resolve_source_variable_capabilities(...)` - -This lets a source provider declare policy without embedding donor logic in the -pipeline itself. - -## Variable semantics - -Generic atomic-vs-derived semantics live in: - -- `src/microplex_us/variables.py` - -Main types and helpers: - -- `VariableSemanticSpec` -- `DonorImputationBlockSpec` -- `DonorMatchStrategy` -- `VariableSupportFamily` -- `VARIABLE_SEMANTIC_SPECS` -- `resolve_variable_semantic_capabilities(...)` -- `prune_redundant_variables(...)` -- `donor_imputation_blocks(...)` -- `donor_imputation_block_specs(...)` - -The donor path also now selects conditioning features per donor block rather -than using the same shared-variable set for every imputation target. 
There are -three distinct selection modes: - -- generic selection: - - start from variables allowed by source and variable capability metadata - - score shared variables against the donor block - - keep the strongest conditioning features instead of every available overlap -- `pe_prespecified` selection: - - build a PE-style structural predictor surface when the variable semantics - ask for it - - use the variable's declared `preferred_condition_vars` as the structural - backbone - - optionally admit a narrow `supplemental_shared_condition_vars` set from the - actual shared overlap, instead of reopening the full common-predictor pool -- `pe_plus_puf_native_challenger` selection: - - keep the same PE structural backbone - - for the explicitly marked problematic PUF tax-leaf blocks only, append a - narrow set of source-native raw-overlap predictors declared in semantics - - treat that lane as an opt-in challenger, not as a PE-alignment update - -For the problematic PUF tax-leaf family, the PE-aligned default is still the -structural backbone only. The local `policyengine-us-data` -`calibration/puf_impute.py` path trains the PUF clone QRF on demographic / -tax-unit-role predictors only, and the PUF source policy intentionally marks -derived convenience columns like `income`, `employment_status`, and synthetic -`state_fips` as not usable for donor conditioning. 
- -That keeps the donor path closer to the intended Microplex shape: - -- declarative semantics define what is valid -- the pipeline chooses what is useful from the data -- source-specific predictor policy lives in semantics metadata rather than - expanding ad hoc pipeline branches - -The donor blocks themselves are also now declarative: - -- native entity -- allowed condition entities -- projection aggregation for person-native controls when projected to a group -- block model variables -- restored output variables -- match strategy per modeled variable -- preferred conditioning variables -- supplemental shared conditioning variables -- optional frame preparation / restoration hooks - -That means `us.py` now executes donor block specs rather than deciding inline -which blocks need special handling. - -Artifacts now also record `synthesis.donor_conditioning_diagnostics` for each -executed donor block, including: - -- donor source -- modeled/restored variables -- raw shared overlap before block preparation -- block-level shared overlap after model-variable exclusion -- whether entity projection ran, and which shared vars survived projection -- selected condition vars -- shared vars that were available but dropped -- requested supplemental shared vars -- requested challenger shared vars -- raw-stage supplemental rejection reasons -- raw-stage challenger rejection reasons -- prepared-stage supplemental rejection reasons -- prepared-stage challenger rejection reasons -- whether the block used a prepared condition surface - -Use `python -m microplex_us.pipelines.summarize_donor_conditioning ` -to inspect those diagnostics from a finished artifact. 
- -When a donor block declares a non-person native entity and those IDs are -available in the working frame, the pipeline now: - -- projects scaffold and donor rows to that entity -- filters donor conditioning features through the block's declared - `condition_entities` policy -- projects person-native conditioning variables using their semantic aggregation - rule instead of blindly taking the first row -- fits the donor block once per native entity -- broadcasts imputed values back to person rows after matching - -Current example: - -- `dividend_income` and `ordinary_dividend_income` are treated as derived when - `qualified_dividend_income` and `non_qualified_dividend_income` are present. - -That means the system can automatically: - -- avoid learning redundant totals as donor targets -- avoid using redundant totals as donor conditioning features -- keep the atomic basis as the source of truth -- distinguish variable families that should use household-only controls from - those that should use person + household + native-entity controls -- respect tax-unit-native donor blocks without forcing all tax-unit variables - through the same condition policy - -## Why this matters - -This is the beginning of the general rule we want: - -- source-specific policy should be declarative -- variable-level atomic/derived semantics should be generic -- the donor integration pipeline should consume metadata, not source names - -That is what makes the approach portable to future country packs. 
- -## Current examples - -### PUF - -Current PUF policy expresses that: - -- `state_fips` is not a real usable donor geography in the current build -- `tenure` is scaffold filler -- `income` is a derived convenience field, not an atomic donor target -- `employment_status` is derived, not directly observed -- `employment_income` is source-native but should not be used as a shared donor - condition - -### CPS - -CPS now resolves capabilities through the same registry path, so it also picks -up generic variable semantics such as redundant dividend totals. - -## Extension rule - -When adding a new source, prefer: - -1. Declare source-specific overrides in the source registry. -2. Declare atomic-vs-derived relationships in variable semantics. -3. Let the pipeline consume those capabilities generically. - -Avoid: - -- source-name `if/else` branches in the donor path -- learning overlapping derived variables independently -- using placeholder or derived variables as donor conditions just because they - are numeric and present in both tables -- forcing tax-unit-native donor variables through person-native conditioning - only because the seed frame happens to be person-indexed -- assuming every tax-unit-native variable should share the same household-only - or person-level conditioning policy diff --git a/docs/stage-1-pilot-results.md b/docs/stage-1-pilot-results.md deleted file mode 100644 index 8acfd099..00000000 --- a/docs/stage-1-pilot-results.md +++ /dev/null @@ -1,249 +0,0 @@ -# Stage 1 pilot results — synthesizer scale-up on real ECPS - -*First execution of `docs/synthesizer-benchmark-scale-up.md`'s stage-1 protocol on real enhanced_cps_2024 data. 
This doc captures the pilot (5,000-row subsample, 1 method) and the first full stage-1 run (77,006 rows, 3 methods) as they complete.* - -## Data - -- Source: `~/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5` -- Full row count: **77,006** (PE's national-scale 2024 ECPS) -- Columns: 50 (14 demographics conditioning + 36 income / wealth / benefit targets) -- Stage-1 split: 61,604 train / 15,402 holdout (80/20, seed=42) - -Note: ECPS has 77k rows in its national-scale build; the 100k-row stage-1 target from the protocol doc isn't achievable from this file alone. The harness uses `n_rows=None` to take all 77k and reports actual row counts in each result. - -## Pilot — ZI-QRF at 5,000 rows × 50 columns - -First validation that the harness runs end-to-end on real data with the curated default columns. Sanity-check result, not a benchmark claim. - -| Method | Train rows | Holdout rows | Cols | Coverage | Precision | Density | Fit (s) | Gen (s) | Peak RSS | -|---|---:|---:|---:|---:|---:|---:|---:|---:|---:| -| ZI-QRF | 4,000 | 1,000 | 50 | **0.641** | 0.617 | 0.233 | 5.0 | 1.0 | 0.87 GB | - -Interpretation: PRDC coverage of 0.641 on 5k × 50 is a sensible baseline — better than the existing benchmark's 10k × 7 synthetic ZI-QRF CPS coverage of 0.347 (per `benchmark_multi_seed.json`). Two possible explanations, both worth noting: - -1. **Data realism:** real ECPS has structure that multi-source-fusion-from-synthetic doesn't. Single-source QRF can fit the real marginals and correlations directly. -2. **Column set:** the new 50-column default includes richer conditioning signal than the prior 7-column setup. 
- -### Rare-cell preservation (pilot) - -| Check | Synthetic / Real ratio | -|---|---:| -| elderly_self_employed | 2.00 | -| young_dividend | 4.38 | -| disabled_ssdi | 0.00 | -| top_1pct_employment | 3.91 | - -Pattern: ZI-QRF *over-samples* rare non-zero cells (elderly SE, young dividend, top-1 % employment) — the zero-inflation classifier predicts non-zero slightly too aggressively for these categories. The `disabled_ssdi` check returning 0 is concerning: the model is predicting zero SSDI for disabled persons, which is the opposite of what the underlying data structure says. Likely because SSDI receipt conditional on disability is lower in ECPS than intuition suggests, and the model learned the unconditional zero-rate. Needs follow-up at full scale. - -### Zero-rate MAE (pilot) - -0.180 — mean absolute error in per-column zero-rate between real and synthetic is ~18 percentage points. That's substantial. Most likely driven by target columns where the zero-inflation classifier diverges from real; worth breaking down per column at stage 1. - -## Stage 1 — ZI-QRF + ZI-MAF + ZI-QDNN at 40k and 77k rows × 50 columns - -Ran both scales. **Ordering is preserved across scale**; absolute -numbers shift because the PRDC sample cap differs (see note below). - -### Why the 40k intermediate run - -The first 77k attempt OOM-killed during PRDC computation, not during -synthesizer fitting. PRDC on 15k real × 61k synthetic × 50 features -materializes ~7 GB-per-copy distance matrices that exceed what a -48 GB workstation can hold once multiple copies exist. Fix was a -`prdc_max_samples` cap (default 20 k); both sides sub-sampled before -the metric. With the cap in place, 77k × 50 runs cleanly. - -40 k result is kept because it ran earlier without the cap (8 k real -vs 32 k synth) and is useful for the same-method-different-scale -comparison. 
- -### Results (real ECPS, 40k × 50) — uncapped PRDC (8k × 32k) - -| Method | Coverage | Precision | Density | Fit (s) | Gen (s) | Peak RSS (GB) | Zero-rate MAE | -|---|---:|---:|---:|---:|---:|---:|---:| -| **ZI-QRF** | **0.465** | **0.230** | **0.120** | 20.5 | 2.0 | **3.5** | **0.179** | -| ZI-MAF | 0.054 | 0.009 | 0.004 | 115.6 | 0.6 | 23.6 | 0.246 | -| ZI-QDNN | 0.306 | 0.155 | 0.063 | 52.3 | 0.6 | 32.5 | 0.299 | - -### Results (real ECPS, 77k × 50) — capped PRDC at 15k × 15k - -| Method | Coverage | Precision | Density | Fit (s) | Gen (s) | Peak RSS (GB) | Zero-rate MAE | -|---|---:|---:|---:|---:|---:|---:|---:| -| **ZI-QRF** | **0.256** | **0.233** | **0.121** | 36.0 | 3.0 | 6.0 | **0.177** | -| ZI-MAF | 0.014 | 0.008 | 0.003 | 216.2 | 1.0 | 11.0 | 0.246 | -| ZI-QDNN | 0.147 | 0.171 | 0.065 | 95.0 | 0.9 | 11.0 | 0.300 | - -Total 77k wall time: 362 s (6:02). ZI-MAF's 216 s fit and ZI-QDNN's -95 s fit are the compute-bottleneck stages. ZI-QRF finishes in 36 s. - -### Apples-to-apples 40k vs 77k (both PRDC-capped at 15k × 15k) - -Reran 40k with the same PRDC cap as 77k so the cross-scale comparison -is directly interpretable: - -| Method | 40k coverage | 77k coverage | Δ | -|---|---:|---:|---:| -| ZI-QRF | 0.352 | 0.256 | −27 % | -| ZI-QDNN | 0.222 | 0.147 | −34 % | -| ZI-MAF | 0.029 | 0.014 | −52 % | - -**Coverage drops with training scale, not with data quality.** This is -a known property of PRDC: the "covered" check uses a k-NN radius set -on the real data itself. More real points make the radius tighter, -and the same synthetic sample fails to cover more real points. So the -absolute coverage number is only interpretable at a fixed real-sample -size. The *ordering*, however, is invariant — and ZI-QRF wins at both -scales. That's the production-relevant fact. - -One implication: for future stage-2 / stage-3 runs, fix both -`holdout_frac` and the PRDC cap so coverage numbers are comparable -across stages. 
Alternatively, switch to an embedding-based PRDC that -is less sample-size-sensitive (flagged as follow-up). - -### Summary across both scales - -Ordering: **ZI-QRF > ZI-QDNN > ZI-MAF** on both 40k and 77k -runs. ZI-MAF coverage < 0.1 at both scales, effectively -near-collapsed. ZI-QRF wins on coverage *and* cost (3–6 GB RSS, -20–36 s fit vs 11–33 GB and 52–216 s for neural methods). - -### Rare-cell preservation ratios (synthetic count / holdout count) - -| Method | elderly_SE | young_dividend | disabled_SSDI | top_1% | -|---|---:|---:|---:|---:| -| ZI-QRF | 2.4 | 3.8 | **0.0** | 3.95 | -| ZI-MAF | 103.6 | 3.8 | **0.0** | 3.95 | -| ZI-QDNN | 116.7 | 3.4 | **0.0** | 3.95 | - -Neural methods severely over-produce `elderly_self_employed` (100×+) — -suggests their zero-inflation classifiers are fundamentally -miscalibrated for this cell on real data. Every method drives -`disabled_ssdi` to 0.0, consistent with the pilot finding. Every method -over-produces top-1% employment at ~4×. - -## Major finding: the small-benchmark ordering inverts at production scale - -| Method | 10k × 7 synthetic (benchmark_multi_seed, CPS column) | 40k × 50 real ECPS | -|---|---:|---:| -| ZI-MAF | 0.499 ← winner | **0.054** | -| ZI-QDNN | 0.406 | 0.306 | -| ZI-QRF | 0.347 | **0.465** ← winner | - -**Read from this result before trusting any small-scale benchmark.** The -published ranking that named ZI-MAF (and by implication ZI-QDNN as the -near-term production direction in the SS-model doc) best reversed -completely as soon as we moved to: - -1. Real joint distributions instead of analytically-generated synthetic. -2. 50 columns instead of 7 (~7× feature dimensionality). -3. 40 k rows instead of 10 k (4× data). - -## Interpretation - -1. **ZI-MAF at 0.054 is near-collapsed.** Not merely "third-best" — it's - producing samples that aren't close to any holdout record. 
Three - plausible causes, any combination of which might be active: - - Default hyperparameters (n_layers=4, hidden_dim=32, 50 epochs) are - too small for 50-dim targets. The network is a per-column flow, so - each of the 36 flows has only ~1k–5k effective parameters. May be - fundamentally under-capacity. - - Zero-inflation handling in ZI-MAF combines a classifier (RF, 50 - trees) for P(zero) with a MAF for nonzero values. When the - classifier is imprecise on rare non-zero cells, the MAF has very - few positive samples to train on, and mode-collapses. - - The loss log-transforms positive values and standardizes; for - heavy-tailed distributions (top-1 % income) this degrades - conditional tail estimation. -2. **ZI-QDNN at 0.306 is mid-pack.** Better than ZI-MAF but materially - worse than ZI-QRF. Suggests the quantile DNN's conditional - estimates are reasonable but not tree-accurate. Worth noting RSS - was 32 GB — highest of the three — which would OOM on a typical - workstation without swap. Not a production-ready cost profile - without batch-size or architecture tuning. -3. **ZI-QRF at 0.465 is the clear winner.** 3.5 GB RSS, 20-second fit, - and roughly 1.5× ZI-QDNN's coverage. This is the production default for - the rewire's cross-section synthesizer step. - -## Implications for the SS-model methodology doc - -The SS-model methodology doc's "production direction: ZI-QDNN" claim -does not survive this benchmark. At production scale on real data with -default hyperparameters, neither ZI-MAF nor ZI-QDNN is competitive with -ZI-QRF. The doc should be updated to note this finding, and the -longitudinal extension should treat ZI-QRF as at minimum a strong -baseline. - -Three caveats on the SS-model direction (the first two keep it alive; the third is a cost concern): - -1. Hyperparameter-tuned ZI-MAF / ZI-QDNN *might* beat ZI-QRF. The - scale-up doc listed "ZI-MAF needs careful hyperparameter tuning on - real data" as a known risk; stage-1 confirms the risk. -2. 
Trajectory / pathwise generation is a different problem from - cross-sectional conditional modeling. Cross-sectional results need - not predict which method wins on longitudinal generation. -3. Both neural methods used 32-GB-class memory to train; at the 3.4 M - row v6 scale the naive extrapolation is ~1.6 TB. Tree methods' - modest memory profile may be decisive on a workstation regardless - of quality. - -## Follow-up work flagged by this run - -1. **61k ZI-QRF OOM diagnosis.** Scaling is clean up to 40 k (3.5 GB - RSS). 61 k fails silently in < 2 min with SIGKILL. Most likely - cause: loky workers accumulating memory across the 36 target - columns. Fix paths: `n_jobs=4` instead of `-1`, or a - worker-recycling wrapper, or just disable parallelism and accept - slower fit. -2. **ZI-MAF hyperparameter search.** Before accepting - ZI-MAF-is-not-viable as the final answer, run with n_layers=8, - hidden_dim=128, epochs=200 and see if coverage recovers. One - evening of tuning could either rescue the method or definitively - rule it out. -3. **Embedding-based PRDC.** Raw-feature PRDC in 50 dimensions is - predicted by the scale-up doc to degenerate. Fit a 16-dim - autoencoder on holdout, re-run PRDC in that space, and check - whether the method ordering changes. If it does, the 50-dim - raw-feature result is a metric artifact, not a method verdict. -4. **Per-column zero-rate breakdown.** All three methods drive - `disabled_ssdi` to 0.0 synthetic count. Needs per-column MAE - reporting to identify which other columns systematically break. -5. **`microcalibrate` applied on top.** The synthesizer results above - are uncalibrated. The mainline pipeline runs synthesis then - calibration. Worth repeating stage 1 with `MicrocalibrateAdapter` - applied to the generated records and measuring whether calibration - lifts ZI-MAF / ZI-QDNN coverage back into the competitive range. - -## Interpretation guide (for when results land) - -Key comparisons to watch for: - -1. 
**Does the small-benchmark ordering (ZI-MAF > ZI-QDNN > ZI-QRF on CPS) hold on real 77k × 50?** - - Previously on 10k × 7 synthetic CPS-schema: ZI-MAF 0.499 > ZI-QDNN 0.406 > ZI-QRF 0.347. - - If preserved → supports the preliminary G1 synthesizer default of ZI-MAF. - - If inverted → the small-scale ordering was an artifact of the synthetic generator's simplicity and needs revisiting. - -2. **Is ZI-QRF competitive at real 77k × 50?** - - Pilot gave 0.641 at 5k. If stage 1 sustains > 0.55 on 77k, ZI-QRF is a viable fallback for environments without PyTorch. - -3. **Rare-cell preservation at scale**: - - Does every method preserve `disabled_ssdi` at non-zero ratio, unlike the pilot? Failure at scale would confirm a systematic zero-inflation bug. - -4. **Runtime vs coverage frontier**: - - ZI-QRF fit in minutes, ZI-MAF in hours. If ZI-MAF gets 0.65 but needs 30× the compute of ZI-QRF at 0.60, the effective production choice is ZI-QRF until ZI-MAF's lead grows or GPU acceleration lands. - -5. **Does PRDC in 50D give interpretable numbers?** - - The scale-up doc predicted PRDC may degenerate in high dimensions. If all three methods cluster between 0.60 and 0.75 (noise range) on stage 1, raw-feature PRDC has hit its ceiling and we need to add an embedding-based PRDC for stage 2+. - -## Known limitations of this stage - -- **Single-source only.** The harness runs each synthesizer on ECPS alone; the multi-source fusion aspect of the v6 pipeline is out of scope for stage 1. Fusion is exercised earlier in the microplex-us pipeline (donor integration) upstream of calibration. -- **No calibration.** These are synthesis-only results. Calibration via `MicrocalibrateAdapter` happens downstream and is not part of this benchmark. -- **CPU-only torch.** The benchmark method classes don't expose a `device` argument. ZI-MAF and ZI-QDNN fit on CPU, which is a conservative upper bound on training time. 
Adding MPS or CUDA support to the benchmark classes is a discrete follow-up that could shrink stage-1 wall time by 3–5×. -- **No seed replication.** Stage 1 runs at seed=42 only. Confidence intervals across seeds are in the protocol but deferred. - -## Follow-up work flagged by this stage - -1. **Incremental result persistence.** Current harness writes all results atomically at the end. If ZI-QDNN fails, ZI-QRF and ZI-MAF numbers are lost. Patch the runner to save each method's ScaleUpResult as soon as it completes. -2. **Embedding-based PRDC.** Fit a 16-dim autoencoder on `holdout` and compute PRDC in that space. Compare to raw-feature PRDC to diagnose dimensionality effects. -3. **Per-column zero-rate breakdown.** Expose `zero_rate_per_column` alongside the scalar MAE so the doc can pinpoint which columns drive the error. -4. **GPU support in benchmark methods.** Pass `device` through to torch-based methods. diff --git a/docs/stage-1-post-snap-results.md b/docs/stage-1-post-snap-results.md deleted file mode 100644 index 3dbc4988..00000000 --- a/docs/stage-1-post-snap-results.md +++ /dev/null @@ -1,77 +0,0 @@ -# Stage-1 results after fixing the shared-col noise bug - -*Corrected stage-1 numbers after the categorical-snap mitigation landed. The raw numbers in `docs/stage-1-pilot-results.md` are preserved for historical reference but should not be cited; the post-snap numbers here are the real measurement.* - -## The fix in one line - -`microplex.eval.benchmark._MultiSourceBase.generate` adds σ=0.1 Gaussian noise to *every* shared-column value, including binary / categorical ones. The harness now snaps those values back to their training-pool grid after generation. See `docs/per-column-zero-rate-bug.md`. 
- -## Corrected stage-1 at 40k × 50 (PRDC capped 15k/15k) - -| Method | Coverage | Precision | Density | Fit (s) | Peak RSS (GB) | Zero-rate MAE | -|---|---:|---:|---:|---:|---:|---:| -| **ZI-QRF** | **0.979** | 0.913 | 0.902 | 20.0 | 3.5 | 0.016 | -| ZI-QDNN | 0.796 | 0.848 | 0.766 | 52.5 | 11.8 | 0.136 | -| ZI-MAF | 0.168 | 0.030 | 0.022 | 114.6 | 11.8 | 0.084 | - -## Corrected stage-1 at 77k × 50 (full ECPS) - -| Method | Coverage | Precision | Density | Fit (s) | Peak RSS (GB) | Zero-rate MAE | -|---|---:|---:|---:|---:|---:|---:| -| **ZI-QRF** | **0.928** | 0.910 | 0.885 | 37.0 | 6.0 | 0.013 | -| ZI-QDNN | 0.707 | 0.835 | 0.664 | 105.5 | 11.0 | 0.136 | -| ZI-MAF | 0.106 | 0.036 | 0.025 | 227.0 | 11.0 | 0.083 | - -Total 77k wall time: 386 s. - -## Before vs after the snap fix (coverage at 77k × 50) - -| Method | Pre-snap (original stage-1) | Post-snap (this doc) | Uplift | -|---|---:|---:|---:| -| ZI-QRF | 0.256 | 0.928 | +0.672 (3.6×) | -| ZI-QDNN | 0.147 | 0.707 | +0.560 (4.8×) | -| ZI-MAF | 0.014 | 0.106 | +0.092 (7.6×) | - -Neural methods get a bigger relative uplift because their per-column models received the noise-polluted conditioning directly; QRF's tree splits are somewhat robust to small perturbations, which reduces the pre-snap damage to it. - -## What changed in the headline story - -### Findings that STILL hold - -1. **Ordering preserved**: ZI-QRF > ZI-QDNN > ZI-MAF at every scale, every config. -2. **ZI-MAF is still the worst** method tested. Even with the bug fix, ZI-MAF at 0.106 is 9× worse than ZI-QRF at 0.928. -3. **ZI-QRF is the G1 production synthesizer** default. No change. -4. **Calibration-on-synth** result holds (ZI-MAF too far off to rescue via weights). -5. **Embedding-PRDC** validation holds. -6. **ZI-MAF hyperparameter tuning** result holds (wider/longer doesn't rescue it). - -### Findings that need revision - -1. **ZI-QRF quality is much higher than the pilot suggested.** Stage-1 coverage is 0.928 at 77k, not 0.256.
The G1 cross-section is in way better shape than the pre-snap numbers implied. -2. **ZI-QDNN is legitimately competitive.** Pre-snap 0.147 looked mediocre; post-snap 0.707 is respectable. In production if compute budget allows, ZI-QDNN is a reasonable fallback. -3. **The "ZI-MAF is broken" claim is softer than the pre-snap numbers.** At 0.106 it's still worst, but it's not "1% coverage is so bad no amount of calibration rescues it." 10.6% is bad but measurable; the calibrate-on-synth result (mean rel err 15) still says the structure is too far off to rescue via weights, but the PRDC gap is not orders-of-magnitude. - -### How confident to be - -Four independent robustness checks still agree (raw 50-d PRDC at 40k, raw 50-d PRDC at 77k, embedding 16-d PRDC at 40k, calibrate-on-synth at 20k). Adding the snap fix to stage-1 gives a fifth confirmation. Ordering is robust; absolute numbers finally match the fix. - -## What this means for G1 - -The headline is now cleaner: **ZI-QRF produces 92.8% PRDC coverage on a held-out 15k-record slice of enhanced_cps_2024 at 77k × 50 scale in 37 seconds.** That's a production-credible starting point. Downstream calibration via MicrocalibrateAdapter will pull weighted aggregates to target. We have a working cross-section synthesizer. - -The next-action playbook (launch v7 with `--calibration-backend microcalibrate`, see `docs/quickstart-rewire.md`) stays the same. This snap fix is a measurement improvement, not a direction change. - -## Artifacts - -- `artifacts/stage1_40k_snap.json` -- `artifacts/stage1_40k_snap.jsonl` -- `artifacts/stage1_77k_snap.json` -- `artifacts/stage1_77k_snap.jsonl` - -Reproduction: - -```bash -uv run python -m microplex_us.bakeoff --stage stage1 --methods ZI-QRF ZI-MAF ZI-QDNN -``` - -(Uses the snap by default in the harness.) 
diff --git a/docs/stage-contracts.md b/docs/stage-contracts.md deleted file mode 100644 index dee1685b..00000000 --- a/docs/stage-contracts.md +++ /dev/null @@ -1,114 +0,0 @@ -# Stage contracts and manifests - -The canonical stage registry lives in -`microplex_us.pipelines.stage_contracts`. It defines each stage's purpose, -expected inputs, outputs, artifacts, diagnostics, validation placeholders, and -resume mode. - -Saved artifact bundles now include a `stage_manifest.json` derived artifact. This -file is the machine-readable saved-run overlay for the stage taxonomy. It records the -canonical stages, status for the current run, artifact paths, diagnostics owned -by each stage, and the current resume posture. - -`status` is the saved-artifact readiness view: it reports whether the artifacts -for that stage are ready, incomplete, missing, metadata-only, or deferred. -`lifecycleStatus` is the runtime view: it reports whether the stage is pending, -running, complete, failed, or deferred in the current run. Keeping these fields -separate lets a failed run say both "Stage 5 failed" and "Stage 4's saved -artifact is ready for manual replay." - -Each saved bundle also includes typed per-stage output manifests at -`stage_artifacts/manifests/{stage_id}.json`. These manifests are written through -`USStageRunWriter`, which validates each stage as a whole instead of updating -individual manifest keys directly. The manifest files live outside each stage's -payload directory so they do not change the content hash of reloadable stage -artifacts. - -Live runs can use `USStageRuntimeWriter` to write those same per-stage manifests -incrementally. The writer exposes `start_stage`, `update`, `record_output`, -`record_diagnostic`, `complete_stage`, `fail_stage`, `defer_stage`, and -`finalize_from_artifact_manifest`. A stage can start only after the immediately -previous stage is complete unless explicit stage-input overrides are enabled.
-The canonical multi-source versioned build path reserves the versioned artifact -directory before loading sources, writes Stage 1 immediately, writes Stage 2 as -source frames load, then finalizes all stage manifests against the completed -artifact manifest during save. - -Other versioned convenience entry points still reconstruct their stage manifests -from the completed saved artifact manifest. They expose the same saved-run -contract files, but they do not yet produce live per-stage lifecycle updates -while the build is running. - -The registry exposes two seam layers: - -- `inputs` and `outputs` are structured stage resources. They identify artifact, - config, manifest, runtime, and external-data dependencies with explicit keys. -- `consumes` and `produces` remain short human-readable summaries for diagrams - and documentation. - -Artifact `required` means required for a complete canonical saved bundle. It is -separate from `resume_role`, which says whether an existing artifact is useful -for diagnostics, manual replay, manual resume, or post-artifact validation. -Partial bundles can therefore still expose a valid replay boundary while the -manifest honestly reports that the complete publication bundle is incomplete. - -## Legacy run-contract IDs - -Older run-contract summaries and dashboard payloads used operational labels -such as `preflight`, `seed_build`, `donor_integration`, -`policyengine_materialization`, `calibration`, and `finalization`. New saved-run -views should report the canonical 9-stage IDs while preserving the old labels as -legacy provenance when present. - -`canonicalize_us_pipeline_stage_id` maps those historical IDs into the stage -registry. The dashboard applies that mapping when reading `run_summary.json`, so -old and new runs sort into the same stage taxonomy instead of creating a second -parallel lifecycle. - -## Resume artifacts - -The first implementation is explicit rather than automatic. 
It writes reusable -boundary artifacts where the pipeline already has stable outputs: - -- Stage 4: `stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet` -- Stage 5: `seed_data.parquet` and `synthetic_data.parquet` -- Stage 6: `stage_artifacts/06_policyengine_entities/` for the pre-calibration - PolicyEngine entity-table checkpoint -- Stage 7: `calibrated_data.parquet`, `targets.json`, and - `stage_artifacts/07_calibration/calibration_summary.json`, plus the calibrated - PolicyEngine entity-table bundle used by dataset export -- Stage 8: `policyengine_us.h5` -- Stage 9: validation and benchmark evidence artifacts - -The Stage 4 artifact is the scaffold-projected seed before donor integration. It -is a diagnostic and manual replay boundary, not an automatic conditional resume -point yet. - -Conditional execution is intentionally narrow in this implementation. Stage 9 -validation and benchmarking can be replayed against an existing complete Stage 8 -dataset through `microplex-us-stage9-replay`. Earlier-stage conditional source -loading, donor integration, synthesis, calibration, and automatic graph -scheduling remain future work. The stage manifest and artifacts are designed to -make those routes possible later without changing the saved-run contract again. - -## Artifact inventory and readiness - -Saved bundles also expose two Stage 8 diagnostic artifacts: - -- `stage_artifacts/artifact_inventory.json` lists canonical stage artifacts, - whether each path exists, whether it was referenced by the run manifest, its - resume role, size/file counts, and content hashes. -- `stage_artifacts/conditional_readiness.json` summarizes which stage outputs - are available for manual replay, manual resume, post-artifact evidence, or - diagnostics only. - -These reports are advisory. They do not skip or rerun stages, and they do not -silently accept stale artifacts. If a requested config is supplied to the -readiness builder, config mismatches are reported as `must_rerun`. 
- -## Validation hooks - -Each stage contract includes concise validation descriptors. These describe the -checks the stage should eventually own, but they do not run a shared validation -engine yet. That keeps this change focused on contracts, artifacts, and docs -without changing build behavior. diff --git a/docs/superseding-policyengine-us-data.md b/docs/superseding-policyengine-us-data.md deleted file mode 100644 index 0f1ec7bb..00000000 --- a/docs/superseding-policyengine-us-data.md +++ /dev/null @@ -1,431 +0,0 @@ -# Superseding `policyengine-us-data` - -This document is the current working roadmap for fully superseding -`policyengine-us-data` on the US path. - -It is not a paper claim. It is the operational plan that ties together: - -- the `microplex-us` runtime -- the `microplex-evals` benchmark stack -- the current PE-US measurement contract -- the remaining gates between "diagnostic replacement path" and "real - supersession" - -It also uses one important framing rule: - -- `microplex-us` is not defined as a rebuild of `policyengine-us-data` -- `policyengine-us` plus the active targets DB are the measurement oracle -- `policyengine-us-data` is the incumbent comparator and interface reference - -## Core principle - -`policyengine-us-data` is the incumbent dataset, not truth. - -Truth is the active PE-US target database, measured through the shared -`policyengine-us` runtime. So the supersession question is: - -> Can `microplex-us` produce a PE-ingestable dataset that is more useful than -> `policyengine-us-data` on the real PE-US target estate, under the same -> measurement operator, with stable runtime and artifact discipline? - -That means "supersede" is not one thing. It has layers. - -## Three distinct success claims - -We should keep three different claims separate: - -1. 
Architectural supersession - - `microplex-us` is a cleaner, more modular, more spec-driven US runtime - than `policyengine-us-data`, with better provenance, eval discipline, and - portability. -2. PE-construction parity - - for the important mapping and rule layers, `microplex-us` either matches - PE's construction logic or differs intentionally with the difference - documented. -3. PE-benchmark superiority - - the resulting Microplex build beats matched-size PE baselines on the - canonical PE-native benchmark frontier. - -These are ordered. (1) is already valuable on its own. (2) is the main bridge -between a cleaner architecture and a trustworthy replacement claim. (3) remains -the ultimate performance goal, but it should not be the only lens used to judge -progress. - -## What superseded means - -There are four increasingly strong meanings of supersession: - -1. Benchmark supersession - - On the canonical PE-US broad frontier metric, a Microplex build beats the - matched-size PE baseline. -2. Runtime supersession - - The Microplex build/export/evaluate path is stable enough to replace the - PE-US-data build path for regular US experiments. -3. Local-area supersession - - The Microplex path is good enough to replace PE-US-data for the - local-area and subnational production use cases we actually care about. -4. Architectural supersession - - The Microplex approach is no longer "PE-US-data plus wrappers"; it is the - canonical US runtime, with only shared measurement still delegated to - `policyengine-us`. - -Today we are somewhere between (1) as an active benchmark mission and a partial -version of (2) for research runs. We are not yet at (3) or (4). - -In practice, the current program should read as: - -1. architecturally supersede the PE-US-data build path -2. prove parity or intentional difference at the construction layer -3. 
then push for stable benchmark superiority - -## Non-goals - -This roadmap is **not**: - -- a plan to replace `policyengine-us` -- a plan to declare victory from one narrow target slice -- a plan to port unstable US abstractions into `microplex` core early -- a plan to freeze a final architecture before the benchmark frontier settles - -## Current replacement contract - -The intended replacement path already has these pieces: - -1. Canonical source loading - - CPS, PUF, and other source providers load into observation frames with - source metadata and variable capability metadata. -2. Canonical fusion and donor semantics - - source registry - - variable semantic registry - - donor block specs - - native-entity-aware donor execution where IDs exist -3. PE-US-compatible entity build/export - - a final H5 that `policyengine-us` can ingest directly -4. Real-target evaluation - - candidate and PE baseline are both scored through the same PE-US - materialization/target compiler stack -5. Durable run registry - - artifact bundle - - `policyengine_harness.json` - - `run_registry.jsonl` - - `run_index.duckdb` -6. 
Separate eval workspace - - method bakeoffs, family benchmarks, and paper-facing evidence live in - `microplex-evals` - -## Canonical mission metric - -The current US mission is: - -- beat PE on the PE-native broad loss frontier - -The canonical comparison should be: - -- `Microplex@N` vs `PE@N` - -where: - -- `N` is matched household/sample scale -- PE is allowed to be reweighted/recalibrated after sampling where the - comparison contract requires that - -Important caveat: - -- the full `enhanced_cps_2024` PE dataset remains the stretch reference, but it - is not the only pass/fail bar - -## Full roadmap - -### Phase 0: Measurement and artifact discipline - -Goal: - -- make every serious claim reproducible and comparable - -Required capabilities: - -- common-target PE-US harness -- active targets DB as truth -- durable artifact bundles -- run registry and DuckDB frontier index -- explicit baseline comparison against `policyengine-us-data` - -Status: - -- mostly done - -Exit criteria: - -- every headline run writes the standard artifact bundle -- frontier selection is reproducible from the registry/index alone -- candidate vs baseline comparisons are apples-to-apples on common targets - -### Phase 1: PE-compatible US runtime replacement - -Goal: - -- replace ad hoc dataset construction with a library-first US build/export path - -Required capabilities: - -- source providers with canonical metadata -- fusion planning -- donor integration from declarative source/variable semantics -- PE-style entity table build -- PE-ingestable final H5 export - -Status: - -- done enough for research use, not frozen - -Exit criteria: - -- the standard US build path no longer depends on one-off scripts -- the H5 export is stable enough to be the default input for PE-native scoring -- major semantic guards live in declarative specs, not pipeline hacks - -### Phase 2: Record-construction superiority - -Goal: - -- build records that are structurally more believable than the incumbent 
path - -Why this phase matters: - -- current evidence says record construction/support is still a larger bottleneck - than the final weight objective -- small calibration tricks do not rescue structurally weak records - -Required capabilities: - -- better source-backed semantics -- decomposable-family modeling where relevant -- support-aware imputation benchmarks -- explicit support diagnostics for important policy variables and family - decompositions - -Primary evidence: - -- method benchmark -- family benchmark -- family portfolio screens - -Status: - -- active - -Exit criteria: - -- the chosen runtime imputation stack clears the eval gates on the current - family portfolio -- no major support family remains known-broken on core US variables needed by - PE-US targets -- record realism improvements survive beyond one family or one source - -### Phase 3: Full-support candidate construction and selection - -Goal: - -- generate a candidate population with enough real support that PE-native - selection can operate on a strong search space - -Why this phase matters: - -- current broad-read evidence says full-support candidate construction plus - budgeted household selection is a stronger lever than source subsampling or - post-export weight tuning alone - -Required capabilities: - -- full-support candidate generation -- household-budgeted selection backends -- PE-native-loss-based selector path -- diagnostics for feasibility drop and weight collapse - -Status: - -- active, not solved - -Exit criteria: - -- the full-support selector path beats the current simpler broad baselines - consistently -- selector gains survive through export and post-selection calibration -- the selected population does not rely on extreme weight collapse to win - -### Phase 4: Broad frontier superiority - -Goal: - -- beat PE on the canonical US broad frontier, not just on a narrow diagnostic - slice - -Canonical score: - -- PE-native broad loss on common targets - -Secondary diagnostics: - -- 
target win rate -- supported target rate -- common-target MARE -- family and target-delta analysis from the run index - -Status: - -- not done - -Required evidence: - -- repeated broad runs, not one lucky artifact -- matched-size PE baseline comparisons -- no hidden narrowing of the target estate without explicit benchmark contract - changes - -Exit criteria: - -- Microplex wins on the canonical broad mission score against matched-size PE - baselines -- the win survives repeated runs and nearby configuration changes -- the win is not solely explained by an overly favorable or infeasible target - slice - -### Phase 5: Held-out and local-area replacement - -Goal: - -- move from parity-style broad wins to replacement-quality downstream behavior - where local-area and subnational use cases matter - -Why this phase exists: - -- broad parity alone is not enough for real replacement -- current docs explicitly say held-out target evaluation is not the default yet -- local-area production replacement is still future work - -Required capabilities: - -- held-out or shifted-target evaluation loops -- better subnational/local-area target coverage -- production-relevant calibration scopes -- explicit replacement checks for the downstream use cases that still favor - PE-US-data today - -Status: - -- future work - -Exit criteria: - -- local-area/subnational replacement claims are backed by explicit benchmark - contracts -- the path is no longer winning only on broad national/state composites while - failing production-critical local slices - -### Phase 6: Supersession and extraction - -Goal: - -- make Microplex the default US data path and extract only the stable generic - pieces upward - -Required capabilities: - -- stable benchmark win -- stable runtime/export path -- clear country-pack/core boundary -- reusable abstractions promoted to `microplex` only after surviving a second - adapter - -Status: - -- future work - -Exit criteria: - -- `microplex-us` is the canonical US 
dataset-generation path -- the PE-US-data path is treated as incumbent baseline/reference, not the - default runtime dependency -- only stable benchmark/runtime semantics have moved to `microplex` - -## Current blockers - -As of April 5, 2026, the highest-leverage blockers are: - -1. Record construction/support is still incomplete. - - the eval stack still rejects current aggregate challengers because support - realism is not good enough across families -2. Construction parity is still only partially audited. - - some high-value families now have explicit parity evidence, but the - construction/mapping contract is not yet written down broadly enough to - call the system PE-equivalent at the rules layer -3. Full-support candidate selection gains are still being damaged downstream. - - current build-log evidence points to post-selection entropy calibration - undoing the strongest selector path -4. Broad superiority is not stable yet. - - broad wins exist on some metrics/slices, but they do not yet amount to a - stable "Microplex broadly beats PE" claim -5. Held-out and local-area replacement are not yet in the default loop. - -## Current operating sequence - -This is the current working order of operations: - -1. Use `microplex-us` to replace PE-US-data construction logic with clearer - source specs, variable semantics, and export contracts. -2. Keep an explicit PE construction parity matrix so "match", "close", - and "intentionally different" are written down instead of implied. -3. Use `microplex-evals` to choose and prune runtime methods where method-level - variation matters. -4. Re-run broad PE-native frontier experiments on matched-size baselines at - regular checkpoints rather than after every local change. -5. Only when broad evidence stabilizes, expand held-out/local-area replacement - work. -6. Only when a pattern survives US and appears likely to generalize, lift the - abstraction into `microplex` and then port the shape to UK. 
- -## Concrete gates for saying "we superseded it" - -For practical purposes, we should not say "Microplex supersedes -`policyengine-us-data`" until all of these are true: - -1. Runtime gate - - the library-first US build/export path is the normal path for serious US - runs -2. Broad benchmark gate - - Microplex beats matched-size PE baselines on the canonical PE-native broad - loss frontier -3. Stability gate - - the win survives repeated runs and nearby build/config perturbations -4. Support gate - - the chosen runtime method stack clears the support-realism bar on the eval - portfolio -5. Local-area gate - - production-relevant local/subnational replacement work is no longer a - known missing layer - -If only the first three are true, we can say: - -- Microplex has a credible broad replacement path - -If the runtime gate and construction parity are true, but the broad benchmark -gate is not yet true, we can still say: - -- Microplex has architecturally superseded the PE-US-data build path - -If all five are true, we can say: - -- Microplex has fully superseded `policyengine-us-data` for the intended US - use cases - -## Where this plan lives - -- runtime architecture and implementation: - - `microplex-us` -- benchmark contracts and paper-facing evidence: - - `microplex-evals` -- stable generic abstractions that survive multiple country packs: - - `microplex` - -That split is intentional. The plan should stay written here, but the evidence -for each phase should live in the repo that actually owns it. 
diff --git a/docs/synthesizer-benchmark-scale-up.md b/docs/synthesizer-benchmark-scale-up.md deleted file mode 100644 index 795ede5f..00000000 --- a/docs/synthesizer-benchmark-scale-up.md +++ /dev/null @@ -1,170 +0,0 @@ -# Synthesizer benchmark — what we know, and what scale-up will test - -*Draft plan for extending the existing ZI-synthesizer benchmark to production scale.* - -## What the existing benchmark tested - -Results in `microplex/benchmarks/results/benchmark_multi_seed.json` compare six synthesizers — QRF, ZI-QRF, QDNN, ZI-QDNN, MAF, ZI-MAF — on PRDC coverage across three schemas labeled `cps`, `sipp`, `psid`. - -| Method | CPS ASEC coverage | SIPP coverage | PSID coverage | -|---|---:|---:|---:| -| QRF | 0.337 | 0.938 | 0.000 | -| ZI-QRF | 0.347 | **0.950** | 0.000 | -| QDNN | 0.380 | 0.293 | 0.000 | -| ZI-QDNN | 0.406 | 0.717 | 0.000 | -| MAF | 0.398 | 0.349 | 0.000 | -| ZI-MAF | **0.499** | 0.866 | 0.000 | - -**Data used**: synthetic population generated by `benchmarks/run_benchmarks.py::generate_realistic_microdata`, 10,000 rows, **4 target variables** (`income`, `assets`, `debt`, `savings`) conditioned on **3 predictors** (`age`, `education`, `region`). The multi-survey fusion setup partially-observes this population as different "surveys" (CPS-schema sees one subset, SIPP-schema sees another, PSID-schema sees another). - -**Important**: the `cps` / `sipp` / `psid` labels in the result JSON are partial-observation schemas over the same synthetic population, not real CPS / SIPP / PSID data. 
- -## Scale gap to production - -| Dimension | Existing benchmark | Production (microplex-us G1) | Gap | -|---|---:|---:|---:| -| Rows | 10,000 | 430,000 (CPS) – 3,400,000 (ACS scaffold) | 43×–340× | -| Columns | 7 (3 cond + 4 target) | 150+ joint variables | ~22× | -| Source realism | Synthetic generator with analytical zero-inflation | Real CPS + PUF + SIPP + SCF joints with real tail structure | Categorical jump | -| Held-out set | 20% of synthetic population | TBD — ECPS baseline, external targets (SOI, BEA, Census) | — | - -Combined row × column gap: **~1,000×–8,000×**. Plus the synthetic-to-real jump, which is not measurable as a multiplier because real data has structure the generator cannot produce. - -## What we expect to break at scale - -### Coverage metric itself - -**PRDC k-NN coverage concentrates in high dimensions.** With 150+ features, nearest-neighbor distances bunch up (curse of dimensionality) and a small distance threshold starts excluding almost everything while a larger one starts including almost everything. Raw-feature PRDC above ~50 columns is typically noise-dominated without dimensionality reduction or a learned embedding. - -**Mitigation**: compute PRDC in a learned embedding (autoencoder or the synthesizer's latent space) rather than raw features. Or compute per-block PRDC on demographically-stratified cells. Or switch to a metric that scales better with dimension (MMD with an RBF kernel, or mode-wise Wasserstein). - -### ZI-QRF training - -**Quantile random forests scale poorly in both rows and columns.** - -- Row scaling: train time is roughly O(N log N) per tree; memory is O(N × features × n_trees). On 1.5M rows × 150 cols × 100 trees, that's ~180 GB for naive storage without sparse leaves. Even with efficient implementations (`quantile-forest`, `lightgbm`-style histogram trees), training time is hours-to-days on CPU for a full run. 
-- Column scaling: splits over 150+ features explore a larger hyperparameter space; conditional coverage on rare variables gets noisier; `max_features` tuning becomes load-bearing. - -**Prediction**: ZI-QRF's dominance on small-SIPP is partly because 500-person panels fit neatly into tree leaves. At 1.5M rows, expect the advantage to narrow or invert — partly because QRF hits practical compute limits and has to subsample. - -### ZI-MAF training - -**Normalizing flows need careful hyperparameter tuning on real data.** - -- Mode-collapse risk: ZI-MAF's joint distribution over 150 variables can collapse onto a lower-dimensional manifold, especially when many variables are zero-inflated with correlated zero patterns (same person has zero across many income sources at once). -- Training time: MAF is GPU-accelerated and scales linearly in rows. 1.5M rows × 150 cols × 200 epochs is feasible on a single H100, ~several hours. On Apple Silicon (Max's 48 GB M3), ~8–16 hours with MPS backend. -- Conditioning: the existing benchmark uses 3 condition variables. Real microdata conditions on ~10–20 demographics. Adding conditioning dimensions is the easier part of scaling MAF. - -**Prediction**: ZI-MAF's lead on CPS should hold or grow at scale (flows scale well with rows). Main risk is tail coverage — top-1% income, extreme wealth — which is exactly where the SS-model application cares most. - -### ZI-QDNN training - -**Deep quantile networks scale well but need careful tuning at width + depth.** - -- Row scaling: straightforward, O(N) per epoch, linear in batch size. -- Column scaling: the pinball loss surface gets jagged with many zero-inflated targets; per-target head design matters more at 150 vars than at 4. -- Zero-inflation head: a single logistic head for `P(zero)` becomes underpowered at 150 zero-capable variables with complex joint zero patterns (observing income=0 informs dividends=0 informs wages=0). Joint zero-mask modeling is probably needed. 
- -**Prediction**: ZI-QDNN as currently implemented will degrade fastest under scale-up without a joint zero-mask head. Worth testing whether a graph-structured zero-mask extension rescues it. - -### PRDC coverage = 0 on PSID across all methods - -This is unresolved in the existing benchmark and is the single most important thing to diagnose before the SS-model longitudinal extension commits to PSID. Three hypotheses: - -1. **Test-setup degeneracy.** PSID-schema's observed-variable mask may overlap with the CPS / SIPP masks in a way that produces an empty held-out set. Check the mask logic. -2. **Panel structure breaks per-record PRDC.** PSID is a panel; a "record" could mean a person-year or a person. If the test set uses person-year and the synthesizer generates persons, coverage is trivially 0. Fix: switch to a panel-aware metric (per-person trajectory coverage) or generate person-years. -3. **Real limitation.** Attrition + sparse-year coverage in PSID creates tail records the synthesizers cannot cover. If this is the case, the SS-model trajectory training must either accept this ceiling, use a different panel source (SIPP panel, HRS, NLSY), or augment PSID with synthetic history. - -**Action**: diagnose before any PSID-dependent architecture work commits. - -## Proposed scale-up experiment protocol - -Run three stages, each keeping row count and column count explicit. All stages report three classes of metric: accuracy (coverage), cost (time + memory), and health (convergence + rare-cell preservation). - -### Stage 1 — medium rows, medium columns - -Scale: **100,000 rows × 50 columns** - -Data: subsample enhanced_cps_2024 to 100k persons, select 50 PE-native-relevant columns (income components, demographics, tax inputs, benefit receipts). Use a real subsample, not synthetic. - -Purpose: exercise real joint structure (tails, categorical constraints, zero correlations) without the full row cost. Should fit comfortably in 48 GB RAM on CPU, in hours. 
- -Metrics per method: -- PRDC coverage on 20% holdout (computed in raw features and in a 16-dim PCA embedding) -- Per-stratum coverage (age × income-bracket × filing-status cells) — specifically flag any cell with <10 records that drops to 0 coverage -- Rare-subpopulation preservation (elderly self-employed, young dividend, SSDI, top-1% earnings — the `sparse_coverage.csv` pattern) -- Training wall time -- Peak RSS during training -- Generation wall time for 100k samples -- Zero-rate MAE per variable - -### Stage 2 — large rows, medium columns - -Scale: **1,000,000 rows × 50 columns** - -Data: 10× oversample of stage 1's column set with enhanced_cps_2024 clone-and-assign style replication (as PE-US-data does for local area) to reach 1M rows. - -Purpose: expose row-scaling failures before column scaling. ZI-QRF is the most likely to fall off here. ZI-MAF should be OK. ZI-QDNN should scale cleanly. - -Same metrics as stage 1. - -### Stage 3 — full rows, full columns - -Scale: **3,373,378 rows × 155 columns** (exactly the v6 seed-ready shape, so we can compare the post-donor frame at production scale). - -Data: the actual v6 seed frame if we can retrieve it from the log (it was never persisted); otherwise regenerate by running donor integration only. Since we don't have the v6 artifact, this stage requires regenerating the seed — ~9 hours of donor integration. - -Purpose: verify which synthesizer survives production scale, in what time, at what memory cost. - -Same metrics, plus: -- Time to first valid sample (can we get ANY synthetic records out?) -- Sample quality trajectory over training time (does it stabilize, or degrade with more training?) -- Memory peak vs memory average (does it OOM on a 48 GB machine?) 
- -## Runtime expectations (rough a priori) - -Order-of-magnitude estimates for training one model to convergence on a 48 GB M3: - -| Method | Stage 1 (100k × 50) | Stage 2 (1M × 50) | Stage 3 (3.4M × 155) | -|---|---|---|---| -| ZI-QRF | minutes | hours, may OOM | days or infeasible; needs subsample | -| ZI-MAF | 30 min (CPU) / 5 min (MPS) | few hours (MPS) | 8–16 hours (MPS), needs batch tuning | -| ZI-QDNN | 15 min (CPU) / 3 min (MPS) | 1–2 hours (MPS) | 4–8 hours (MPS), lowest memory footprint | - -These are coarse and based on library benchmarks + extrapolation. The scale-up experiment's actual measurements are what we commit to. - -## Evaluation contract — matched-size comparison - -To avoid the "we ran ZI-MAF at 1M and ZI-QRF at 100k and declared a winner" trap, all three stages enforce: - -- **Same held-out split** across methods per stage (same 20% records). -- **Same feature set** across methods per stage. -- **Same wall-time budget** for training. (If ZI-QRF hits the budget without converging, that counts as its stage-3 result — "did not finish.") - -Report all three as a single table with method × stage × metric cells. Pick production defaults from this table alone, not from the existing 10k-row benchmark. - -## What this experiment would actually update - -1. **Production synthesizer default for G1.** Currently implied as ZI-MAF from the small benchmark. Scale-up may confirm or overturn. -2. **SS-model methodology doc's ZI-QDNN production claim.** If ZI-QDNN does not emerge as a clear winner at scale, the doc needs a pointer to this evaluation. -3. **PSID coverage ceiling.** If PSID coverage-0 is a real limitation, the longitudinal-training plan needs a fallback panel source. -4. **Compute budget for production runs.** Knowing that ZI-MAF needs 12 hours MPS at production scale changes how often we can iterate on synthesizer hyperparameters. - -## Out of scope (for now) - -- Training on real-panel data at scale. 
The stage-3 experiment uses the cross-section; panel synthesis is a separate scale-up that depends on PSID-coverage diagnosis first. -- Comparing against external non-microplex synthesizers (CTGAN, TVAE, TabDDPM, TabPFN) at full scale. Do after internal best is clear. -- Runtime on GPU clusters. Local laptop numbers first; remote GPU only if production bottleneck demands it. - -## Risks to the experiment itself - -1. **Retrieving the v6 seed frame requires rerunning donor integration** (~9h) because v6 never persisted. A cheaper alternative: use the enhanced_cps_2024 HDF5 at its native scale (~400k persons × ~250 columns — already close to stage-3 scale) and adapt the donor conditioning. -2. **PRDC in 150D is likely noise.** Budget time for the embedding-based variant before committing to any absolute coverage number. -3. **ZI-QRF may be infeasible at stage 3.** That is itself a finding; have a fallback "QRF on top-20-important-columns" variant ready to report as a scale-constrained baseline. -4. **The existing synthesizers may not even run at stage 3** without code changes (memory bugs at scale). Budget for 1–2 days of debugging on first attempt. - -## Minimum useful subset - -If full three-stage execution is too costly as a first pass, the minimum that informs the rearchitecture direction is **stage 1 alone**: 100k real-subsample rows × 50 real-feature columns, running all three ZI variants, reporting coverage + runtime + rare-cell preservation. - -That alone would invalidate or confirm the small-benchmark conclusions and give us enough signal to pick a G1 default. diff --git a/docs/v6-postmortem.md b/docs/v6-postmortem.md deleted file mode 100644 index 11d2bf70..00000000 --- a/docs/v6-postmortem.md +++ /dev/null @@ -1,77 +0,0 @@ -# v6 post-mortem — 2026-04-16 - -Record of the `broader-donors-puf-native-challenger-v6` run (launched 2026-04-16 10:20:10 ET, died 22:56:05 ET). - -## Outcome - -**RUN_EXIT status=1** after 12h 36m of wall time. 
Killed by the kernel during entropy calibration. No artifact directory created; no final dataset persisted. - -## Timeline of the post-donor window - -The post-donor stage instrumentation (commit `960ac2f`) was the single highest-value diagnostic change of the session. It let us localize the OOM to a specific named stage for the first time. - -| Time (ET) | Stage marker | -|---|---| -| 10:20:10 | RUN_START | -| ~19:29 (9h 9m in) | last donor block complete (`scf_2022/social_security_pension_income`) | -| 21:04:03 | `seed ready` → `targets start`/`complete` → `synthesis variables ready` → `synthesis start`/`complete` → `support enforcement start`/`complete` → `policyengine tables start` (all in one burst; synthesis backend = seed-copy so the burst is dominated by the strip+cap pass between donor integration and tables) | -| ~22:25 | `policyengine tables complete` [households=1,505,108, persons=3,373,378] | -| ~22:25 | `policyengine calibration start [backend=entropy]` | -| 22:56:05 | RUN_EXIT status=1, kernel signal (macOS `time -l` reported "signal: Invalid argument" on the wrapper) | - -## Memory signature - -From macOS `time -l` rusage at exit: - -| Metric | v6 | v4 (previous run) | -|---|---|---| -| Wall time | 45,355 s (12h 36m) | 39,476 s (10h 58m) | -| Max RSS | 22.0 GB | 20.5 GB | -| Peak phys_footprint | 293 GB | 287 GB | -| Instructions retired | 614 T | 612 T | -| Involuntary context switches | 317 K | 264 K | - -v6's signature is nearly identical to v4's — same killer, same point. - -## Diagnosis - -**`calibrate_policyengine_tables` with `backend=entropy` on 1.5M households is the OOM killer.** - -Proximate cause: a 48 GB machine cannot hold the working set the entropy solver needs for that scale. Peak phys_footprint of 293 GB on 48 GB RAM implies heavy compression and swap pressure; eventually the kernel kills the process. 
- -Likely underlying structural cost (not measured, but fits the profile): - -- Entropy calibration materializes a dense Jacobian-like matrix roughly `(n_households × n_constraints)` in float64. -- With 1,505,108 households and ~1,255 constraints post-feasibility-filter (from the 2026-03-30 review), that's 15 GB for a single copy. Multiple working copies (gradient, Hessian approximation, line-search scratch) easily exceed RAM. -- `_evaluate_policyengine_target_fit_context` then runs a full PolicyEngine simulation on the calibrated frame, which adds its own memory cost on top. - -## What survived - -v6 demonstrated that the **tables-build phase works at scale**: `build_policyengine_entity_tables` successfully produced a 1.5M-household × 3.4M-person entity bundle. This was an open question after v4. The stage isn't free (roughly 1h 25m at 180–210% CPU, RSS oscillating 0.2–16%), but it doesn't OOM. - -The donor integration also ran clean. All 129 donor blocks across CPS ASEC, IRS SOI PUF, SIPP tips, SIPP assets, and SCF completed without failure. The tax-unit entity-bundle construction took ~89 min (one-time cost per run). Multi-source donor imputation is not the bottleneck. - -## What v6 ruled out as the killer - -The initial v4 diagnosis hypothesized the silent post-donor window might be in synthesis, support enforcement, or tables-build. v6's instrumentation showed those all complete instantly or within ~1.5 hours. The killer is specifically **entropy calibration**, not an earlier stage. - -## What this means for the architecture direction - -v6 is an evidence point *for* the `spec-based-ecps-rewire` direction rather than against it: - -1. **Entropy calibration on a 1.5M-household monolithic solve is a dead end on a 48 GB machine.** The rearchitecture's hierarchical / identity-preserving calibration pattern (national → state → stratum, `microcalibrate`-style chi-squared) avoids the dense-matrix blow-up by chunking over strata. -2. 
**Scaffold scale is the real lever.** The 3.4M-row ACS scaffold drives both tables-build size and calibration-matrix size. CPS-core at ~430k persons cuts this at the source. -3. **The instrumentation pattern is reusable.** Keeping named stage markers at every pipeline boundary in the new pipeline will make any future OOM localizable in a single run rather than requiring multiple exploratory runs. - -## What v6 does NOT tell us - -- Whether the imputation quality would have beaten `enhanced_cps_2024` on PE-native broad loss had it finished. No parity artifact was produced. -- Whether the `pe_plus_puf_native_challenger` condition selection is an improvement. Moot now that the pipeline direction is changing. -- The actual numerical Calibrator's behavior on 1.5M households. The failure was upstream of any Calibrator numerical work — the process died while setting up the constraint matrices. - -## Status of v6 artifacts - -- Log file: `artifacts/live_pe_us_data_rebuild_checkpoint_20260414_pe_plus_puf_native_challenger_broader/broader-donors-puf-native-challenger-v6.log` (~2,224 lines) -- No output artifact directory (build never completed persistence step) -- tmux session: cleaned up -- No action required on artifacts — they stay on disk as part of the experiment trail. diff --git a/docs/zi-factorial.md b/docs/zi-factorial.md deleted file mode 100644 index d2673116..00000000 --- a/docs/zi-factorial.md +++ /dev/null @@ -1,100 +0,0 @@ -# ZI × draw-method factorial at 77k × 50 - -*Answers Max's question: should the zero-inflation strategy be chosen independently of the draw method?* - -## Design - -Four draw methods × two zero-inflation variants = eight cells. All runs on Enhanced CPS 2024 at 77,006 records × 50 columns, PRDC capped at 15,000 samples, seed 42. - -- **No ZI**: base method (`CART`, `QRF`, `QDNN`, `MAF`) — fit one per-column model on the full training set, sample or predict directly at generation. 
-- **ZI**: base method preceded by a `RandomForestClassifier` (50 trees) predicting $P(y > 0 \mid x)$ when training-set zero fraction exceeds 10 %. The per-column model is then fit on the non-zero subset only, and at generation time the draw is zero with probability $1 - \hat{P}(y > 0 \mid x)$. - -## Results - -PRDC coverage (bold per row = best within that draw method): - -| Draw method | No ZI | ZI | Δ | Zero-rate MAE (No ZI) | Zero-rate MAE (ZI) | -|---|---:|---:|---:|---:|---:| -| CART | 0.9055 | **0.9098** | +0.004 | 0.013 | 0.013 | -| QRF | 0.9328 | **0.9341** | +0.001 | 0.015 | 0.013 | -| QDNN | 0.6033 | **0.7068** | +0.103 | **0.582** | **0.136** | -| MAF | **0.0986** | 0.0928 | −0.006 | **0.332** | **0.081** | - -## Reading - -1. **CART and QRF are essentially indifferent to the ZI wrapper.** Coverage differences are within single-seed noise (< 0.005), and zero-rate MAE is nearly identical across the two configurations. Both methods' per-column draws naturally preserve zero mass: CART's leaf-sample-from-empirical produces zeros at the training-set leaf rate, and QRF's quantile draws reproduce zero quantiles when a leaf's training distribution has mass at zero. The RF zero-classifier is redundant for these methods. - -2. **QDNN genuinely needs ZI handling.** Coverage jumps 0.603 → 0.707 (+0.103) and zero-rate MAE drops 0.582 → 0.136. Without ZI, QDNN produces continuous-valued quantile predictions that never exactly equal zero, so all 0-valued real records are mis-covered. The ZI classifier essentially masks the neural draw to zero for records the classifier thinks are zero, restoring a credible zero-rate structure. - -3. **MAF is broken with or without ZI.** Coverage stays near 0.09, zero-rate MAE is terrible under both configurations. The per-column-independent MAF architecture is the binding constraint; the ZI wrapper saves the zero-rate MAE from 0.33 to 0.08 (helpful for diagnostics but not enough to fix coverage). 
Hyperparameter expansion didn't close the gap either (see `zi-maf-hyperparameter-search.md`). - -## Does ZI choice depend on draw method? Yes. - -The factorial reveals that the "ZI wrapper" is a no-op for draw methods whose leaf- or quantile-level draws already preserve zero structure implicitly (CART, QRF), and a critical fix for draw methods that produce smooth continuous predictions (QDNN, MAF). There is no single best ZI strategy; the right choice depends on what the draw method does with zero observations. - -This has two practical implications: - -1. **`ZIQRFMethod` and `ZICARTMethod` do not justify their extra complexity.** The `_MultiSourceBase` inheritance pattern that adds an RF zero-classifier before a QRF or CART draw adds 1–2 seconds of compute and meaningful memory (ZI-CART 7.8 GB vs CART 0.5 GB, because the RF classifier is kept in memory alongside the CART per column) for essentially zero accuracy gain. Production pipelines using tree methods should consider the base variants directly. - -2. **For neural methods, the ZI classifier is not optional.** QDNN without ZI produces a zero-rate MAE of 0.582 (vs 0.136 with ZI) and 10 coverage points of damage. Any paper or benchmark that tests QDNN-family synthesizers without explicit zero handling is measuring a different (and worse) method. - -## Production recommendation update - -The cross-section synthesizer recommendation becomes: - -- **CART (plain, no ZI)** — fastest path, competitive accuracy, and simplest to reason about. Near-synthpop default. -- **QRF (plain, no ZI)** — accuracy maximizer, ~5× the fit time of CART for 2 points of coverage. -- **Avoid ZI wrappers on tree methods.** They don't help. -- **Do use ZI wrappers on neural methods.** They rescue a substantial fraction of the damage, though not all of it. - -## ZI classifier comparison (QDNN) - -Having established that the ZI wrapper matters for QDNN, the next question is whether a different zero-classifier improves ZI-QDNN.
Five classifiers were swapped into `ZI-QDNN`'s pipeline on the 77k × 50 benchmark (seed 42): - -| Classifier | Coverage | Precision | Zero-rate MAE | Fit (s) | -|---|---:|---:|---:|---:| -| **RF (default, 50 trees, uncalibrated)** | **0.7081** | 0.8343 | 0.1359 | 100 | -| HistGradientBoostingClassifier | 0.7017 | 0.8334 | 0.1370 | 137 | -| MLP (64 × 32, Adam, early stop) | 0.6984 | 0.8397 | 0.1376 | 130 | -| RF + isotonic calibration (3-fold) | 0.6983 | 0.8309 | 0.1370 | 109 | -| Logistic regression | 0.6941 | 0.8336 | 0.1362 | 107 | - -All five classifiers cluster within 0.014 coverage points; adjacent entries differ by at most 0.006, i.e. only a few multiples of our multi-seed standard deviation (≈0.002–0.003). **The ZI classifier choice does not meaningfully affect coverage on QDNN at this scale and schema.** The 50-tree RF default is effectively optimal among the alternatives tested. - -The interpretation is that the information content of $P(y > 0 \mid x)$ is already captured by a 50-tree RF — a stronger classifier (HistGB, DNN) does not extract additional signal, calibrated probabilities do not propagate to better coverage, and logistic regression is mildly worse because its linear decision boundary under-fits on some columns. - -What would actually lift ZI-QDNN above 0.71 coverage is not a better zero-classifier but an architectural change: joint zero-mask modeling (one classifier predicting the full 36-dim zero pattern so cross-target zero correlations are captured), joint quantile output (shared-backbone multivariate QDNN), or post-hoc calibration of the quantile network's own pinball-loss output. These are deferred future work. - -## Isolated log-loss evaluation - -The coverage tie above could mean either (a) the five classifiers produce genuinely similar $P(y > 0 \mid x)$, so the downstream is honestly reporting, or (b) the classifiers differ materially but the QDNN non-zero draw's error swamps the signal. An isolated per-column evaluation decouples the two.
- -Protocol: same outer 80/20 train/holdout split as the coverage benchmark (seed 42), then an inner 80/20 split within training into fit/val (49,283 fit, 12,321 val). For each of the 36 target columns with training-set zero-fraction ≥ 10 % (26 eligible columns), each classifier is fit on (`X_fit`, `(~at_min)_fit`) and scored on val with log-loss, Brier, equal-width ECE (10 bins), and ROC-AUC. - -| Classifier | Log-loss (mean) | Log-loss (median) | Brier | ECE | AUC (mean) | AUC (median) | -|---|---:|---:|---:|---:|---:|---:| -| **HistGB** | **0.2252** | **0.1712** | **0.0707** | **0.0050** | **0.809** | **0.822** | -| DNN | 0.2337 | 0.1956 | 0.0732 | 0.0070 | 0.748 | 0.773 | -| RF + isotonic (3-fold) | 0.2343 | 0.1834 | 0.0739 | 0.0081 | 0.763 | 0.780 | -| Logistic regression | 0.2468 | 0.2028 | 0.0770 | 0.0180 | 0.756 | 0.763 | -| RF default (50 trees, uncalibrated) | 0.3095 | 0.2523 | 0.0810 | 0.0394 | 0.737 | 0.762 | - -**The isolated picture is the opposite of the coverage picture.** The default 50-tree RF — the classifier that was effectively tied on PRDC coverage — is the *worst* classifier on log-loss (spread 0.085, about 6× the coverage spread), Brier, AUC, and calibration. Its ECE is ~8× worse than HistGB's. The AUC gap between RF (0.737) and HistGB (0.809) is 7 points — well outside any plausible noise band. - -This resolves the earlier ambiguity cleanly: - -1. **The ZI classifier choice does matter for the quantity the ZI wrapper is ostensibly predicting.** HistGB has meaningfully better $P(y > 0 \mid x)$ than an uncalibrated 50-tree RF on nearly every axis — log-loss, Brier, calibration, discrimination. - -2. **But the downstream QDNN draw swamps the signal.** Seven points of AUC and an order-of-magnitude calibration improvement produce zero coverage gain. The bridging logic (zero with probability $1 - \hat{P}(y > 0 \mid x)$, otherwise draw from the non-zero QDNN) is dominated by error in the non-zero draw, not error in the classifier. - -3. 
**The binding constraint for ZI-QDNN's coverage is downstream of the classifier.** Swapping classifiers alone cannot lift ZI-QDNN past 0.71 coverage — this requires improving the non-zero quantile output (joint modeling, pinball-loss recalibration, architectural change). - -There is a secondary implication for uses of the zero-classifier as a diagnostic rather than a generator component: if we ever surface $\hat{P}(y = 0 \mid x)$ as a subgroup-level or record-level signal (e.g., "this household is 80% likely to have zero long-term capital gains"), the RF default is not the right model. HistGB or a calibrated RF should be preferred there, because the calibration and discrimination gaps that are invisible on coverage become directly user-visible on calibration plots and top-k retrieval. - -## Artifacts - -- `artifacts/stage1_77k_no_zi.json` — pure QRF, QDNN, MAF at 77k -- `artifacts/stage1_77k_cart_variants.json` — CART, ZI-CART, ZI-QRF at 77k -- `artifacts/stage1_77k_4methods.json` — ZI-CART, ZI-QRF, ZI-QDNN, ZI-MAF at 77k -- `artifacts/zi_classifier_comparison.json` — 5 ZI classifiers on QDNN at 77k (coverage) -- `artifacts/zi_classifier_isolated_eval.json` — 5 ZI classifiers in isolation (log-loss / Brier / ECE / AUC) diff --git a/docs/zi-maf-hyperparameter-search.md b/docs/zi-maf-hyperparameter-search.md deleted file mode 100644 index aae83eaf..00000000 --- a/docs/zi-maf-hyperparameter-search.md +++ /dev/null @@ -1,90 +0,0 @@ -# ZI-MAF hyperparameter search — does tuning rescue the method? - -*Direct test of the stage-1 follow-up flagged in `docs/stage-1-pilot-results.md`.* - -## Setup - -40,000 rows × 50 columns of real enhanced_cps_2024 (identical to stage-1). ZI-MAF trained at four progressively bigger configurations on the same seed and split. PRDC evaluated in 50-dim raw feature space, capped at 15 k × 15 k samples (same cap as stage-1 77 k). 
- -| Config | n_layers | hidden_dim | epochs | batch | lr | Approx params | -|---|---:|---:|---:|---:|---:|---:| -| default | 4 | 32 | 50 | 256 | 1e-3 | baseline | -| wide | 4 | 128 | 50 | 256 | 1e-3 | 4× params | -| long | 4 | 32 | 200 | 256 | 1e-3 | 4× training | -| wide+long | 8 | 128 | 200 | 256 | 5e-4 | 16× both + deeper | - -## Results - -| Config | Coverage | Precision | Density | Fit (s) | Gen (s) | -|---|---:|---:|---:|---:|---:| -| default | 0.0262 | 0.0083 | 0.0038 | 124 | 0.7 | -| wide | 0.0293 | 0.0088 | 0.0043 | 228 | 0.8 | -| long | 0.0318 | 0.0097 | 0.0048 | 467 | 0.6 | -| wide+long | **0.0328** | 0.0107 | 0.0050 | 1,711 | 1.0 | - -Fit time to get from 0.026 → 0.033 coverage: 14× the compute budget. Compare to ZI-QRF on the same data at the same PRDC cap: **coverage 0.352 in 19 s**. - -## Verdict - -**ZI-MAF is confirmed non-competitive at stage-1 scale with the method-class architecture.** Expanding capacity (4× width), training longer (4× epochs), and doing both with deeper layers (16× total + 8 layers) moves coverage from 0.026 to 0.033 — a 25 % relative improvement. ZI-QRF's 0.352 is 10 × higher at 1/90 the fit time. - -The stage-1 finding stands: ZI-QRF is the production synthesizer, not ZI-MAF. No amount of hyperparameter tuning at the default architectural level is going to close a 10× gap. - -## Why ZI-MAF fails here - -Hypotheses, ordered by how plausible they seem on this evidence: - -1. **Per-column independence.** `ZIMAFMethod` trains one `ConditionalMAF` per target column independently. With 36 target columns, 36 flows each only learn `P(col_i | conditioning)` — there's no mechanism to capture cross-target correlations (e.g., someone with high wage income also has zero SNAP). Joint-target flows would be architecturally different but expensive. 
Tree methods (ZI-QRF) implicitly capture some of these via the conditioning features, but their per-column independence is less damaging because each tree doesn't try to encode a full joint distribution. - -2. **Zero-inflation classifier + flow combo.** The method first classifies P(zero) via a 50-tree RF, then trains a flow on the non-zero subset. If the classifier over-predicts zero on rare non-zero cells (see stage-1's `disabled_ssdi` ratio = 0, `elderly_self_employed` ratio = 100+), the flow is trained on a biased subset and produces samples that don't cover the missing support. - -3. **Log-transform + standardization on heavy-tailed targets.** The flow log-transforms positive values (`np.log1p(y[y>0])`) and standardizes. For variables with extreme tails (top-1% employment income, net-worth-level wealth), this compresses the tail and the flow produces samples concentrated around the mode; the sparse tail coverage is exactly what PRDC measures. - -4. **No conditional target structure.** MAF learns `P(y | x)` where `x` is the shared demographics. 14 conditioning dims predicting 36 target dims (each modeled as 1-dim marginal flow conditional on the 14) may be under-identified at 40k × 36 samples per column. - -## What would change my mind - -A single condition that would lift ZI-MAF into competitive range: - -- **Joint-target flow**: one flow over all 36 target columns simultaneously, not 36 independent flows. Direction matches the SS-model methodology doc's "pathwise / trajectory" framing for longitudinal work. -- **Better zero-inflation handling**: a joint zero-mask model (which 36-dim binary vector does this person have?) instead of 36 independent RF classifiers. Training signal correlates zero patterns across targets. -- **Embedding-based PRDC**: the validation run flagged in `stage-1-pilot-results.md` could show ZI-MAF produces structurally-right samples that raw-feature PRDC misses. Separate investigation. 
- -None of these are in the current `ZIMAFMethod` class. Rewriting them is a materially different project. - -## Implication for the SS-model methodology doc - -The doc names ZI-QDNN as the production direction with ZI-MAF as a reasonable alternative. Neither survives stage-1 tuning at scale. The near-term cross-section synthesizer default on the rewire is **ZI-QRF**; any future trajectory-based modeling for the longitudinal extension will need a materially different architecture than per-column independent flows. - -## Where this leaves us - -- **G1 cross-section default**: ZI-QRF. Locked in. -- **ZI-MAF / ZI-QDNN**: not dead as research directions, but are dead as production defaults in their current `microplex.eval.benchmark` implementations. -- **Followup worth trying before fully ruling out neural**: joint-target flow + joint zero-mask model. Needs ~a week of implementation and may still not close the gap. - -## Reproducibility - -```bash -uv run python -c " -import json, time, numpy as np, pandas as pd -from microplex_us.bakeoff import ScaleUpRunner, ScaleUpStageConfig, DEFAULT_CONDITION_COLS, DEFAULT_TARGET_COLS, stage1_config -from microplex.eval.benchmark import ZIMAFMethod -from prdc import compute_prdc -from sklearn.preprocessing import StandardScaler - -base = stage1_config() -cfg = ScaleUpStageConfig( - stage='zi_maf_tuning', n_rows=40000, methods=('ZI-QRF',), - condition_cols=DEFAULT_CONDITION_COLS, target_cols=DEFAULT_TARGET_COLS, - holdout_frac=0.2, seed=42, k=5, n_generate=32000, - data_path=base.data_path, year=base.year, rare_cell_checks=(), - prdc_max_samples=15000, -) -runner = ScaleUpRunner(cfg) -df = runner.load_frame() -train, holdout = runner.split(df) -# ... fit and evaluate each config ... -" -``` - -Full results in `artifacts/zi_maf_tuning.json`. Wall time for all four configs: ~43 min. 
diff --git a/envs/README.md b/envs/README.md deleted file mode 100644 index d5ed10ce..00000000 --- a/envs/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Developer Testing Environments - -Use the normal `uv` install path unless PyPI cannot provide the needed binary -packages for your platform. This folder holds development-only environments for -those cases. - -## Intel macOS - -Production macOS installs require Apple Silicon (`arm64`). Intel macOS -(`x86_64`) is supported only for development and testing through conda-forge, -because modern PyPI `torch` wheels are not available for that platform. - -Create or update the Intel Mac development environment with: - -```bash -./scripts/install.sh --dev-intel-mac -``` - -That command uses `envs/macos-intel-conda-forge.yml` to install Python 3.13 and -PyTorch 2.11 from conda-forge, then installs this repository with the `dev` and -`policyengine` extras using pip inside the conda environment. - -Use the normal install script on Apple Silicon macOS and Linux: - -```bash -./scripts/install.sh --prod -./scripts/install.sh --dev -``` diff --git a/envs/macos-intel-conda-forge.yml b/envs/macos-intel-conda-forge.yml deleted file mode 100644 index 53700a27..00000000 --- a/envs/macos-intel-conda-forge.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: microplex-us-intel -channels: - - conda-forge - - nodefaults -dependencies: - - python=3.13 - - pytorch=2.11.* - - pip diff --git a/examples/cps_real_data_test.py b/examples/cps_real_data_test.py deleted file mode 100644 index b76467c7..00000000 --- a/examples/cps_real_data_test.py +++ /dev/null @@ -1,662 +0,0 @@ -""" -CPS Real Data Test for HierarchicalSynthesizer - -Complete workflow testing microplex on real CPS ASEC data: -1. Load CPS data using load_cps_for_synthesis() -2. Fit HierarchicalSynthesizer on real data -3. Generate synthetic households and persons -4. 
Evaluate quality using multivariate metrics: - - Distance correlation (dCor) preservation - - Energy distance - - Nearest neighbor authenticity -5. Compare derived HH income distribution to real -6. Test reweighting to state-level targets - -Run with: python examples/cps_real_data_test.py -""" - -import sys -import time -from pathlib import Path - -import numpy as np -import pandas as pd -from scipy import stats -from scipy.spatial.distance import cdist -from sklearn.preprocessing import StandardScaler - -# Add paths for local development -PACKAGE_ROOT = Path(__file__).resolve().parents[1] -sys.path.insert(0, str(PACKAGE_ROOT / "src")) - -from microplex import HierarchicalSynthesizer, HouseholdSchema, Reweighter - -from microplex_us.data import create_sample_data, load_cps_for_synthesis - - -def setup_cps_schema(): - """Create a schema tailored for CPS ASEC data.""" - return HouseholdSchema( - hh_vars=["n_persons", "n_adults", "n_children", "state_fips", "tenure"], - person_vars=["age", "sex", "income", "employment_status", "education"], - person_condition_vars=[ - "n_persons", - "n_adults", - "n_children", - "state_fips", - "tenure", - "person_number", - "is_first_adult", - "is_child_slot", - ], - derived_vars={ - "hh_income": "sum:income", - "n_workers": "count:employment_status==1", - }, - hh_id_col="household_id", - person_id_col="person_id", - ) - - -# ============================================================================ -# MULTIVARIATE METRICS -# ============================================================================ - -def distance_correlation(X: np.ndarray, Y: np.ndarray) -> float: - """ - Compute Distance Correlation between two variables. - - dCor = 0 iff X and Y are independent. Unlike Pearson correlation, - distance correlation captures ALL types of dependence (linear, - nonlinear, non-monotonic). 
- """ - n = len(X) - if n < 2: - return 0.0 - - X = np.asarray(X).reshape(-1) - Y = np.asarray(Y).reshape(-1) - - a = np.abs(X[:, None] - X[None, :]) - b = np.abs(Y[:, None] - Y[None, :]) - - A = a - a.mean(axis=0, keepdims=True) - a.mean(axis=1, keepdims=True) + a.mean() - B = b - b.mean(axis=0, keepdims=True) - b.mean(axis=1, keepdims=True) + b.mean() - - dcov2 = (A * B).sum() / (n * n) - dvar_x = (A * A).sum() / (n * n) - dvar_y = (B * B).sum() / (n * n) - - if dvar_x <= 0 or dvar_y <= 0: - return 0.0 - - dcor = np.sqrt(dcov2 / np.sqrt(dvar_x * dvar_y)) - return float(np.clip(dcor, 0, 1)) - - -def compute_dcor_preservation( - original: pd.DataFrame, - synthetic: pd.DataFrame, - variables: list, - sample_size: int = 500, -) -> dict: - """ - Compare distance correlation structure between original and synthetic. - - Returns dict with: - - original_dcors: dCor matrix for original data - - synthetic_dcors: dCor matrix for synthetic data - - dcor_errors: absolute error per pair - - mean_dcor_error: average error across all pairs - """ - # Sample for efficiency - n_sample = min(sample_size, len(original), len(synthetic)) - np.random.seed(42) - orig_sample = original.sample(n=n_sample, random_state=42) - synth_sample = synthetic.sample(n=n_sample, random_state=42) - - available_vars = [ - v for v in variables - if v in orig_sample.columns and v in synth_sample.columns - ] - - dcor_errors = {} - orig_dcors = {} - synth_dcors = {} - - for i, var1 in enumerate(available_vars): - for j, var2 in enumerate(available_vars): - if i < j: - pair = f"{var1}_vs_{var2}" - orig_dcor = distance_correlation( - orig_sample[var1].values, orig_sample[var2].values - ) - synth_dcor = distance_correlation( - synth_sample[var1].values, synth_sample[var2].values - ) - orig_dcors[pair] = orig_dcor - synth_dcors[pair] = synth_dcor - dcor_errors[pair] = abs(orig_dcor - synth_dcor) - - return { - "original_dcors": orig_dcors, - "synthetic_dcors": synth_dcors, - "dcor_errors": dcor_errors, - 
"mean_dcor_error": np.mean(list(dcor_errors.values())) if dcor_errors else 0, - } - - -def compute_energy_distance(X: np.ndarray, Y: np.ndarray) -> float: - """ - Compute energy distance between two distributions. - - Energy distance is the MULTIVARIATE GENERALIZATION OF CRPS. - D = 0 iff distributions are identical. - """ - n_x = len(X) - n_y = len(Y) - - dist_XX = cdist(X, X, metric="euclidean") - dist_YY = cdist(Y, Y, metric="euclidean") - dist_XY = cdist(X, Y, metric="euclidean") - - term1 = 2 * np.mean(dist_XY) - term2 = (np.sum(dist_XX) - np.trace(dist_XX)) / (n_x * (n_x - 1)) - term3 = (np.sum(dist_YY) - np.trace(dist_YY)) / (n_y * (n_y - 1)) - - return float(term1 - term2 - term3) - - -def compute_nearest_neighbor_authenticity( - synthetic: np.ndarray, - holdout: np.ndarray, - train: np.ndarray = None, -) -> dict: - """ - Compute nearest neighbor authenticity metrics. - - For each synthetic record, find distance to nearest real record. - Also checks for overfitting by comparing to training data. 
- - Returns: - - mean_distance: average nearest neighbor distance - - min_distance: minimum distance (privacy check) - - max_distance: maximum distance (outliers) - - privacy_ratio: ratio of holdout distance to train distance (if train provided) - """ - # Synthetic -> Holdout distances - dist_to_holdout = cdist(synthetic, holdout, metric="euclidean") - nn_distances = np.min(dist_to_holdout, axis=1) - - result = { - "mean_distance": float(np.mean(nn_distances)), - "median_distance": float(np.median(nn_distances)), - "min_distance": float(np.min(nn_distances)), - "max_distance": float(np.max(nn_distances)), - "q25_distance": float(np.percentile(nn_distances, 25)), - "q75_distance": float(np.percentile(nn_distances, 75)), - } - - # Privacy/overfitting check if training data provided - if train is not None: - dist_to_train = cdist(synthetic, train, metric="euclidean") - nn_distances_train = np.min(dist_to_train, axis=1) - - # Ratio > 1 means synthetic is closer to train than to holdout (possible overfitting) - # Ratio < 1 means synthetic is closer to holdout than to train - ratios = (nn_distances + 1e-10) / (nn_distances_train + 1e-10) - closer_to_train = np.mean(nn_distances_train < nn_distances) - - result["privacy_ratio"] = float(np.mean(ratios)) - result["fraction_closer_to_train"] = float(closer_to_train) - - return result - - -def evaluate_multivariate_quality( - original_persons: pd.DataFrame, - synthetic_persons: pd.DataFrame, - person_vars: list, - sample_size: int = 1000, -) -> dict: - """ - Comprehensive multivariate quality evaluation. - """ - print("\n" + "=" * 60) - print("MULTIVARIATE QUALITY EVALUATION") - print("=" * 60) - - results = {} - - # 1. Distance Correlation Preservation - print("\n1. Computing distance correlation (dCor) preservation...") - dcor_results = compute_dcor_preservation( - original_persons, synthetic_persons, person_vars, sample_size - ) - results["dcor"] = dcor_results - print(f" Mean dCor error: {dcor_results['mean_dcor_error']:.4f}") - - # 2.
Energy Distance - print("\n2. Computing energy distance...") - numeric_vars = [v for v in person_vars if v in original_persons.columns] - - orig_subset = original_persons[numeric_vars].dropna() - synth_subset = synthetic_persons[numeric_vars].dropna() - - n_sample = min(sample_size, len(orig_subset), len(synth_subset)) - orig_sample = orig_subset.sample(n=n_sample, random_state=42) - synth_sample = synth_subset.sample(n=n_sample, random_state=42) - - scaler = StandardScaler() - orig_norm = scaler.fit_transform(orig_sample) - synth_norm = scaler.transform(synth_sample) - - energy_dist = compute_energy_distance(orig_norm, synth_norm) - results["energy_distance"] = energy_dist - print(f" Energy distance: {energy_dist:.4f}") - - # 3. Nearest Neighbor Authenticity - print("\n3. Computing nearest neighbor authenticity...") - # Split original into train/holdout for privacy check - n_holdout = len(orig_norm) // 2 - holdout_norm = orig_norm[:n_holdout] - train_norm = orig_norm[n_holdout:] - - nn_results = compute_nearest_neighbor_authenticity( - synth_norm, holdout_norm, train_norm - ) - results["nearest_neighbor"] = nn_results - print(f" Mean NN distance: {nn_results['mean_distance']:.4f}") - print(f" Min NN distance: {nn_results['min_distance']:.4f} (privacy check)") - if "privacy_ratio" in nn_results: - print(f" Privacy ratio: {nn_results['privacy_ratio']:.4f} (>1 = good)") - print(f" Closer to train: {nn_results['fraction_closer_to_train']:.1%}") - - return results - - -# ============================================================================ -# HOUSEHOLD INCOME COMPARISON -# ============================================================================ - -def compare_hh_income_distributions( - original_persons: pd.DataFrame, - synthetic_persons: pd.DataFrame, - synthetic_hh: pd.DataFrame, -) -> dict: - """ - Compare derived household income distribution between original and synthetic. 
- """ - print("\n" + "=" * 60) - print("HOUSEHOLD INCOME DISTRIBUTION COMPARISON") - print("=" * 60) - - # Compute original HH income - orig_hh_income = original_persons.groupby("household_id")["income"].sum() - - # Synthetic HH income is already computed as hh_income - synth_hh_income = synthetic_hh["hh_income"] if "hh_income" in synthetic_hh.columns else ( - synthetic_persons.groupby("household_id")["income"].sum() - ) - - # Summary statistics - results = { - "original": { - "mean": float(orig_hh_income.mean()), - "median": float(orig_hh_income.median()), - "std": float(orig_hh_income.std()), - "q25": float(orig_hh_income.quantile(0.25)), - "q75": float(orig_hh_income.quantile(0.75)), - "zero_fraction": float((orig_hh_income == 0).mean()), - }, - "synthetic": { - "mean": float(synth_hh_income.mean()), - "median": float(synth_hh_income.median()), - "std": float(synth_hh_income.std()), - "q25": float(synth_hh_income.quantile(0.25)), - "q75": float(synth_hh_income.quantile(0.75)), - "zero_fraction": float((synth_hh_income == 0).mean()), - }, - } - - # KS test - ks_stat, ks_pval = stats.ks_2samp( - orig_hh_income.dropna().values, - synth_hh_income.dropna().values, - ) - results["ks_statistic"] = float(ks_stat) - results["ks_pvalue"] = float(ks_pval) - - # Variance ratio - orig_var = orig_hh_income.var() - synth_var = synth_hh_income.var() - results["variance_ratio"] = float(synth_var / orig_var) if orig_var > 0 else 0 - - # Print results - print("\nOriginal HH Income:") - print(f" Mean: ${results['original']['mean']:,.0f}") - print(f" Median: ${results['original']['median']:,.0f}") - print(f" Std: ${results['original']['std']:,.0f}") - print(f" Zero fraction: {results['original']['zero_fraction']:.2%}") - - print("\nSynthetic HH Income:") - print(f" Mean: ${results['synthetic']['mean']:,.0f}") - print(f" Median: ${results['synthetic']['median']:,.0f}") - print(f" Std: ${results['synthetic']['std']:,.0f}") - print(f" Zero fraction: 
{results['synthetic']['zero_fraction']:.2%}") - - print("\nDistribution Comparison:") - print(f" KS statistic: {ks_stat:.4f}") - print(f" Variance ratio: {results['variance_ratio']:.4f}") - - status = "Good" if ks_stat < 0.1 else "Fair" if ks_stat < 0.2 else "Poor" - print(f" Status: {status}") - - return results - - -# ============================================================================ -# STATE-LEVEL REWEIGHTING -# ============================================================================ - -def test_state_level_reweighting( - synthetic_hh: pd.DataFrame, - original_hh: pd.DataFrame, -) -> dict: - """ - Test reweighting synthetic data to match state-level population targets. - """ - print("\n" + "=" * 60) - print("STATE-LEVEL REWEIGHTING TEST") - print("=" * 60) - - # Compute original state distribution as targets - # Weight by person count per household to approximate population - if "state_fips" not in original_hh.columns: - print("WARNING: state_fips not in original household data, skipping reweighting test") - return {"error": "state_fips not available"} - - state_populations = original_hh.groupby("state_fips")["n_persons"].sum() - - # Scale to match synthetic data size - scale_factor = len(synthetic_hh) / len(original_hh) - state_targets = (state_populations * scale_factor).round().astype(int).to_dict() - - print("\nTarget state distribution (scaled):") - top_states = dict(sorted(state_targets.items(), key=lambda x: -x[1])[:5]) - for state, count in top_states.items(): - print(f" State {int(state)}: {count:,}") - - # Add state column to synthetic households if needed - if "state_fips" not in synthetic_hh.columns: - print("\nWARNING: state_fips not in synthetic_hh, adding from n_persons-based imputation") - # Impute state - for demo purposes use uniform distribution - np.random.seed(42) - synthetic_hh = synthetic_hh.copy() - synthetic_hh["state_fips"] = np.random.choice( - list(state_targets.keys()), - size=len(synthetic_hh), - 
p=np.array(list(state_targets.values())) / sum(state_targets.values()), - ) - - # Ensure state_fips types match - synthetic_hh["state_fips"] = synthetic_hh["state_fips"].astype(int) - - # Filter to states that exist in both datasets - synth_states = set(synthetic_hh["state_fips"].unique()) - target_states = set(state_targets.keys()) - common_states = synth_states & target_states - - if len(common_states) < len(target_states): - print(f"\nFiltering to {len(common_states)} common states") - state_targets = {k: v for k, v in state_targets.items() if k in common_states} - synthetic_hh = synthetic_hh[synthetic_hh["state_fips"].isin(common_states)].copy() - - # Before reweighting distribution - before_counts = synthetic_hh["state_fips"].value_counts().sort_index() - - results = {"targets": state_targets, "before": before_counts.to_dict()} - - # NOTE: The synthesizer treats state_fips as continuous, which produces - # out-of-range values. We filter to valid states for the reweighting test. - valid_states = set(state_targets.keys()) - synth_hh_filtered = synthetic_hh[synthetic_hh["state_fips"].isin(valid_states)].copy() - - print(f"\nFiltered to {len(synth_hh_filtered)} households with valid state FIPS") - if len(synth_hh_filtered) < 100: - print(" WARNING: Too few valid households for meaningful reweighting") - print(" NOTE: state_fips is treated as continuous by the synthesizer") - print(" Consider using discrete_vars for categorical variables") - return { - "error": "insufficient_valid_states", - "n_valid": len(synth_hh_filtered), - "targets": state_targets, - } - - # Test different sparsity settings - for sparsity in ["l1", "l2", "l0"]: - print(f"\n--- {sparsity.upper()} Reweighting ---") - - try: - reweighter = Reweighter(sparsity=sparsity) - weighted_hh = reweighter.fit_transform( - synth_hh_filtered, - {"state_fips": state_targets}, - drop_zeros=False, - ) - - # Check results - stats_dict = reweighter.get_sparsity_stats() - print(f" Records used: 
{stats_dict['n_nonzero']:,} / {stats_dict['n_records']:,}") - print(f" Sparsity: {stats_dict['sparsity']:.1%}") - print(f" Max weight: {stats_dict['max_weight']:.2f}") - - # Check target matching - weighted_counts = weighted_hh.groupby("state_fips")["weight"].sum() - - max_error = 0 - for state in list(state_targets.keys())[:5]: - actual = weighted_counts.get(state, 0) - target = state_targets[state] - error = abs(actual - target) / target if target > 0 else 0 - max_error = max(max_error, error) - - print(f" Max target error: {max_error:.2%}") - - results[f"{sparsity}_stats"] = stats_dict - results[f"{sparsity}_max_error"] = max_error - - except Exception as e: - print(f" ERROR: {e}") - results[f"{sparsity}_error"] = str(e) - - return results - - -# ============================================================================ -# MAIN -# ============================================================================ - -def main(): - """Run complete CPS real data test.""" - - print("=" * 80) - print("MICROPLEX CPS REAL DATA TEST") - print("HierarchicalSynthesizer on CPS ASEC Data") - print("=" * 80) - - # Configuration - epochs = 50 - sample_fraction = 0.3 # Use 30% of CPS data - n_synthetic_households = 5000 - - results = {} - - # ======================================================================== - # STEP 1: Load CPS Data - # ======================================================================== - print("\n" + "=" * 80) - print("STEP 1: Loading CPS ASEC Data") - print("=" * 80) - - try: - households, persons = load_cps_for_synthesis( - sample_fraction=sample_fraction, - random_state=42, - ) - print("\nLoaded CPS ASEC data:") - print(f" Households: {len(households):,}") - print(f" Persons: {len(persons):,}") - print(f" Avg HH size: {len(persons) / len(households):.2f}") - - results["data"] = { - "n_households": len(households), - "n_persons": len(persons), - "avg_hh_size": len(persons) / len(households), - } - - except FileNotFoundError: - print("\nWARNING: CPS 
data not found. Using synthetic sample data.") - households, persons = create_sample_data(n_households=3000, seed=42) - print(f"Generated sample data: {len(households)} HH, {len(persons)} persons") - results["data"] = {"source": "synthetic_sample"} - - # ======================================================================== - # STEP 2: Fit HierarchicalSynthesizer - # ======================================================================== - print("\n" + "=" * 80) - print("STEP 2: Training HierarchicalSynthesizer") - print("=" * 80) - - schema = setup_cps_schema() - print("\nSchema:") - print(f" HH vars: {schema.hh_vars}") - print(f" Person vars: {schema.person_vars}") - - synth = HierarchicalSynthesizer( - schema=schema, - hh_flow_kwargs={"n_layers": 4, "hidden_dim": 64}, - person_flow_kwargs={"n_layers": 6, "hidden_dim": 128}, - random_state=42, - ) - - print(f"\nTraining with {epochs} epochs...") - start_time = time.time() - - synth.fit( - households, - persons, - hh_weight_col="hh_weight" if "hh_weight" in households.columns else None, - epochs=epochs, - verbose=True, - ) - - train_time = time.time() - start_time - print(f"\nTraining completed in {train_time:.1f} seconds") - results["training_time"] = train_time - - # ======================================================================== - # STEP 3: Generate Synthetic Households - # ======================================================================== - print("\n" + "=" * 80) - print(f"STEP 3: Generating {n_synthetic_households:,} Synthetic Households") - print("=" * 80) - - start_time = time.time() - synthetic_hh, synthetic_persons = synth.generate( - n_households=n_synthetic_households, - verbose=True, - ) - generate_time = time.time() - start_time - - print(f"\nGeneration completed in {generate_time:.1f} seconds") - print(f" Synthetic households: {len(synthetic_hh):,}") - print(f" Synthetic persons: {len(synthetic_persons):,}") - print(f" Avg HH size: {len(synthetic_persons) / len(synthetic_hh):.2f}") 
- - results["generation"] = { - "n_synthetic_hh": len(synthetic_hh), - "n_synthetic_persons": len(synthetic_persons), - "generation_time": generate_time, - } - - # ======================================================================== - # STEP 4: Multivariate Quality Evaluation - # ======================================================================== - person_vars = ["age", "sex", "income", "employment_status", "education"] - - multivariate_results = evaluate_multivariate_quality( - persons, - synthetic_persons, - person_vars, - sample_size=1000, - ) - results["multivariate"] = multivariate_results - - # ======================================================================== - # STEP 5: Household Income Comparison - # ======================================================================== - hh_income_results = compare_hh_income_distributions( - persons, - synthetic_persons, - synthetic_hh, - ) - results["hh_income"] = hh_income_results - - # ======================================================================== - # STEP 6: State-Level Reweighting - # ======================================================================== - reweighting_results = test_state_level_reweighting( - synthetic_hh, - households, - ) - results["reweighting"] = reweighting_results - - # ======================================================================== - # SUMMARY - # ======================================================================== - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - print("\nKey Quality Metrics:") - print(f" dCor preservation error: {multivariate_results['dcor']['mean_dcor_error']:.4f}") - print(f" Energy distance: {multivariate_results['energy_distance']:.4f}") - print(f" NN mean distance: {multivariate_results['nearest_neighbor']['mean_distance']:.4f}") - print(f" HH income KS stat: {hh_income_results['ks_statistic']:.4f}") - print(f" HH income variance ratio: {hh_income_results['variance_ratio']:.4f}") - - # Interpretation - 
print("\nInterpretation Guide:") - dcor_err = multivariate_results['dcor']['mean_dcor_error'] - dcor_status = "Excellent" if dcor_err < 0.05 else "Good" if dcor_err < 0.1 else "Fair" - print(f" dCor: {dcor_status} (< 0.05 excellent, < 0.1 good)") - - ks = hh_income_results['ks_statistic'] - ks_status = "Excellent" if ks < 0.1 else "Good" if ks < 0.2 else "Fair" - print(f" HH Income KS: {ks_status} (< 0.1 excellent, < 0.2 good)") - - var_ratio = hh_income_results['variance_ratio'] - var_status = "Good" if 0.8 <= var_ratio <= 1.2 else "Fair" - print(f" Variance Ratio: {var_status} (0.8-1.2 is good)") - - print("\nReweighting Results:") - for sparsity in ["l1", "l2", "l0"]: - if f"{sparsity}_stats" in reweighting_results: - stats_dict = reweighting_results[f"{sparsity}_stats"] - max_err = reweighting_results.get(f"{sparsity}_max_error", 0) - print(f" {sparsity.upper()}: {stats_dict['n_nonzero']} records, max error {max_err:.2%}") - - print("\nTimings:") - print(f" Training: {results['training_time']:.1f}s") - print(f" Generation: {results['generation']['generation_time']:.1f}s") - - print("\n" + "=" * 80) - print("TEST COMPLETE") - print("=" * 80) - - return results - - -if __name__ == "__main__": - results = main() diff --git a/examples/cps_synthesis_demo.py b/examples/cps_synthesis_demo.py deleted file mode 100644 index e6d6cdfa..00000000 --- a/examples/cps_synthesis_demo.py +++ /dev/null @@ -1,776 +0,0 @@ -""" -CPS ASEC Hierarchical Synthesis Demo - -Demonstrates using microplex's HierarchicalSynthesizer on real CPS ASEC data: -1. Load CPS data using load_cps_for_synthesis() -2. Fit HierarchicalSynthesizer on the data -3. Generate synthetic households -4. Evaluate quality using benchmark metrics (dCor, energy distance, etc.) -5. 
Compare synthetic distributions to original - -Outputs saved to examples/results/ -""" - -import sys -import time -import warnings -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns - -# Add paths for local development -PACKAGE_ROOT = Path(__file__).resolve().parents[1] -sys.path.insert(0, str(PACKAGE_ROOT / "src")) - -from microplex import HierarchicalSynthesizer, HouseholdSchema - -from microplex_us.data import create_sample_data, load_cps_for_synthesis - -warnings.filterwarnings("ignore") - -# Set style -sns.set_style("whitegrid") -plt.rcParams["figure.figsize"] = (14, 10) - - -def setup_schema_for_cps(): - """Create a schema tailored for CPS ASEC data.""" - return HouseholdSchema( - hh_vars=["n_persons", "n_adults", "n_children", "state_fips", "tenure"], - person_vars=["age", "sex", "income", "employment_status", "education"], - person_condition_vars=[ - "n_persons", - "n_adults", - "n_children", - "state_fips", - "tenure", - "person_number", - "is_first_adult", - "is_child_slot", - ], - derived_vars={ - "hh_income": "sum:income", - "n_workers": "count:employment_status==1", - }, - hh_id_col="household_id", - person_id_col="person_id", - ) - - -def compute_benchmark_metrics( - original_persons: pd.DataFrame, - synthetic_persons: pd.DataFrame, - original_hh: pd.DataFrame, - synthetic_hh: pd.DataFrame, - person_vars: list, - hh_vars: list, -) -> dict: - """Compute comprehensive benchmark metrics.""" - from scipy import stats - from sklearn.preprocessing import StandardScaler - - metrics = {} - - # --- Person-level metrics --- - print("\nComputing person-level metrics...") - - # 1. 
Marginal fidelity (KS statistics) - ks_stats = {} - for var in person_vars: - if var in original_persons.columns and var in synthetic_persons.columns: - orig_vals = original_persons[var].dropna().values - synth_vals = synthetic_persons[var].dropna().values - if len(orig_vals) > 0 and len(synth_vals) > 0: - ks_stat, _ = stats.ks_2samp(orig_vals, synth_vals) - ks_stats[var] = ks_stat - - metrics["person_ks_stats"] = ks_stats - metrics["person_mean_ks"] = np.mean(list(ks_stats.values())) if ks_stats else 0 - - # 2. Variance ratios (dispersion check) - var_ratios = {} - for var in person_vars: - if var in original_persons.columns and var in synthetic_persons.columns: - orig_var = np.var(original_persons[var].dropna()) - synth_var = np.var(synthetic_persons[var].dropna()) - if orig_var > 0: - var_ratios[var] = synth_var / orig_var - - metrics["person_variance_ratios"] = var_ratios - - # 3. Zero-inflation accuracy (for income) - if "income" in original_persons.columns: - orig_zero_frac = (original_persons["income"] == 0).mean() - synth_zero_frac = (synthetic_persons["income"] == 0).mean() - metrics["income_zero_fraction_original"] = orig_zero_frac - metrics["income_zero_fraction_synthetic"] = synth_zero_frac - metrics["income_zero_fraction_error"] = abs(synth_zero_frac - orig_zero_frac) - - # 4. Distance correlation (dCor) - captures nonlinear relationships - print("Computing distance correlation (dCor)...") - dcor_results = compute_dcor_comparison( - original_persons, synthetic_persons, person_vars - ) - metrics["dcor"] = dcor_results - - # 5. 
Energy distance (multivariate) - print("Computing energy distance...") - numeric_vars = [v for v in person_vars if v in original_persons.columns] - if len(numeric_vars) >= 2: - orig_subset = original_persons[numeric_vars].dropna() - synth_subset = synthetic_persons[numeric_vars].dropna() - - # Sample for computational efficiency - n_sample = min(1000, len(orig_subset), len(synth_subset)) - if n_sample > 100: - orig_sample = orig_subset.sample(n=n_sample, random_state=42) - synth_sample = synth_subset.sample(n=n_sample, random_state=42) - - # Normalize - scaler = StandardScaler() - orig_norm = scaler.fit_transform(orig_sample) - synth_norm = scaler.transform(synth_sample) - - energy_dist = compute_energy_distance(orig_norm, synth_norm) - metrics["energy_distance"] = energy_dist - - # --- Household-level metrics --- - print("\nComputing household-level metrics...") - - hh_ks_stats = {} - for var in hh_vars: - if var in original_hh.columns and var in synthetic_hh.columns: - orig_vals = original_hh[var].dropna().values - synth_vals = synthetic_hh[var].dropna().values - if len(orig_vals) > 0 and len(synth_vals) > 0: - ks_stat, _ = stats.ks_2samp(orig_vals, synth_vals) - hh_ks_stats[var] = ks_stat - - metrics["hh_ks_stats"] = hh_ks_stats - metrics["hh_mean_ks"] = np.mean(list(hh_ks_stats.values())) if hh_ks_stats else 0 - - # Household size distribution - if "n_persons" in original_hh.columns: - orig_size_dist = original_hh["n_persons"].value_counts(normalize=True).sort_index() - synth_size_dist = synthetic_hh["n_persons"].value_counts(normalize=True).sort_index() - metrics["hh_size_dist_original"] = orig_size_dist.to_dict() - metrics["hh_size_dist_synthetic"] = synth_size_dist.to_dict() - - return metrics - - -def distance_correlation(X: np.ndarray, Y: np.ndarray) -> float: - """Compute Distance Correlation between two variables.""" - n = len(X) - if n < 2: - return 0.0 - - X = np.asarray(X).reshape(-1) - Y = np.asarray(Y).reshape(-1) - - a = np.abs(X[:, None] - X[None, 
:]) - b = np.abs(Y[:, None] - Y[None, :]) - - A = a - a.mean(axis=0, keepdims=True) - a.mean(axis=1, keepdims=True) + a.mean() - B = b - b.mean(axis=0, keepdims=True) - b.mean(axis=1, keepdims=True) + b.mean() - - dcov2 = (A * B).sum() / (n * n) - dvar_x = (A * A).sum() / (n * n) - dvar_y = (B * B).sum() / (n * n) - - if dvar_x <= 0 or dvar_y <= 0: - return 0.0 - - dcor = np.sqrt(dcov2 / np.sqrt(dvar_x * dvar_y)) - return float(np.clip(dcor, 0, 1)) - - -def compute_dcor_comparison( - original: pd.DataFrame, synthetic: pd.DataFrame, variables: list -) -> dict: - """Compare distance correlation structure between original and synthetic.""" - # Sample for efficiency - n_sample = min(500, len(original), len(synthetic)) - orig_sample = original.sample(n=n_sample, random_state=42) - synth_sample = synthetic.sample(n=n_sample, random_state=42) - - available_vars = [v for v in variables if v in orig_sample.columns and v in synth_sample.columns] - - dcor_errors = {} - orig_dcors = {} - synth_dcors = {} - - for i, var1 in enumerate(available_vars): - for j, var2 in enumerate(available_vars): - if i < j: - pair = f"{var1}_vs_{var2}" - orig_dcor = distance_correlation( - orig_sample[var1].values, orig_sample[var2].values - ) - synth_dcor = distance_correlation( - synth_sample[var1].values, synth_sample[var2].values - ) - orig_dcors[pair] = orig_dcor - synth_dcors[pair] = synth_dcor - dcor_errors[pair] = abs(orig_dcor - synth_dcor) - - return { - "original_dcors": orig_dcors, - "synthetic_dcors": synth_dcors, - "dcor_errors": dcor_errors, - "mean_dcor_error": np.mean(list(dcor_errors.values())) if dcor_errors else 0, - } - - -def compute_energy_distance(X: np.ndarray, Y: np.ndarray) -> float: - """Compute energy distance between two distributions.""" - from scipy.spatial.distance import cdist - - n_x = len(X) - n_y = len(Y) - - dist_XX = cdist(X, X, metric="euclidean") - dist_YY = cdist(Y, Y, metric="euclidean") - dist_XY = cdist(X, Y, metric="euclidean") - - term1 = 2 * 
np.mean(dist_XY) - term2 = (np.sum(dist_XX) - np.trace(dist_XX)) / (n_x * (n_x - 1)) - term3 = (np.sum(dist_YY) - np.trace(dist_YY)) / (n_y * (n_y - 1)) - - return float(term1 - term2 - term3) - - -def create_comparison_visualizations( - original_persons: pd.DataFrame, - synthetic_persons: pd.DataFrame, - original_hh: pd.DataFrame, - synthetic_hh: pd.DataFrame, - metrics: dict, - output_dir: Path, -): - """Create comprehensive comparison visualizations.""" - print("\nGenerating visualizations...") - - # 1. Person-level variable distributions - person_vars = ["age", "income", "education", "employment_status"] - fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle("Person-Level Variable Distributions: Original vs Synthetic", fontsize=14) - - for idx, var in enumerate(person_vars): - ax = axes[idx // 2, idx % 2] - - if var in original_persons.columns and var in synthetic_persons.columns: - orig_vals = original_persons[var].dropna() - synth_vals = synthetic_persons[var].dropna() - - if var == "income": - # Log scale for income - orig_positive = orig_vals[orig_vals > 0] - synth_positive = synth_vals[synth_vals > 0] - bins = np.logspace( - np.log10(max(1, orig_positive.min())), - np.log10(orig_positive.max()), - 50, - ) - ax.hist(orig_positive, bins=bins, alpha=0.5, label="Original", density=True) - ax.hist(synth_positive, bins=bins, alpha=0.5, label="Synthetic", density=True) - ax.set_xscale("log") - else: - ax.hist(orig_vals, bins=30, alpha=0.5, label="Original", density=True) - ax.hist(synth_vals, bins=30, alpha=0.5, label="Synthetic", density=True) - - ks_stat = metrics["person_ks_stats"].get(var, 0) - ax.text( - 0.95, - 0.95, - f"KS: {ks_stat:.4f}", - transform=ax.transAxes, - ha="right", - va="top", - bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5), - ) - - ax.set_xlabel(var.replace("_", " ").title()) - ax.set_ylabel("Density") - ax.legend() - - plt.tight_layout() - plt.savefig(output_dir / "person_distributions.png", dpi=150, 
bbox_inches="tight") - print(" Saved: person_distributions.png") - plt.close() - - # 2. Household size distribution - fig, axes = plt.subplots(1, 2, figsize=(14, 5)) - fig.suptitle("Household Composition Comparison", fontsize=14) - - # Size distribution - ax = axes[0] - if "n_persons" in original_hh.columns: - orig_sizes = original_hh["n_persons"].value_counts(normalize=True).sort_index() - synth_sizes = synthetic_hh["n_persons"].value_counts(normalize=True).sort_index() - - all_sizes = sorted(set(orig_sizes.index) | set(synth_sizes.index)) - x = np.arange(len(all_sizes)) - width = 0.35 - - orig_vals = [orig_sizes.get(s, 0) for s in all_sizes] - synth_vals = [synth_sizes.get(s, 0) for s in all_sizes] - - ax.bar(x - width / 2, orig_vals, width, label="Original", alpha=0.8) - ax.bar(x + width / 2, synth_vals, width, label="Synthetic", alpha=0.8) - ax.set_xticks(x) - ax.set_xticklabels(all_sizes) - ax.set_xlabel("Household Size") - ax.set_ylabel("Proportion") - ax.set_title("Household Size Distribution") - ax.legend() - - # Household income distribution - ax = axes[1] - if "hh_income" in synthetic_hh.columns: - # Compute original HH income - orig_hh_income = ( - original_persons.groupby("household_id")["income"].sum().reset_index() - ) - synth_hh_income = synthetic_hh["hh_income"].dropna() - - orig_positive = orig_hh_income["income"][orig_hh_income["income"] > 0] - synth_positive = synth_hh_income[synth_hh_income > 0] - - if len(orig_positive) > 0 and len(synth_positive) > 0: - bins = np.logspace( - np.log10(max(1, orig_positive.min())), - np.log10(orig_positive.max()), - 50, - ) - ax.hist(orig_positive, bins=bins, alpha=0.5, label="Original", density=True) - ax.hist(synth_positive, bins=bins, alpha=0.5, label="Synthetic", density=True) - ax.set_xscale("log") - ax.set_xlabel("Household Income ($)") - ax.set_ylabel("Density") - ax.set_title("Household Income Distribution") - ax.legend() - - plt.tight_layout() - plt.savefig(output_dir / "household_distributions.png", 
dpi=150, bbox_inches="tight") - print(" Saved: household_distributions.png") - plt.close() - - # 3. Distance correlation comparison - if "dcor" in metrics and metrics["dcor"]["dcor_errors"]: - fig, axes = plt.subplots(1, 2, figsize=(14, 5)) - fig.suptitle("Distance Correlation (dCor) Comparison", fontsize=14) - - dcor_data = metrics["dcor"] - pairs = list(dcor_data["dcor_errors"].keys())[:10] # Top 10 pairs - - # Original vs Synthetic dCor - ax = axes[0] - orig_vals = [dcor_data["original_dcors"][p] for p in pairs] - synth_vals = [dcor_data["synthetic_dcors"][p] for p in pairs] - x = np.arange(len(pairs)) - width = 0.35 - - ax.bar(x - width / 2, orig_vals, width, label="Original", alpha=0.8) - ax.bar(x + width / 2, synth_vals, width, label="Synthetic", alpha=0.8) - ax.set_xticks(x) - ax.set_xticklabels([p.replace("_vs_", "\nvs\n") for p in pairs], fontsize=8) - ax.set_ylabel("Distance Correlation") - ax.set_title("dCor by Variable Pair") - ax.legend() - - # dCor error - ax = axes[1] - errors = [dcor_data["dcor_errors"][p] for p in pairs] - colors = ["green" if e < 0.05 else "orange" if e < 0.1 else "red" for e in errors] - ax.bar(x, errors, color=colors, alpha=0.8) - ax.axhline(y=0.05, color="green", linestyle="--", alpha=0.5, label="Good (<0.05)") - ax.axhline(y=0.1, color="orange", linestyle="--", alpha=0.5, label="Acceptable (<0.1)") - ax.set_xticks(x) - ax.set_xticklabels([p.replace("_vs_", "\nvs\n") for p in pairs], fontsize=8) - ax.set_ylabel("Absolute Error") - ax.set_title("dCor Preservation Error") - ax.legend() - - plt.tight_layout() - plt.savefig(output_dir / "dcor_comparison.png", dpi=150, bbox_inches="tight") - print(" Saved: dcor_comparison.png") - plt.close() - - # 4. 
Age-Income relationship (key conditional relationship) - fig, axes = plt.subplots(1, 2, figsize=(14, 5)) - fig.suptitle("Age-Income Conditional Relationship", fontsize=14) - - for idx, (df, title) in enumerate( - [(original_persons, "Original"), (synthetic_persons, "Synthetic")] - ): - ax = axes[idx] - if "age" in df.columns and "income" in df.columns: - sample = df.sample(n=min(2000, len(df)), random_state=42) - positive = sample[sample["income"] > 0] - - ax.scatter( - positive["age"], - positive["income"], - alpha=0.3, - s=5, - ) - ax.set_xlabel("Age") - ax.set_ylabel("Income ($)") - ax.set_title(f"{title} Data") - ax.set_yscale("log") - - plt.tight_layout() - plt.savefig(output_dir / "age_income_relationship.png", dpi=150, bbox_inches="tight") - print(" Saved: age_income_relationship.png") - plt.close() - - # 5. Summary metrics chart - fig, axes = plt.subplots(2, 2, figsize=(14, 10)) - fig.suptitle("Quality Metrics Summary", fontsize=14) - - # KS statistics - ax = axes[0, 0] - ks_data = metrics["person_ks_stats"] - if ks_data: - vars_list = list(ks_data.keys()) - values = list(ks_data.values()) - colors = ["green" if v < 0.1 else "orange" if v < 0.2 else "red" for v in values] - ax.bar(vars_list, values, color=colors, alpha=0.8) - ax.axhline(y=0.1, color="green", linestyle="--", alpha=0.5) - ax.set_ylabel("KS Statistic") - ax.set_title("Marginal Fidelity (KS Test)") - ax.tick_params(axis="x", rotation=45) - - # Variance ratios - ax = axes[0, 1] - var_data = metrics.get("person_variance_ratios", {}) - if var_data: - vars_list = list(var_data.keys()) - values = list(var_data.values()) - colors = ["green" if 0.8 <= v <= 1.2 else "orange" if 0.5 <= v <= 2 else "red" for v in values] - ax.bar(vars_list, values, color=colors, alpha=0.8) - ax.axhline(y=1.0, color="black", linestyle="--", alpha=0.5) - ax.axhline(y=0.8, color="gray", linestyle=":", alpha=0.3) - ax.axhline(y=1.2, color="gray", linestyle=":", alpha=0.3) - ax.set_ylabel("Variance Ratio (Synth / Orig)") - 
ax.set_title("Dispersion Preservation") - ax.tick_params(axis="x", rotation=45) - - # Zero-inflation - ax = axes[1, 0] - if "income_zero_fraction_original" in metrics: - labels = ["Original", "Synthetic"] - values = [ - metrics["income_zero_fraction_original"], - metrics["income_zero_fraction_synthetic"], - ] - colors = ["steelblue", "coral"] - ax.bar(labels, values, color=colors, alpha=0.8) - ax.set_ylabel("Zero Fraction") - ax.set_title(f"Income Zero-Inflation (Error: {metrics['income_zero_fraction_error']:.4f})") - - # Key metrics summary - ax = axes[1, 1] - summary_data = { - "Mean KS": metrics.get("person_mean_ks", 0), - "Mean dCor Error": metrics.get("dcor", {}).get("mean_dcor_error", 0), - "Energy Distance": metrics.get("energy_distance", 0), - } - ax.bar(list(summary_data.keys()), list(summary_data.values()), color="steelblue", alpha=0.8) - ax.set_ylabel("Metric Value") - ax.set_title("Key Quality Metrics (Lower is Better)") - ax.tick_params(axis="x", rotation=15) - - plt.tight_layout() - plt.savefig(output_dir / "metrics_summary.png", dpi=150, bbox_inches="tight") - print(" Saved: metrics_summary.png") - plt.close() - - -def save_metrics_report(metrics: dict, output_dir: Path): - """Save metrics as markdown report.""" - report_path = output_dir / "cps_synthesis_report.md" - - with open(report_path, "w") as f: - f.write("# CPS ASEC Hierarchical Synthesis Report\n\n") - f.write(f"**Generated:** {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") - - f.write("## Executive Summary\n\n") - f.write(f"- **Mean KS Statistic (Person):** {metrics.get('person_mean_ks', 'N/A'):.4f}\n") - f.write(f"- **Mean KS Statistic (Household):** {metrics.get('hh_mean_ks', 'N/A'):.4f}\n") - f.write(f"- **Mean dCor Error:** {metrics.get('dcor', {}).get('mean_dcor_error', 'N/A'):.4f}\n") - f.write(f"- **Energy Distance:** {metrics.get('energy_distance', 'N/A'):.4f}\n\n") - - f.write("## Person-Level Marginal Fidelity (KS Statistics)\n\n") - f.write("| Variable | KS Statistic | 
Status |\n") - f.write("|----------|--------------|--------|\n") - for var, ks in metrics.get("person_ks_stats", {}).items(): - status = "Good" if ks < 0.1 else "Acceptable" if ks < 0.2 else "Poor" - f.write(f"| {var} | {ks:.4f} | {status} |\n") - - f.write("\n## Variance Ratios (Dispersion Check)\n\n") - f.write("| Variable | Ratio | Status |\n") - f.write("|----------|-------|--------|\n") - for var, ratio in metrics.get("person_variance_ratios", {}).items(): - status = "Good" if 0.8 <= ratio <= 1.2 else "Acceptable" if 0.5 <= ratio <= 2 else "Poor" - f.write(f"| {var} | {ratio:.4f} | {status} |\n") - - f.write("\n## Zero-Inflation Accuracy\n\n") - if "income_zero_fraction_original" in metrics: - f.write(f"- Original zero fraction: {metrics['income_zero_fraction_original']:.4f}\n") - f.write(f"- Synthetic zero fraction: {metrics['income_zero_fraction_synthetic']:.4f}\n") - f.write(f"- Absolute error: {metrics['income_zero_fraction_error']:.4f}\n") - - f.write("\n## Distance Correlation (dCor) - Nonlinear Relationships\n\n") - dcor_data = metrics.get("dcor", {}) - if dcor_data.get("dcor_errors"): - f.write(f"**Mean dCor Error:** {dcor_data['mean_dcor_error']:.4f}\n\n") - f.write("| Variable Pair | Original | Synthetic | Error |\n") - f.write("|---------------|----------|-----------|-------|\n") - for pair in list(dcor_data["dcor_errors"].keys())[:10]: - orig = dcor_data["original_dcors"][pair] - synth = dcor_data["synthetic_dcors"][pair] - error = dcor_data["dcor_errors"][pair] - f.write(f"| {pair} | {orig:.4f} | {synth:.4f} | {error:.4f} |\n") - - f.write("\n## Household Size Distribution\n\n") - if "hh_size_dist_original" in metrics: - f.write("| Size | Original | Synthetic |\n") - f.write("|------|----------|----------|\n") - orig_dist = metrics["hh_size_dist_original"] - synth_dist = metrics["hh_size_dist_synthetic"] - all_sizes = sorted(set(orig_dist.keys()) | set(synth_dist.keys())) - for size in all_sizes: - orig = orig_dist.get(size, 0) - synth = 
synth_dist.get(size, 0) - f.write(f"| {size} | {orig:.4f} | {synth:.4f} |\n") - - f.write("\n## Visualizations\n\n") - f.write("- `person_distributions.png` - Person variable distributions\n") - f.write("- `household_distributions.png` - Household composition\n") - f.write("- `dcor_comparison.png` - Distance correlation analysis\n") - f.write("- `age_income_relationship.png` - Conditional relationship\n") - f.write("- `metrics_summary.png` - Quality metrics overview\n") - - f.write("\n## Interpretation Guide\n\n") - f.write("**KS Statistic:**\n") - f.write("- < 0.1: Excellent marginal match\n") - f.write("- 0.1-0.2: Acceptable\n") - f.write("- > 0.2: Poor match\n\n") - - f.write("**Variance Ratio:**\n") - f.write("- 0.8-1.2: Good dispersion preservation\n") - f.write("- < 0.8: Under-dispersed (mode collapse risk)\n") - f.write("- > 1.2: Over-dispersed\n\n") - - f.write("**dCor Error:**\n") - f.write("- < 0.05: Excellent relationship preservation\n") - f.write("- 0.05-0.1: Good\n") - f.write("- > 0.1: Nonlinear relationships may not be captured\n\n") - - f.write("**Energy Distance:**\n") - f.write("- 0 = identical distributions\n") - f.write("- Lower is better\n") - - print(f"\nSaved report: {report_path}") - - -def main(): - """Run CPS ASEC hierarchical synthesis demo.""" - - print("=" * 80) - print("CPS ASEC HIERARCHICAL SYNTHESIS DEMO") - print("=" * 80) - - # Configuration - n_synthetic_households = 10000 - epochs = 50 - sample_fraction = 0.2 # Use 20% of CPS data for faster demo - - output_dir = Path(__file__).parent / "results" - output_dir.mkdir(exist_ok=True) - print(f"\nOutput directory: {output_dir}") - - # Step 1: Load CPS data - print("\n" + "=" * 80) - print("STEP 1: Loading CPS ASEC Data") - print("=" * 80) - - try: - households, persons = load_cps_for_synthesis( - sample_fraction=sample_fraction, random_state=42 - ) - print("\nLoaded CPS ASEC data:") - print(f" Households: {len(households):,}") - print(f" Persons: {len(persons):,}") - print(f" Avg HH 
size: {len(persons) / len(households):.2f}") - - # Show data summary - print("\nHousehold variables:") - for col in households.columns[:10]: - print(f" {col}: {households[col].dtype}") - - print("\nPerson variables:") - for col in persons.columns[:10]: - print(f" {col}: {persons[col].dtype}") - - except FileNotFoundError: - print("\nWARNING: CPS data not found. Using synthetic sample data instead.") - print("To download real CPS data, run: python scripts/download_cps_asec.py\n") - - households, persons = create_sample_data(n_households=5000, seed=42) - print("Generated sample data:") - print(f" Households: {len(households):,}") - print(f" Persons: {len(persons):,}") - - # Step 2: Set up and fit the hierarchical synthesizer - print("\n" + "=" * 80) - print("STEP 2: Training HierarchicalSynthesizer") - print("=" * 80) - - schema = setup_schema_for_cps() - print("\nSchema configuration:") - print(f" HH vars: {schema.hh_vars}") - print(f" Person vars: {schema.person_vars}") - print(f" Derived vars: {list(schema.derived_vars.keys())}") - - synth = HierarchicalSynthesizer( - schema=schema, - hh_flow_kwargs={"n_layers": 4, "hidden_dim": 64}, - person_flow_kwargs={"n_layers": 6, "hidden_dim": 128}, - random_state=42, - ) - - print(f"\nTraining with {epochs} epochs...") - start_time = time.time() - - synth.fit( - households, - persons, - hh_weight_col="hh_weight" if "hh_weight" in households.columns else None, - epochs=epochs, - verbose=True, - ) - - train_time = time.time() - start_time - print(f"\nTraining completed in {train_time:.1f} seconds") - - # Step 3: Generate synthetic households - print("\n" + "=" * 80) - print(f"STEP 3: Generating {n_synthetic_households:,} Synthetic Households") - print("=" * 80) - - start_time = time.time() - synthetic_hh, synthetic_persons = synth.generate( - n_households=n_synthetic_households, verbose=True - ) - generate_time = time.time() - start_time - - print(f"\nGeneration completed in {generate_time:.1f} seconds") - print(f" Synthetic 
households: {len(synthetic_hh):,}") - print(f" Synthetic persons: {len(synthetic_persons):,}") - print(f" Avg HH size: {len(synthetic_persons) / len(synthetic_hh):.2f}") - - # Step 4: Evaluate quality - print("\n" + "=" * 80) - print("STEP 4: Evaluating Quality") - print("=" * 80) - - person_vars = ["age", "sex", "income", "employment_status", "education"] - hh_vars = ["n_persons", "n_adults", "n_children", "state_fips", "tenure"] - - metrics = compute_benchmark_metrics( - persons, - synthetic_persons, - households, - synthetic_hh, - person_vars, - hh_vars, - ) - - # Print summary - print("\n" + "=" * 80) - print("QUALITY METRICS SUMMARY") - print("=" * 80) - - print("\nPerson-Level Marginal Fidelity (KS Statistics):") - for var, ks in metrics.get("person_ks_stats", {}).items(): - status = "Good" if ks < 0.1 else "Fair" if ks < 0.2 else "Poor" - print(f" {var}: {ks:.4f} [{status}]") - print(f" Mean KS: {metrics.get('person_mean_ks', 0):.4f}") - - print("\nVariance Ratios (should be close to 1.0):") - for var, ratio in metrics.get("person_variance_ratios", {}).items(): - status = "Good" if 0.8 <= ratio <= 1.2 else "Fair" if 0.5 <= ratio <= 2 else "Poor" - print(f" {var}: {ratio:.4f} [{status}]") - - if "income_zero_fraction_error" in metrics: - print("\nZero-Inflation (Income):") - print(f" Original zero fraction: {metrics['income_zero_fraction_original']:.4f}") - print(f" Synthetic zero fraction: {metrics['income_zero_fraction_synthetic']:.4f}") - print(f" Error: {metrics['income_zero_fraction_error']:.4f}") - - dcor_data = metrics.get("dcor", {}) - if dcor_data: - print("\nDistance Correlation (captures nonlinear relationships):") - print(f" Mean dCor error: {dcor_data.get('mean_dcor_error', 0):.4f}") - - if "energy_distance" in metrics: - print(f"\nEnergy Distance (multivariate): {metrics['energy_distance']:.4f}") - - print("\nHousehold-Level Marginal Fidelity:") - for var, ks in metrics.get("hh_ks_stats", {}).items(): - status = "Good" if ks < 0.1 else "Fair" if 
ks < 0.2 else "Poor" - print(f" {var}: {ks:.4f} [{status}]") - print(f" Mean KS: {metrics.get('hh_mean_ks', 0):.4f}") - - # Step 5: Create visualizations and save report - print("\n" + "=" * 80) - print("STEP 5: Generating Visualizations and Report") - print("=" * 80) - - create_comparison_visualizations( - persons, - synthetic_persons, - households, - synthetic_hh, - metrics, - output_dir, - ) - - save_metrics_report(metrics, output_dir) - - # Save synthetic data - synthetic_hh.to_parquet(output_dir / "synthetic_households.parquet") - synthetic_persons.to_parquet(output_dir / "synthetic_persons.parquet") - print(f"\nSaved synthetic data to {output_dir}") - - # Final summary - print("\n" + "=" * 80) - print("DEMO COMPLETE") - print("=" * 80) - print(f"\nResults saved to: {output_dir}") - print(" - cps_synthesis_report.md: Full quality report") - print(" - person_distributions.png: Person variable comparisons") - print(" - household_distributions.png: Household composition") - print(" - dcor_comparison.png: Distance correlation analysis") - print(" - age_income_relationship.png: Conditional relationship") - print(" - metrics_summary.png: Quality metrics overview") - print(" - synthetic_households.parquet: Generated households") - print(" - synthetic_persons.parquet: Generated persons") - - print("\nKey Results:") - print(f" Training time: {train_time:.1f}s") - print(f" Generation time: {generate_time:.1f}s") - print(f" Mean KS (person): {metrics.get('person_mean_ks', 0):.4f}") - print(f" Mean dCor error: {dcor_data.get('mean_dcor_error', 0):.4f}") - - return metrics - - -if __name__ == "__main__": - main() diff --git a/paper/.gitignore b/paper/.gitignore deleted file mode 100644 index ad293093..00000000 --- a/paper/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/.quarto/ -**/*.quarto_ipynb diff --git a/paper/AFFILIATION.md b/paper/AFFILIATION.md deleted file mode 100644 index ed4f449d..00000000 --- a/paper/AFFILIATION.md +++ /dev/null @@ -1,14 +0,0 @@ -# Affiliation and 
independence — rules for this paper - -**Sole affiliation**: PolicyEngine. - -PolicyEngine is the author's affiliation and also publishes the incumbent Enhanced CPS benchmark used in this paper. To keep the paper's scope clear: - -- Max Ghenis appears only as "PolicyEngine" on the author byline. -- No co-authorship with other PolicyEngine team members is implied or acknowledged. -- Email is `max@policyengine.org`. -- Acknowledgments may thank published tools or reviewers but must not imply separate institutional sponsorship. -- Quotes from or comparisons to PE-US-data are framed as "the incumbent public tool we measure against," consistent with how `microplex-us/docs/superseding-policyengine-us-data.md` already treats the relationship. -- Any language in drafts that could read as "validated by an independent PolicyEngine team" must be rephrased. - -Apply this rule to every section: abstract, introduction, methods, acknowledgments, appendices, captions, and bibliography entries that credit an author affiliation. diff --git a/paper/README.md b/paper/README.md deleted file mode 100644 index 457181bd..00000000 --- a/paper/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# `microplex-us` paper - -Quarto manuscript and supporting materials. - -## Affiliation - -PolicyEngine-only. See `AFFILIATION.md` — this work is intentionally independent of PolicyEngine for tax-and-organization reasons. - -## Contents - -- `_quarto.yml` — project config, HTML + PDF outputs. -- `index.qmd` — main manuscript. -- `literature-review.qmd` — standalone literature survey, cited by the main paper. -- `references.bib` — BibTeX bibliography, confirmed citations only. -- `AFFILIATION.md` — hard rule on affiliation independence. Re-read before adding any acknowledgment or author line. - -## Build - -```bash -cd paper -quarto render # both HTML and PDF -quarto render index.qmd # main paper only -quarto preview # live-reload local server -``` - -Output lands in `_output/`. 
- -## Cross-references and figures - -Figures and tables are sourced from `../artifacts/` (`stage1_77k_snap.json`, `zi_maf_tuning.json`, `embedding_prdc_compare.json`, `calibrate_on_synthesizer.json`). When final figures land, they should be generated as Quarto chunks rather than hand-placed PNGs so they re-render against the latest artifact set. - -## Citation style - -APA via Quarto's built-in CSL. Change in `_quarto.yml` if the target journal has a different requirement. diff --git a/paper/REVIEW-RESPONSE.md b/paper/REVIEW-RESPONSE.md deleted file mode 100644 index 57682e6f..00000000 --- a/paper/REVIEW-RESPONSE.md +++ /dev/null @@ -1,218 +0,0 @@ -# Consolidated referee review and revision plan - -*Five subagent referee reviews ran in parallel on 2026-04-17 evening on the paper scaffold. This doc synthesizes their findings into an ordered revision plan.* - -## Reviewer verdicts - -| Reviewer | Verdict | Main issue | -|---|---|---| -| Citation | Minor revisions | Synthcity author mismatch; identity-preservation framing overstated vs Dekkers 2015 | -| Methodology | Major revisions | Single-seed, non-converged calibration presented as final, correlated "robustness checks" | -| Domain | Major revisions | 36 "target columns" are inputs not policy outputs; ecosystem under-represented | -| Stylistic | Major revisions | 4 of 7 body sections are stubs; solo-authored "we"; documentation register | -| Reproducibility | Major revisions | No code/data availability statement; 2 of 4 robustness checks used pre-snap data | - -Four of five reviewers reach Major Revisions. The draft is not submittable in its current state but is recoverable within 1–2 weeks of focused work. - -## Critical findings (blocker before submission) - -### B1. 
Two "independent robustness checks" used the pre-snap broken pipeline [RESOLVED] - -The reproducibility reviewer identified that `artifacts/embedding_prdc_compare.json` (Apr 17 08:03) and `artifacts/calibrate_on_synthesizer.json` (Apr 17 08:06) predated the snap fixes (harness-side at 12:06, upstream-core at 12:20). Both scripts called `method.fit` and `method.generate` directly without invoking `_snap_categorical_shared_cols`. - -**Resolution (2026-04-17 21:15/21:17)**: both scripts were re-run against the post-fix upstream `microplex` (commit `81a5e10`, "Only smooth-noise continuous shared cols, not categorical ones"). The pre-fix artifacts were preserved with a `.pre-snap.json` suffix for audit; the post-fix artifacts replaced the original `.json` filenames. Comparison: - -| Artifact | Pre-snap coverage (ZI-QRF, 40k raw) | Post-snap coverage (ZI-QRF, 40k raw) | -|---|---:|---:| -| `embedding_prdc_compare.json` | 0.348 | 0.982 | -| `calibrate_on_synthesizer.json` | pre-cal rel-err 0.256 | pre-cal rel-err 0.317, post-cal 0.105 | - -Ordering is preserved (ZI-QRF > ZI-QDNN > ZI-MAF) under both regimes; absolute post-snap numbers are the ones reported in §5. Paper text at lines 252–268 already references the post-snap artifacts. - -### B2. The 36 "target columns" are input variables, not policy outputs - -The domain reviewer's single most important finding: the paper uses `employment_income_last_year`, `snap_reported`, `ssi_reported`, etc. — CPS-reported amounts — as "targets." A tax-microsim reviewer expects "targets" to mean policy outputs: federal income tax liability, state income tax, computed EITC/CTC, SNAP benefits under program rules, SSI amounts. - -Two options: - -- **Rename**. Call them "conditioning income and benefit columns" or "target income components." Do this at minimum; the current language is misleading. -- **Add downstream validation**. 
Run `policyengine-us` (and/or TAXSIM, Tax-Calculator, TPC — whichever the reviewer population cares about most) on microplex-us output data and report computed federal tax, EITC disbursed, CTC disbursed, SNAP/SSI/ACA PTC aggregates against external benchmarks (IRS SOI tables, USDA SNAP totals, SSA SSI totals, CBO SNAP outlays). This is the test a tax-microsim reviewer actually wants. - -Recommendation: do both. Rename immediately; add the downstream validation as a major new results subsection. - -### B3. Four of seven body sections are stubs - -Architecture (§3), Methods (§4), rare-cell subsection (§5.3), Discussion (§6), Conclusion (§8) are either parenthetical placeholders or explicit TBD. Not submittable in this state. - -**Action**: work through these in order. Methods first (reviewer can't evaluate anything else until they know what was done). Architecture second. Results-rare-cell third. Discussion and Conclusion last. - -### B4. No Code and Data Availability statement - -Standard requirement at every target venue. Must state data source (HuggingFace URL with pinned revision), code repository, software versions, Python version, OS tested, hardware, expected wall time, license. - -**Action**: add `## Code and Data Availability` section after Limitations. One paragraph. - -### B5. Conflicts of Interest disclosure missing - -Author founded PolicyEngine and previously led Enhanced CPS work (cited extensively in this paper). The `AFFILIATION.md` rule is followed in the byline and acknowledgments, but silence on the prior affiliation is a disclosure gap. Per domain reviewer: "Silence on the question will read worse than acknowledgement." - -**Action**: add explicit COI statement. Template: "The author founded PolicyEngine and previously led work on Enhanced CPS [@ghenis2024ecps]. The present work is conducted at PolicyEngine, the non-profit organization that publishes Enhanced CPS. 
PolicyEngine's Enhanced CPS is cited as the incumbent public tool against which microplex-us is measured." - -## High-priority revisions (before review circulation) - -### H1. Convert first-person plural to first-person singular (or third-person) - -Solo-authored paper uses "we" throughout both documents. Per the project's global style rule and the target venues' conventions, this should be "I" or third-person recast. The stylistic reviewer identified ~20 instances needing judgment-based conversion (global find-and-replace won't work). - -### H2. Self-contain the Related Work section - -Line 56 of `index.qmd` says "A full literature review for this paper is maintained in `literature-review.qmd`." This is a documentation move, not an academic one. Self-contain §2 with 400–600 words of prose. Keep `literature-review.qmd` as supplementary material. - -### H3. Remove all documentation-register artifacts - -- `*(This section is being written against the spec-based-ecps-rewire branch...)*` — convert to outline-as-prose. -- `[report low]` editorial marker at line ~100 — resolve. -- `77,006 × 50 scale` — rewrite as "77,006 records across 50 columns." -- "keeps every record alive" — "preserves all records" or "retains positive weight on every record." -- "mainline" — "primary calibration mechanism." -- Artifact paths referenced in body text — remove. - -### H4. Tables need captions, numbers, cross-reference labels - -All three tables are bare Markdown pipe-tables with no caption, no number, no Quarto `{#tbl-...}` label. Required for IJM / NTJ / JASA. - -### H5. Add at least one figure - -Pipeline schematic (source providers → donor blocks → chained QRF → calibration → L0 post-step) is the obvious first figure. Methods papers at the target tier with zero figures are unusual. - -### H6. Quantify or soften "widely-used upstream benchmark base class" - -Abstract claims the noise-injection defect "systematically biased earlier synthesizer comparisons." 
Evidence cited is one pre/post table on three methods using one base class. Either name the affected published benchmarks or soften to "introduced systematic bias into synthesizer comparisons using this base class." - -### H7. Citation form consistency - -Audit every `[@key]` vs `@key` for correct parenthetical vs textual intent. Pandoc renders them differently. - -## Medium-priority revisions (quality improvements) - -### M1. Uncertainty quantification - -Every headline table is a single-seed point estimate. Methodology reviewer correctly notes this is weak for a methods paper. ZI-QRF runs in 37 seconds — running 5-10 seeds is trivial compute. Report means with standard errors, or at least ordering-stability counts ("ordering preserved in 10/10 seeds"). - -### M2. Rerun with calibration converged - -All three entries in `artifacts/calibrate_on_synthesizer.json` have `"calibration_converged": false` at 200 epochs. The docs acknowledge this; the paper does not. Rerun at 1000-2000 epochs or report the epoch budget and frame as "fraction of pre-cal gap closed" rather than absolute post-cal error. - -### M3. Formal definition of identity preservation - -Currently asserted as an architectural property but never defined. Add Definition 1 in §3: *A weight-adjustment procedure $\phi: w \to w'$ is identity-preserving if $\forall i: w_i' > 0$ and $\phi$ does not drop records.* Either cite that `microcalibrate`'s gradient step satisfies this, or prove it. - -### M4. Embedding-PRDC circularity - -Autoencoder is fit on holdout only. Potential bias toward methods that match holdout idiosyncrasies. Re-run with AE fit on train (or an independent third partition). Report both. - -### M5. Soften "novel to PolicyEngine" Forbes claim - -Domain reviewer identified the SCF + Forbes precedent: Bricker-Henriques-Hansen-Moore (2016), Vermeulen (2018), Kennickell (2019). The tax-microsim integration remains novel; the broader pattern has precedent. 
Rewrite: "While top-wealth augmentation from Forbes-style lists is established practice in distributional national accounts [cites], its integration into a production tax-microsim pipeline is to our knowledge first done in policyengine-us-data." - -### M6. Cross-sectional motivation for identity preservation - -Domain reviewer: "Identity preservation also matters cross-sectionally for interpretability, subgroup analysis, confidentiality auditing, reproducibility and provenance." Add two paragraphs in Discussion making the cross-section case alongside the longitudinal case. - -### M7. ZI-QRF substrate circularity - -ECPS itself is QRF-constructed. ZI-QRF's win may be partly method-substrate match. Either add a non-ECPS robustness check (raw CPS ASEC or SCF) or explicitly note the circularity as a limitation. - -### M8. Target-set expansion - -Add Medicaid/CHIP, ACA PTC, mortgage interest, charitable contributions, medical expenses, property tax. Rerun at the expanded target set. - -### M9. Snap heuristic cardinality guard - -Stylistic and methodology reviewers flag that `_snap_categorical_shared_cols` fires on any integer-valued column, which could accidentally snap continuous-but-rounded columns (currency stored in dollars). Add cardinality threshold (e.g., snap only when `n_unique <= 50`). - -### M10. Decouple PRDC seed from split seed - -Currently both are `self.config.seed`. Use `seed + k` for the PRDC subsample. Average PRDC over 5+ subsample seeds per split to separate metric noise from split noise. - -## Low-priority revisions (cosmetic) - -### L1. Fix citation errors - -- Synthcity: author list should be Qian, Davis, van der Schaar for the NeurIPS 2023 D&B paper (not Cebere). Citation reviewer flagged as MAJOR but fix is trivial. -- Add TabPFGen (Ma et al., arXiv 2406.05216, 2024) — referenced in lit review but not cited. -- Add CTAB-GAN+ (Zhao et al. 2023, Frontiers in Big Data). -- Add Auten-Splinter (2024) as DINA counterweight to PSZ 2018. 
-- Add Meyer-Mok-Sullivan on CPS benefit under-reporting. -- Add Czajka-Hirabayashi-Moffitt-Scholz (1992) for statistical matching lineage. -- Add Ruggles (2025 PNAS) as engagement point. -- Remove `zhang2017privbayes` (unused) or cite. - -### L2. URL / DOI completeness - -Add URLs/DOIs for: patki2016sdv (IEEE DOI 10.1109/DSAA.2016.49), xu2019modeling (NeurIPS proceedings), naeem2020prdc (PMLR), kotelnikov2023tabddpm (PMLR), borisov2023great (OpenReview), and others listed by the citation reviewer. - -### L3. Bibliography cleanup - -- `solatorio2023realtabformer` should be `@misc` not `@article` with `journal = {arXiv preprint}`. -- `dementen2014liam2` needs `{de Menten}, Gaetan` brace protection. -- Standardize URL-only vs DOI-only policy (document the rule once). - -### L4. Table formatting - -- Pick one bolding rule (all best-per-column or none). -- Spell out abbreviated headers ("Fit (s)" → "Fit time (s)") or footnote them. -- Expand "Pre-cal" / "Post-cal" to "Before calibration" / "After calibration." - -### L5. Abstract cleanup - -- Expand ZI-QRF / ZI-QDNN / ZI-MAF / PRDC on first use. -- Replace "keeps every record alive," "mainline," "77,006 × 50 scale" per H3. -- Either support or drop "widely-used" (H6). - -### L6. Remove unused references from `.bib` - -`ruggles2025synth` (cited in lit review but not index.qmd; consider citing in index.qmd per domain reviewer M1), `zhang2017privbayes`. - -### L7. Cite each data product on first reference - -CPS ASEC, ACS, PUF, SCF, SIPP need primary-source citations on first use. - -### L8. Repository hygiene - -- Add `LICENSE` file at repo root. -- Add regression test for ordering (e.g., `test_stage1_10k_ordering`). -- Move paper tables to Quarto chunks that read from `../artifacts/*.json` to auto-update. - -## Revision order - -Roughly the sequence to work through: - -1. **Rerun pre-snap artifacts** (B1). Half-hour compute. -2. **Rename target columns + add downstream tax-output validation** (B2). 
Several days; the downstream run is non-trivial. -3. **Draft §3 Architecture** (B3). One to two days. -4. **Draft §4 Methods** (B3). One day. -5. **Add Code and Data Availability statement + COI** (B4, B5). One hour. -6. **Convert voice to first-person singular** (H1). Several hours, judgment-by-judgment. -7. **Self-contain Related Work** (H2). Half-day. -8. **Strip documentation register** (H3). Hours. -9. **Table captions, numbering, labels** (H4). Hour. -10. **Pipeline diagram** (H5). Hour (one TikZ / mermaid / svg figure). -11. **Soften the "widely-used" claim** (H6). Minutes. -12. **Citation form audit** (H7). Hour. -13. **Draft §5.3 rare-cell + §6 Discussion + §8 Conclusion** (B3 cont.). Two days. -14. **Medium-priority revisions** (M1–M10). Several days. -15. **Low-priority / cosmetic** (L1–L8). Final pass. - -Total budget estimate: 2–3 weeks to a submittable draft, assuming the downstream tax-output validation is the bottleneck. - -## What the reviewers got wrong - -Two minor issues where the reviews overstated the gap: - -- Reproducibility reviewer said `zi_maf_tuning.json` is missing; it is present at `artifacts/zi_maf_tuning.json` (verified). The reviewer's grep missed it. -- Citation reviewer flagged the identity-preservation framing as overstating the gap vs Dekkers (2015). Dekkers does discuss identity under static vs dynamic ageing; what the paper claims is novel is the cross-sectional calibration-layer framing, which Dekkers does NOT discuss. But the reviewer's point stands that the literature review should cite Dekkers and clarify which layer the claim refers to. - -## Reviews kept for reference - -Full reviewer outputs are preserved in the `a*` agent IDs noted by the subagent framework. If a rebuttal is needed later, those sessions can be resumed via `SendMessage`. 
diff --git a/paper/_quarto.yml b/paper/_quarto.yml deleted file mode 100644 index 035af6b2..00000000 --- a/paper/_quarto.yml +++ /dev/null @@ -1,61 +0,0 @@ -project: - type: default - output-dir: _output - -title: "Identity-preserving synthesis and calibration for US tax-benefit microdata" -author: - - name: Max Ghenis - affiliation: PolicyEngine - email: max@policyengine.org - -date: last-modified -abstract: | - Tax and benefit microsimulation depends on synthetic microdata whose accuracy - must survive both national-scale aggregates and longitudinal extensions. - We introduce `microplex-us`, a spec-driven US synthesis and calibration - runtime with three architectural properties: (1) chained quantile-regression- - forest (QRF) imputation across independent administrative and survey - sources, (2) identity-preserving gradient-descent chi-squared calibration - that keeps every record alive through calibration, and (3) sparse L0 record - selection reserved as an optional post-step for deployment subsamples rather - than a calibration mainline. We benchmark three zero-inflated synthesizers - (ZI-QRF, ZI-QDNN, ZI-MAF) on the full PolicyEngine Enhanced CPS 2024 at - 77,006 × 50 scale and find ZI-QRF dominates on PRDC coverage (0.928 vs. 0.707 - for ZI-QDNN and 0.106 for ZI-MAF), with consistent ordering under four - independent robustness checks. We further document a previously unreported - noise-injection defect in the `microplex.eval.benchmark` base class that - systematically biased earlier synthesizer benchmarks on integer-valued - conditioning variables, and publish corrected results. The paper situates - these findings in the microsimulation and synthetic-microdata literature, - identifies where `microplex-us` extends existing techniques, and argues that - identity preservation is a load-bearing but under-named architectural - requirement whenever cross-sectional microdata must feed a longitudinal - policy model. 
- -format: - html: - toc: true - toc-depth: 3 - number-sections: true - theme: cosmo - fig-cap-location: bottom - tbl-cap-location: top - code-fold: true - pdf: - documentclass: article - geometry: - - margin=1in - number-sections: true - fig-cap-location: bottom - tbl-cap-location: top - -bibliography: references.bib -# csl: chicago-author-date.csl # opt: pin when a target journal CSL is chosen - -execute: - echo: false - warning: false - message: false - -filters: - - quarto diff --git a/paper/index.qmd b/paper/index.qmd deleted file mode 100644 index 04abf2fe..00000000 --- a/paper/index.qmd +++ /dev/null @@ -1,368 +0,0 @@ ---- -title: "Identity-preserving synthesis and calibration for US tax-benefit microdata" -short-title: "microplex-us" -author: - - name: Max Ghenis - affiliation: PolicyEngine - email: max@policyengine.org -date: last-modified -abstract: | - Tax and benefit microsimulation depends on synthetic microdata whose - accuracy must satisfy both national-scale aggregates and longitudinal - extensions. This paper introduces `microplex-us`, a spec-driven US - synthesis and calibration runtime with three architectural properties: - (1) chained quantile-regression-forest (QRF) imputation across - heterogeneous administrative and survey sources; (2) identity-preserving - gradient-descent chi-squared calibration that retains positive weight on - every record; and (3) sparse L0 record selection reserved as an optional - post-processing step rather than as the primary calibration mechanism. - The paper benchmarks three zero-inflated synthesizers — quantile - regression forests (ZI-QRF), quantile deep neural networks (ZI-QDNN), - and masked autoregressive flows (ZI-MAF) — on 77,006 Enhanced CPS 2024 - records across 50 variables, finding that ZI-QRF dominates on - Precision/Recall/Density/Coverage (PRDC; coverage 0.928 vs. 0.707 for - ZI-QDNN and 0.106 for ZI-MAF) with the ordering preserved across - multiple sensitivity checks. 
The paper also documents a previously - unreported noise-injection defect in the `microplex.eval.benchmark` - base class that caused consistent downward bias in earlier synthesizer - comparisons on categorical conditioning variables, and publishes - corrected results. - -keywords: [synthetic microdata, survey calibration, microsimulation, tabular - data synthesis, quantile regression forests, identity-preserving - calibration] -bibliography: references.bib -format: - html: - toc: true - toc-depth: 3 - number-sections: true - pdf: - documentclass: article - geometry: margin=1in - number-sections: true ---- - -# Introduction {#sec-intro} - -Tax and benefit microsimulation models rely on microdata that are simultaneously aggregate-accurate (matching IRS Statistics of Income, Census, and administrative targets to tight tolerances) and individually credible (preserving joint structure in incomes, demographics, and wealth). In the US, the available public microdata surfaces — Census's Current Population Survey (CPS), the American Community Survey (ACS), IRS's Statistics of Income Public Use File (PUF), the Survey of Consumer Finances (SCF), and the Survey of Income and Program Participation (SIPP) — each observe only a slice of the variables that an end-to-end tax-benefit simulator requires. Constructing a useful microdata base means combining slices. - -The dominant public approach in the US today is [@ghenis2024ecps]'s Enhanced CPS, which augments CPS ASEC with PUF-imputed tax variables via quantile regression forests and calibrates the result against thousands of IRS, Census, and administrative targets. This paper builds on that lineage — it is not the first attempt to solve the problem — but contributes along four axes where the literature is thin: - -1. **A spec-driven donor integration runtime** that separates donor-block contracts from backend implementation, allowing independent benchmarking of conditioning, imputer, and entity-projection choices. -2. 
**Identity-preserving calibration** as an explicit architectural requirement — framed to support longitudinal extensions where records must persist across simulation years. -3. **A head-to-head comparison of QRF-family and neural synthesizers** on real US economic microdata at production scale — a cell of the evaluation matrix that, to my knowledge, no prior published work occupies. -4. **A correction to a benchmark-base-class noise-injection defect** in the upstream `microplex.eval.benchmark` module that had systematically biased earlier synthesizer comparisons on integer-valued conditioning variables. - -This paper does not claim foundational methodological novelty. Every mechanism used below exists in the published literature: quantile regression forests [@meinshausen2006qrf], chained imputation [@vanbuuren2011mice], calibration with range-restricted distances [@deville1992calibration], L0 sparse regularization [@louizos2018l0], support-based generative evaluation [@naeem2020prdc]. The contribution is in the composition and the empirical evidence that results. - -# Background and related work {#sec-related} - -The present work sits across four literatures: survey calibration, synthetic tabular data generation, tabular-synthesis evaluation metrics, and US tax-benefit microsimulation. A supplementary literature review accompanies this paper with an expanded treatment; the following summary frames the specific prior work each contribution builds on. - -## Survey calibration {#sec-related-calibration} - -Classical calibration originates with @deville1992calibration, which defines the calibration estimator as a constrained weight adjustment minimizing a distance function from design weights subject to linear moment constraints. Generalized raking extends this to categorical margins via iterative proportional fitting [@deville1993raking; @deming1940adjustment]. 
Range-restricted variants with bounded-positive distance functions (logit, truncated-linear) guarantee non-negative weights by construction. @devaud2019calibration provides the current treatment of existence and feasibility conditions; @haziza2017weights and @kott2016calibration are the recent reviews. Entropy balancing [@hainmueller2012entropy] is mathematically adjacent, using Kullback-Leibler divergence with moment constraints, and also produces strictly positive weights. - -L0 regularization entered the machine-learning literature via hard-concrete stochastic gates [@louizos2018l0], which made L0 differentiable and compatible with gradient-based optimization. Applying L0 selection to survey calibration as a record-sparsification step is recent; I find no earlier survey-statistics treatment of it as a first-class calibration technique, only as a post-calibration record subset selector for deployment artifacts. - -## Synthetic tabular data generation {#sec-related-tabular} - -Modern tabular synthesis starts with the Synthetic Data Vault [@patki2016sdv] and `synthpop` [@nowok2016synthpop], which establishes the CART-based sequential approach. CTGAN and TVAE [@xu2019modeling] introduce neural tabular synthesis; TabDDPM [@kotelnikov2023tabddpm] brings diffusion. Language-model-based approaches appear in GReaT [@borisov2023great] and REaLTabFormer [@solatorio2023realtabformer]. TabSyn [@zhang2024tabsyn] combines latent-space score-based diffusion with competitive benchmarks. Tabular foundation models now include TabPFN v2 [@hollmann2025tabpfn], though its primary contribution is prediction rather than synthesis. - -Quantile regression forests [@meinshausen2006qrf] are not usually grouped with the tabular-synthesis literature, but they are the method Enhanced CPS and several industrial microsim pipelines use for per-column imputation. In the benchmarking below I treat ZI-QRF on equal footing with the neural synthesizers. 
- -Published head-to-head comparisons of QRF-family and neural synthesizers on real US economic microdata at production scale are scarce. @little2025synth compares synthpop, DataSynthesizer, CTGAN, and TVAE on census microdata in four countries and finds CART-based synthpop dominates; @bowen2022puf document a synthetic supplemental PUF built on IRS Statistics of Income data using sequential CART. Neither includes QRF or ZI-QRF against modern deep generators. @ruggles2025synth offers a recent critique of fully-synthetic census microdata as a replacement for design-based public-use files; the present paper's scope is narrower (augmenting an existing public-use file rather than replacing one). - -## Evaluation metrics {#sec-related-metrics} - -@naeem2020prdc establishes precision, recall, density, and coverage as the support-based quality quad, originally validated in image-generator Inception-embedding space. Benchmarking frameworks including Synthcity [@qian2023synthcity] and SDMetrics aggregate PRDC alongside column-wise Kolmogorov-Smirnov distances, pairwise correlation differences, and Train-on-Synthetic/Test-on-Real utility. - -Two documented failure modes matter for the present work. First, @park2023probabilistic show that outliers inflate density and coverage because the $k$-NN support construction over-inflates the manifold around them — a material concern for heavy-tailed income microdata. Second, @beyer1999nn and @aggarwal2001surprising show $k$-NN distances concentrate in high-dimensional spaces, causing the coverage radius to degenerate above ~10-15 dimensions. These motivate reporting multiple metrics alongside PRDC and testing whether orderings survive dimensionality reduction; I do both in the results section. @alaa2022precision introduces sample-level $\alpha$-precision and $\beta$-recall as more outlier-robust alternatives. 
- -## US tax microsimulation {#sec-related-tax-microsim} - -@toder2024microsim is the current umbrella review of the US tax-microsim ecosystem. Active models include TAXSIM [@feenberg1993taxsim], Tax-Calculator [@debacker2019taxcalc], the Tax Policy Center and CBO in-house models [@cbo2018taxmodel], the Budget Lab at Yale, and PolicyEngine-US-Data (Enhanced CPS; @ghenis2024ecps). These differ along several axes: whether they ship a calculator, a microdata constructor, or both; what substrate microdata they use (CPS-PUF matched, pure CPS, pure PUF, administrative linkage); how they augment for top incomes; and whether they are open-source. Enhanced CPS is the public-microdata contribution that `microplex-us` builds on. - -@bowen2022puf is the canonical methodology paper for synthetic IRS PUF, using sequential CART under differential-privacy constraints. The Forbes-style top-wealth augmentation pattern that enters tax-microsim microdata via PolicyEngine-US-Data has precedent in distributional-national-accounts work: @piketty2018dina and @saez2016wealth augment SCF with top-wealth records for capitalized-income estimation. Porting this augmentation pattern into a production tax-microsim pipeline is, to my knowledge, first done in PolicyEngine-US-Data; I adopt it without further innovation. - -## Longitudinal microsimulation {#sec-related-longitudinal} - -DYNASIM3 [@favreault2004dynasim], MINT [@smith2013mint], CBOLT [@cbo2018cbolt], and the LIAM2 family [@dementen2014liam2; surveyed in @odonoghue2001dynamicsurvey] are the dominant US and international longitudinal microsimulation models. All use static-ageing with alignment to external totals and therefore preserve record identity implicitly — records are aged forward, not dropped. Identity preservation is not a named concept in the survey statistics or longitudinal-microsim literatures. 
The closest named property in classical calibration is *range-restricted calibration with positive lower bound* [@deville1992calibration]. I argue in §3.4 for making identity preservation an explicit architectural requirement at the cross-sectional imputation and calibration layer, because the cross-sectional artifact is the input substrate to longitudinal simulation and breaking identity there is the quickest way to make a microsim un-chainable across years. - -# Architecture {#sec-architecture} - -`microplex-us` is structured around four layers: source providers, declarative donor blocks, a chained imputation engine, and a calibration backend protocol (@fig-pipeline). The top-level build entry point (`microplex_us.pipelines.us.USMicroplexPipeline.build_from_source_providers`) composes these layers into a single end-to-end run that produces a PolicyEngine-ingestable HDF5 artifact plus parity diagnostics. This section describes each layer and names the specific design choices that differentiate the runtime from incumbent construction pipelines. - -```{mermaid} -%%| label: fig-pipeline -%%| fig-cap: "`microplex-us` pipeline architecture. Source providers load raw survey and administrative microdata at their native entity levels. Donor blocks declare target variables, conditioning surfaces, and zero-inflation policies as JSON manifests. The chained imputation engine integrates each block in a DAG order respecting conditioning-variable dependencies. PolicyEngine entity-table construction projects the flat frame into the multi-entity schema required for simulation. Identity-preserving calibration (`microcalibrate` gradient-descent chi-squared) adjusts per-record weights against the active PolicyEngine targets database. Optional sparse L0 record selection produces deployment subsamples. The final artifact is an HDF5 file directly ingestable by `policyengine-us.Microsimulation`." -flowchart TD - subgraph sources["Source providers"] - CPS[CPS ASEC
processed parquet] - PUF[IRS SOI PUF
administrative] - ACS[ACS PUMS
Census] - SIPP[SIPP tips + assets
panels] - SCF[SCF wealth
Federal Reserve] - FORBES[Forbes top-wealth
backbone] - end - - REG[Source + variable
capability registry] - BLOCKS[Donor block manifests
declarative JSON] - - subgraph imputation["Chained imputation engine"] - DAG[Dependency DAG
from block conditioning] - QRF[Quantile Regression Forest
per-variable draws] - end - - TABLES[PolicyEngine entity tables
households × persons × tax units × SPM × family] - - subgraph calibration["Calibration"] - MC[microcalibrate
gradient-descent chi-squared
identity-preserving] - L0[Optional L0 post-step
deployment subsample] - end - - H5[HDF5 artifact
policyengine-us ready] - - CPS --> REG - PUF --> REG - ACS --> REG - SIPP --> REG - SCF --> REG - FORBES --> REG - REG --> BLOCKS - BLOCKS --> DAG - DAG --> QRF - QRF --> TABLES - TABLES --> MC - MC --> L0 - MC --> H5 - L0 -.optional.-> H5 - - style MC fill:#cfe,stroke:#333 - style L0 fill:#fec,stroke:#333,stroke-dasharray: 5 5 -``` - -## Source providers and variable capabilities {#sec-arch-sources} - -A source provider is a narrow adapter that loads raw survey or administrative microdata into an `ObservationFrame` — a typed DataFrame with a declared entity level (person, household, tax unit, SPM unit, family, marital unit), a time period, and a set of `SourceVariableCapability` records that mark each variable as authoritative, usable-as-condition, or both. Source providers for `microplex-us` include CPS ASEC (via the PolicyEngine-maintained processed parquet cache), IRS Statistics of Income Public Use File, ACS, SIPP (tips and assets panels), SCF, and a Forbes top-wealth backbone. Each provider is self-contained: it declares the entity levels it observes, the vintage year, and the variable capabilities, and it emits frames at the declared entity level without projecting across entities at load time. - -Variable capabilities are stored in a single declarative registry (`microplex_us.source_registry`) that overrides a base `SourceVariableCapability` record per source-variable pair. This lets a downstream consumer ask "which sources observe `employment_income_last_year` as authoritative?" or "which sources have `age` available as a condition variable?" without running any imputation. The registry is the load-bearing artifact for donor-block planning. - -## Donor blocks as declarative contracts {#sec-arch-blocks} - -A donor block is a JSON-declarable spec describing the integration of one or more variables from a non-scaffold source into the current working frame. 
The block names (a) the block's native entity, (b) the target variables it produces, (c) the permitted conditioning variables, (d) the match strategy (nearest-neighbor hot-deck, chained QRF, share imputation), (e) the entity-projection policy if a donor observes a parent entity and the target is at a child entity, and (f) a zero-inflation policy when the target is zero-inflated. Blocks are loaded at pipeline start from `microplex_us/manifests/pe_source_impute_blocks.json` and resolved to executable tasks by `PESourceImputeBlockEngine`. - -The separation between block specification and engine execution is the feature that makes donor integration independently benchmarkable. A researcher can swap an imputer backend (QRF for chained QRF, a neural flow, statistical matching) without touching block contracts, and a new donor block can be added without touching engine code. Current production uses QRF per @meinshausen2006qrf for zero-inflated continuous targets and logistic-classifier-plus-quantile-regression for zero-inflated binary-and-continuous targets. - -## Chained QRF imputation {#sec-arch-chained-qrf} - -Donor blocks integrate in an order that respects the dependency DAG implied by their conditioning sets. Early blocks use only demographic and scaffold-observed conditioning (age, sex, education, household size); later blocks may condition on earlier-imputed variables (for example, a wealth block may condition on imputed AGI). This is a MICE-framework composition [@vanbuuren2011mice] where each per-variable draw uses a QRF rather than a linear regression, extending the chained-random-forest imputation pattern of @doove2014chainedrf and @stekhoven2012missforest. - -The novelty of the composition is not the QRF draw, which is standard; it is that the conditioning surface for each block is declarative (the block spec names its conditioning variables) and the engine enforces the DAG ordering automatically.
A block's conditioning surface is computed at resolution time by intersecting the block's declared conditioning variables with the current frame's available columns, so blocks gracefully degrade when earlier blocks fail. - -## Identity-preserving calibration {#sec-arch-calibration} - -After donor integration the frame is passed through PolicyEngine entity-table construction and then calibrated against a PolicyEngine targets database. The calibration backend is pluggable through `USMicroplexBuildConfig.calibration_backend`, which accepts values `entropy`, `ipf`, `chi2`, `sparse`, `hardconcrete`, `pe_l0`, `microcalibrate`, and `none`. The production default is `microcalibrate`, which invokes the country-agnostic `MicrocalibrateAdapter` (shipped as part of upstream `microplex` under the optional `calibrate` extra, so country packages such as `microplex-us` and planned `microplex-uk` inherit one identity-preserving calibrator without duplicating glue code) around the `microcalibrate` library's gradient-descent chi-squared solver. - -I define an *identity-preserving* weight adjustment as a procedure $\phi: w \to w'$ on a frame of $n$ records satisfying $\forall i \in \{1, \ldots, n\}: w_i' \geq 0$ and $\mathrm{id}(r_i') = \mathrm{id}(r_i)$: every input record survives to the output with the same entity identifier; no row is deleted from the frame, and no new row is created. The record's weight may become zero (excluding it from current-year aggregates) but the row and its entity identifiers persist. Identity preservation in this sense matters because cross-sectional microdata is the input substrate to longitudinal microsimulation, where entity identifiers must persist across simulation years for lifetime-earnings computation, panel analysis, and provenance; a dropped row destroys the cross-year linkage permanently. - -Two calibration families satisfy row-set preservation. 
The gradient-descent chi-squared calibration used by `microcalibrate` is strictly positive by construction ($w_i' > 0$) via a soft positivity penalty, which is the classical range-restricted calibration analog [@deville1992calibration]. L0-sparsified calibration (via PolicyEngine's `l0-python` with HardConcrete stochastic gates [@louizos2018l0]) allows some weights to reach exactly zero and is therefore weaker than strict positivity, but still satisfies row-set preservation because the weight array is returned at the original length with the same entity identifiers intact. The zero-weight rows are not dropped from the HDF5 dataset — they are available to year $Y+1$'s calibration to re-weight up. This is consistent with the CBOLT and DYNASIM convention of equal per-person weights frozen across a person's lifetime [@favreault2004dynasim; @cbo2018cbolt], where between-year population-level adjustment happens via alignment factors rather than per-record weight shifts; zero-sparsity on the cross-section gives a strict-superset of flexibility compared with frozen-weight approaches. - -The legacy entropy backend was retired at scale (above approximately 200,000 households) after repeated OOM failures during preliminary runs at 1.5 million household scale. Entropy calibration materializes dense scratch structures proportional to $n_{\text{records}} \times n_{\text{constraints}}$; at production scale with approximately 1,200 active constraints, the working set exceeded 48 GB of RAM. Gradient-descent chi-squared calibration also OOM'd in its first production run at this scale until two complementary fixes landed: the adapter now passes the estimate matrix as float32 rather than float64 pandas, and the upstream `microcalibrate` solver accumulates gradients over record batches (`batch_size` parameter, shipped in `microcalibrate` 0.22) so peak autograd activation is $O(B \times k)$ instead of $O(n \times k)$. 
With both fixes, the production pipeline completes the calibration step on the same 48 GB workstation in minutes rather than OOM-killing. - -## Sparse L0 as a first-class calibrator {#sec-arch-sparse} - -Sparse L0 record selection (via `PolicyEngine/l0-python` with HardConcrete stochastic gates [@louizos2018l0]) is a fully identity-preserving calibrator under the row-set-preservation definition above, and is exposed as `calibration_backend="pe_l0"` alongside the `microcalibrate` chi-squared default. The two are complementary rather than alternative-and-fallback: chi-squared preserves strict positivity at the cost of a larger deployment artifact, while L0 permits zeroed weights in exchange for a dramatically smaller effective working set that can be handled by downstream applications with tight memory budgets (web UIs, small-area point estimates, simulation endpoints running inside a 2 GB container). Both produce outputs readable by `policyengine-us.Microsimulation` without modification. - -An empirical caveat worth flagging: on the same pipeline, aggressive L0 selection (above approximately 90 % sparsity) can drive rare-subpopulation ratios (for example, elderly self-employed, young dividend recipients) to zero because the optimizer trades their retention for aggregate accuracy. Production deployments of the L0 backend should audit rare-cell coverage before shipping; the chi-squared backend provides a safer default when such audits aren't run. - -## Entity-table export {#sec-arch-export} - -The final stage writes a PolicyEngine-US-ingestable HDF5 file with person, household, tax-unit, SPM-unit, family, and marital-unit tables. The exporter preserves the entity identifiers propagated through donor integration and calibration, so the output of a production build is directly readable by `policyengine-us.Microsimulation` without additional harmonization. 
This is a deliberate compatibility choice: the PolicyEngine-US simulator is the downstream consumer, and a `microplex-us` build that cannot be plugged into the incumbent simulator is not a useful cross-section for tax-benefit work. - -# Benchmark methodology {#sec-methods} - -## Data {#sec-methods-data} - -All empirical results use Enhanced CPS 2024 as the evaluation substrate, published by PolicyEngine at `https://huggingface.co/policyengine/policyengine-us-data` as `enhanced_cps_2024.h5`. The HDF5 file stores variables at their native entity level: person-level variables (77,006 rows), household-level variables (29,999 rows), SPM-unit-level (31,330 rows), tax-unit-level (41,448 rows), family-level (one row per family), and marital-unit-level variables. The benchmark harness loads variables into a flat person-level DataFrame by broadcasting non-person entity values to person level via the `person__id` linkage columns. The result is a 77,006 × 50 DataFrame per experimental run. - -## Variable selection {#sec-methods-variables} - -The benchmark uses 14 conditioning variables and 36 synthesizer-target variables. Conditioning variables are person-level demographics and household-context flags (age, sex, Hispanic origin, CPS race category, disability, blindness, military service, full-time college enrollment, separation status, state FIPS, ESI coverage, Marketplace coverage, own children in household, pre-tax retirement contributions). 
Target variables span labor income (employment income, self-employment income), interest and dividends (taxable interest, tax-exempt interest, qualified dividends, non-qualified dividends), capital gains (long-term, short-term), retirement income (pension, IRA distributions, Social Security and its retirement/disability/survivor split), other income (rental, farm, unemployment compensation, alimony, miscellaneous), wealth (bank accounts, bonds, stocks, net worth, auto loan balance), and reported benefit receipts (SNAP, housing assistance, SSI, TANF, disability, workers' compensation, veterans' benefits, child support received and paid, real estate taxes paid, HSA deductions). I emphasize that these are the synthesizer's *target income and benefit variables* — the quantities the synthesizer is asked to reproduce — and not policy outputs such as federal income tax liability, computed EITC amount, or computed SNAP participation. Downstream tax-output validation (running `policyengine-us` on the synthesized frame and comparing computed aggregates against administrative totals) is deferred to a companion paper. - -## Synthesizers evaluated {#sec-methods-synthesizers} - -Four zero-inflated synthesizer families are compared, all implemented in `microplex.eval.benchmark` as subclasses of a `_MultiSourceBase` abstract base class that pools shared conditioning variables across sources and fits one per-target-column model. The zero-inflation variant adds a random-forest classifier predicting $P(y > 0 \mid x)$ when the target's training-set zero fraction exceeds 10 %: - -- **ZI-CART**: synthpop-style classification and regression trees [@nowok2016synthpop]. For each target variable, a `DecisionTreeRegressor` with `min_samples_leaf = 5` is fit on the shared conditioning variables; at generation time, each synthetic record is routed to a leaf via `tree.apply`, and the synthetic value is sampled uniformly from the training-set outcomes that landed in that leaf.
A random-forest zero-classifier is applied on columns with zero fraction above 10 %. -- **ZI-QRF**: quantile regression forests [@meinshausen2006qrf] with 100 trees predicting deciles of the conditional distribution, with a random-forest zero-classifier. -- **ZI-QDNN**: a quantile deep neural network with two hidden layers (width 64), 50 training epochs, batch size 256, predicting decile-level quantiles under pinball loss. -- **ZI-MAF**: a masked autoregressive flow [@xu2019modeling] with four layers and hidden dimension 32, 50 training epochs, batch size 256, and a random-forest zero-classifier. - -All four methods are used at their method-class default hyperparameters unless stated otherwise. A follow-up hyperparameter sweep on ZI-MAF specifically is reported in the results section. - -An isolated per-column evaluation of the zero-classifier alone (logistic regression, histogram gradient boosting, a small MLP, isotonic-calibrated random forest, and the 50-tree random-forest default) shows that on direct classifier-quality measures — held-out log-loss, Brier score, expected calibration error, and ROC-AUC over the 26 ZI-eligible target columns — histogram gradient boosting Pareto-dominates the random-forest default (log-loss 0.225 vs 0.310, ECE 0.005 vs 0.039, AUC 0.809 vs 0.737). PRDC coverage at the synthesizer level, however, is insensitive to the swap (0.7017 for histogram gradient boosting vs 0.7081 for the 50-tree random forest), because error in the downstream QDNN non-zero draw swamps the classifier-level gap. The benchmark numbers reported in @sec-results were generated with the random-forest default for reproducibility with prior artifacts; the `microplex-us` implementation default has since moved to histogram gradient boosting for deployments that surface $\hat{P}(y=0 \mid x)$ as a user-visible diagnostic signal. The full isolated evaluation is recorded in `docs/zi-factorial.md`.
- -## Train/holdout split and PRDC evaluation {#sec-methods-prdc} - -The 77,006-record dataset is split into 61,604 training and 15,402 holdout records at a fixed random seed (42). Each synthesizer is fit on the training partition and generates 61,604 synthetic records. PRDC metrics [@naeem2020prdc] are computed on 15,000 real and 15,000 synthetic records, sub-sampled without replacement from the holdout and synthetic outputs respectively. The PRDC sample cap of 15,000 per side is a memory-budget constraint: the `prdc` library materializes pairwise distance matrices, and capping both sides at 15,000 keeps those matrices within a 48 GB workstation budget. PRDC coverage is computed with $k = 5$ nearest neighbors on standardized feature vectors. - -The sample cap couples metric noise to the split seed, because the PRDC sub-sample is drawn from the same RNG that produced the train/holdout split. Decoupling the two seeds and averaging over multiple PRDC sub-samples would separate metric-noise variance from split variance; this is deferred to a future extension. - -## Rare-cell probes {#sec-methods-rare-cells} - -Four pre-registered rare-cell probes are computed per method as synthetic-count divided by real-count in cells constructed from combinations of target and conditioning variables: (a) elderly self-employed (age ≥ 62 and self-employment income > 0), (b) young dividend recipients (age < 30 and qualified dividend income > 0), (c) SSDI-participating disabled individuals (is_disabled = 1 and Social Security disability income > 0), and (d) top-1 % employment-income earners (employment income ≥ 99th percentile of the holdout distribution). A ratio of 1.0 means the synthesizer preserves the real cell frequency; 0.0 means the synthesizer annihilates the cell; a ratio greater than 1.0 indicates over-representation. 
- -## Per-column zero-rate breakdown {#sec-methods-zero-rate} - -For every target column $c$, I compute the real holdout zero rate $z_c^{\text{real}} = |\{i : y_{i,c}^{\text{real}} = 0\}| / n_{\text{holdout}}$ and the synthetic zero rate $z_c^{\text{synth}}$, and report the scalar mean absolute error $\mathrm{MAE}_z = \frac{1}{|C|} \sum_c |z_c^{\text{real}} - z_c^{\text{synth}}|$ alongside a per-column $(z_c^{\text{real}}, z_c^{\text{synth}}, |z_c^{\text{real}} - z_c^{\text{synth}}|)$ breakdown for diagnostic use. - -## Robustness checks {#sec-methods-robustness} - -Three sensitivity checks follow the headline PRDC evaluation: - -1. **Scale sensitivity**: rerun at 40,000 records (random sub-sample, seed 42). If ordering or absolute values depend on scale, the 77,006-row result is not generalizable. -2. **Learned-embedding PRDC**: fit a 16-dimensional autoencoder on the 15,402-record standardized holdout for 200 epochs (two hidden layers of width 64, mean-squared reconstruction loss), then compute PRDC in the 16-dimensional latent space. If ordering depends on the raw 50-dimensional metric, a less dimension-sensitive embedding should reveal that. -3. **Calibrate-on-synthesizer follow-up**: apply gradient-descent chi-squared calibration to each synthesizer's output, with per-target-column holdout-sum constraints. If the synthesizer's output is structurally close to the holdout distribution, calibration reduces its weighted-aggregate relative error; if the output is structurally broken, calibration cannot close the gap. - -Each of these checks uses the same 77,006-record dataset and seed=42 split; they are complementary rather than statistically independent. A multi-seed replication of ordering stability is a natural next step.
- -## Hyperparameter sensitivity {#sec-methods-tuning} - -Given the wide default-hyperparameter performance gap between ZI-MAF and the other two methods, I ran a four-configuration expansion sweep on ZI-MAF: default (4 layers × 32 hidden × 50 epochs, learning rate 1e-3), wide (4 × 128 × 50, 1e-3), long (4 × 32 × 200, 1e-3), and wide+long (8 × 128 × 200, 5e-4). The wide+long configuration is a 16-fold increase in parameter count and a 4-fold increase in training time relative to default. The sweep is a diagonal slice rather than a full grid, so it cannot rule out that a non-axis-aligned combination dominates; it is designed to characterize how ZI-MAF coverage scales with compute budget rather than to find an optimum. - -## Upstream benchmark correction {#sec-methods-snap} - -During the benchmark, I identified and corrected a noise-injection defect in `microplex.eval.benchmark._MultiSourceBase.generate`. The routine applied Gaussian noise with standard deviation 0.1 to every shared conditioning value before per-column regeneration, which turned binary and categorical conditioning variables into non-integer floats and systematically biased downstream PRDC coverage downward. The correction detects integer-valued training columns by the test $\forall i: |y_i - \mathrm{round}(y_i)| < 10^{-6}$ and skips noise injection for those columns. All numerical results in this paper use the corrected base class; @tbl-prefix reports the pre- vs post-correction comparison. - -# Results {#sec-results} - -## Cross-section synthesizer ordering - -Four synthesizers were evaluated on the 77,006-record, 50-column Enhanced CPS 2024 panel, using a fixed 80/20 train/holdout split (seed 42) and capping PRDC estimation at 15,000 samples per comparison. Headline results are in @tbl-stage1. 
- -| Method | Coverage | Precision | Density | Fit time (s) | Peak RSS (GB) | Zero-rate MAE | -|----------|---------:|----------:|--------:|-------------:|--------------:|--------------:| -| ZI-QRF | **0.931**| **0.907** | **0.879** | 38.4 | 9.6 | **0.013** | -| ZI-CART | 0.908 | 0.897 | 0.840 | **5.2** | **1.3** | **0.013** | -| ZI-QDNN | 0.707 | 0.834 | 0.673 | 99.4 | 11.0 | 0.136 | -| ZI-MAF | 0.093 | 0.030 | 0.022 | 226.0 | 11.0 | 0.081 | - -: Cross-section benchmark results at 77,006 records and 50 variables on Enhanced CPS 2024. PRDC diagnostics are estimated on 15,000 samples per side. All runs share a single 80/20 train/holdout split (seed 42) and use each method class's default hyperparameters. Bold indicates best in column. Peak RSS is peak resident-set memory during fit. Zero-rate MAE is the mean absolute error of column-wise zero proportion between synthetic output and the real holdout. {#tbl-stage1} - -A three-seed replication at seeds 0, 1, and 2 (all other settings identical) gives ZI-QRF mean coverage 0.931 ± 0.002 and ZI-CART mean coverage 0.910 ± 0.002. The 0.021-point gap is approximately ten standard deviations wide, ruling out seed-variance as an explanation. ZI-QRF is genuinely more accurate than ZI-CART on PRDC coverage, but at 7× the fit time and 7× the peak memory. For production use under a compute budget, this trade-off is load-bearing: at full-scale 1.5-million-household microsimulation, ZI-CART's 1.3 GB RSS extrapolates to approximately 30 GB while ZI-QRF extrapolates to above 200 GB (linear extrapolation, upper bound). ZI-CART is the compute-constrained production default; ZI-QRF is the accuracy-maximizing choice when memory and wall time are not binding. 
- -The ordering in @tbl-stage1 is preserved under four complementary sensitivity checks: raw 50-dimensional PRDC at 40,000 records, raw 50-dimensional PRDC at 77,006 records, 16-dimensional learned-autoencoder-embedding PRDC at 40,000 records, and weighted-aggregate relative error under subsequent calibration. ZI-MAF hyperparameter expansion (from 4-layer × 32-hidden × 50 epochs to 8-layer × 128-hidden × 200 epochs, a 14-fold compute budget increase) moves ZI-MAF coverage from 0.026 to 0.033 — a 25 % relative improvement that leaves a tenfold gap to ZI-QRF. - -## Upstream benchmark defect and correction - -I identified a noise-injection defect in `microplex.eval.benchmark._MultiSourceBase.generate` during the course of this work. The routine added σ = 0.1 Gaussian noise to every shared-column value before per-column regeneration, including binary and categorical conditioning variables (for example, sex, military-service, state FIPS, and CPS race indicators). Pre-fix, synthetic values never matched the training pool's discrete support on these variables; per-column zero-rate diagnostics appeared broken for every method simultaneously, because a nominally binary indicator became continuous floats such as `1.04`. The fix detects integer-valued training columns and skips noise injection for them. - -Pre-fix and post-fix PRDC coverage on matched 77,006-record, 50-variable runs are reported in @tbl-prefix. - -| Method | Before correction | After correction | Δ | -|---------|------------------:|-----------------:|---------:| -| ZI-QRF | 0.256 | 0.928 | +0.672 | -| ZI-QDNN | 0.147 | 0.707 | +0.560 | -| ZI-MAF | 0.014 | 0.106 | +0.092 | - -: PRDC coverage before and after correcting the noise-injection defect in `microplex.eval.benchmark._MultiSourceBase.generate`. Before-correction values use σ = 0.1 Gaussian noise applied to all shared-column values, including binary and categorical conditioning variables. 
After-correction values skip noise injection for integer-valued columns. Same 77k × 50 run configuration in both columns. {#tbl-prefix} - -Ordering is invariant across the fix; absolute coverage values are meaningfully higher after correction. Synthesizer benchmarks that used the same `microplex.eval.benchmark` base class before the correction landed should be interpreted as reporting a systematically biased lower bound on PRDC coverage against real data. I merged the fix into the upstream `microplex` repository on 2026-04-17. - -## Rare-cell preservation - -Synthetic-to-real count ratios for the four pre-registered rare-cell probes are reported in @tbl-rare-cells. - -| Method | Elderly self-employed | Young dividend | Disabled SSDI | Top-1 % employment | -|---------|----------------------:|---------------:|--------------:|-------------------:| -| ZI-QRF | **3.2** | **3.9** | **3.3** | **4.0** | -| ZI-QDNN | 79.2 | 3.0 | 3.3 | 4.0 | -| ZI-MAF | 98.9 | 4.0 | 3.2 | 4.0 | - -: Synthetic-count divided by real-count for four pre-registered rare-cell probes on the 77,006-record Enhanced CPS 2024 holdout. A ratio of 1.0 indicates exact preservation; values above 1.0 indicate the synthesizer over-samples the cell; values below 1.0 indicate under-representation. Bold indicates the method closest to 1.0 in each column. {#tbl-rare-cells} - -All three methods over-sample each cell by roughly 3–4 fold, consistent with the synthesizers generating conditional distributions that are broader than the empirical distribution (a characteristic byproduct of the per-column modeling strategy). ZI-QRF is closest to unit preservation across every cell. 
The neural methods have a specific pathology on elderly self-employed — ZI-QDNN at 79× and ZI-MAF at 99× over-sampling — which is almost certainly a zero-inflation-classifier calibration failure on this particular cell (the class has low base rate and the per-column classifier over-predicts non-zero self-employment income conditional on age $\geq 62$). Fixing this would require either a per-cell precision-recall post-hoc calibration on the classifier or a joint zero-mask model over the full target-column set. - -## Calibration on synthesizer output - -Identity-preserving gradient-descent chi-squared calibration was applied to the 36 target-column sums of each synthesizer's output, with holdout totals as the calibration targets. Results after 500 epochs of calibration at learning rate 1e-3 are in @tbl-calibrate. - -| Method | Before calibration (mean rel. err.) | After calibration (mean rel. err.) | -|----------|-----------------------------------:|-----------------------------------:| -| ZI-QRF | 0.317 | **0.105** | -| ZI-QDNN | 0.386 | 0.251 | -| ZI-MAF | 17.51 | 11.86 | - -: Mean relative error of 36 target-column sums against holdout totals before and after 500 epochs of gradient-descent chi-squared calibration on each synthesizer's output. All three calibrations were run with identical hyperparameters (learning rate 1e-3, noise level 0, seed 42). Bold indicates best in column. {#tbl-calibrate} - -Calibration refines structurally sound synthesizer output; it does not rescue a structurally broken one. ZI-MAF's post-calibration error remains over 1100 % of target scale, consistent with its raw outputs falling too far outside target support for weight adjustment to bridge. 
- -# Discussion {#sec-discussion} - -## Why QRF dominance on heavy-tailed conditional distributions is expected {#sec-disc-qrf} - -The empirical finding that ZI-QRF dominates on PRDC coverage at 77,006 records × 50 variables is consistent with the known behavior of quantile regression forests on heavy-tailed conditional distributions. QRF estimates the conditional distribution of $y$ given $x$ non-parametrically by pooling conditional empirical quantiles over the terminal leaves of an ensemble of random trees [@meinshausen2006qrf]. At a terminal leaf, QRF can reproduce the empirical distribution of $y$ exactly — including the rare heavy-tail values — because the model is a mixture over leaf-local histograms rather than a smooth parametric family. - -This is in tension with the way MAF and QDNN approximate heavy-tailed targets. A MAF with log-space preprocessing [@xu2019modeling] maps heavy-tailed positive values through $\log(1 + y)$, which compresses the tail into a bounded regime where the flow's Gaussian base measure can cover it. Log-preprocessing is a reasonable choice for well-behaved right-tails but introduces systematic under-estimation on variables with point masses at extreme values (top-1% income, net worth at SCF-augmented billionaire records). Quantile DNNs under pinball loss approximate decile quantiles with a smooth neural network; the smoothness prior is a regularizer that helps generalization but damages heavy-tail fidelity. - -On Enhanced CPS data specifically, many target variables are heavy-tailed by construction — employment income follows a log-normal with IRS-administrative top-coding, net worth inherits the SCF tail and is further augmented with Forbes records — so the QRF preservation of empirical quantiles is unusually load-bearing. A fair question is whether ZI-QRF's advantage shrinks on data without the extreme tails (for example, on demographics-only benchmarks or on census-data-only targets without the PUF augmentation). 
The benchmark here does not address that question directly; it addresses the question "which method produces better synthetic microdata for US tax-benefit work at production scale," where heavy-tail fidelity is specifically what matters. - -## ZI-MAF's hyperparameter expansion and its limits {#sec-disc-zi-maf} - -The wide+long ZI-MAF configuration uses approximately 16× the parameters and 4× the training time of the default and recovers only 0.033 coverage from 0.026 — a 25 % relative improvement that leaves ZI-QRF's 0.982 essentially unapproachable within the architectural family. Three structural limitations plausibly explain this: - -1. **Per-column independence**. The `ZIMAFMethod` class fits one flow per target column, with no cross-target joint structure. In Enhanced CPS many target columns are correlated (wage income correlates with SE income, 401(k) contributions correlate with wage income, capital gains correlate with dividends). An independent-per-column flow cannot exploit those correlations and therefore produces synthetic records that are marginally plausible but jointly implausible. A joint flow (a single MAF over the entire target-column vector) is architecturally different and may recover the gap. This paper does not test that hypothesis. -2. **Log-then-standardize preprocessing on zero-inflated continuous targets**. The per-column MAF log-transforms positive values with $\log(1 + y)$ and standardizes. Log compression of heavy tails reduces the flow's sensitivity to extreme values; standardization sets a fixed scale that is determined by the non-zero subset. Both choices favor bulk-of-distribution fidelity over tail fidelity. -3. **Zero-inflation handling via an independent RF classifier**. The classifier predicts $P(y > 0 \mid x)$ per column independently. 
If a rare cell has a low conditional base rate that the training data under-represents, the classifier is poorly calibrated on that cell — for elderly self-employed it over-predicts non-zero self-employment income — and the downstream MAF is trained on a biased non-zero subset. This miscalibration is the pattern behind the 99× over-sampling of elderly self-employed in @tbl-rare-cells. - -Fixing any one of these would require architectural changes beyond hyperparameter tuning. The paper's claim is not that MAF-family synthesizers cannot be made competitive — it is that they are not competitive at the default `ZIMAFMethod` implementation and that closing the gap requires a redesign rather than a sweep. - -## PRDC in 50 dimensions and the role of the embedding check {#sec-disc-prdc} - -PRDC coverage uses a $k$-nearest-neighbor ball construction on standardized feature vectors. Beyond approximately 10–15 dimensions, $k$-NN distances concentrate toward their mean and the coverage metric becomes noise-dominated in the sense that identically distributed real and synthetic samples can yield coverage values far from 1.0 [@beyer1999nn; @aggarwal2001surprising]. At 50 dimensions this concern is material. The embedding-PRDC check in @sec-methods-robustness addresses it: if the 50-dimensional PRDC ordering is an artifact of dimensionality concentration, the ordering in the 16-dimensional learned-autoencoder latent space should differ. - -The embedding check preserves ordering exactly (ZI-QRF > ZI-QDNN > ZI-MAF) and ZI-QRF's latent-space coverage (0.984) is essentially identical to its raw-space coverage (0.982), suggesting that the raw-feature result is not a dimensionality artifact. A remaining concern is that the autoencoder is fit on the holdout and could therefore adapt to whatever idiosyncrasies the holdout sample has, potentially favoring methods whose synthetic output matches those idiosyncrasies.
A cleaner test would fit the encoder on train-only or on an independent third partition; a multi-seed check on the holdout-vs-train autoencoder fit is deferred. - -## The calibrate-on-synth finding as practical guidance {#sec-disc-calibrate} - -The calibration-refines-but-does-not-rescue finding (@tbl-calibrate) is a specific claim about a specific pipeline and has practical implications for practitioners. If an organization runs a weak synthesizer and plans to calibrate heavily afterward to hit policy-target aggregates, this paper's evidence suggests the calibrated output will approximate policy aggregates only if the underlying synthesizer was structurally close to the targets in the first place. ZI-QRF starts close (mean relative error 0.317) and calibrates to 0.105; ZI-MAF starts so far off (17.51) that 500 epochs of calibration closes only 32 % of the gap and leaves mean error above 1100 % of target scale. Calibration's role is to refine, not to repair, and organizations should not trust post-calibration aggregates to compensate for low synthesizer fidelity. - -## Runtime and operational considerations {#sec-disc-runtime} - -ZI-QRF runs in 37 seconds and peaks at 6 GB RSS on an Apple M3 with 48 GB RAM; ZI-QDNN in 105 seconds at 11 GB; ZI-MAF in 227 seconds at 11 GB. For an organization iterating on synthesizer choice, the 6× compute gap between ZI-QRF and ZI-MAF is as practically decisive as the coverage gap. ZI-QRF's cost profile also extrapolates cleanly to larger scales without requiring a GPU, which matters for microsim teams without dedicated ML infrastructure. The neural methods' 11 GB memory floor at 77,006 records extrapolates to approximately 220 GB at the production-scale 1.5-million-household frame; fitting either at full scale would require either GPU acceleration, batch-training with careful checkpointing, or a smaller per-column model. 
- -# Limitations {#sec-limits} - -The cross-section benchmark uses PolicyEngine's Enhanced CPS as both the input substrate and the source of held-out evaluation samples; it is not a test of generalization across CPS vintages. The 77k-record scale is one order of magnitude below production-scale local-area microdata (~1.5M households). The $k$-nearest-neighbor distances underlying PRDC coverage are known to concentrate in 50 dimensions; I report robustness to a learned-embedding variant but do not establish invariance to all reasonable metric choices. ZI-MAF and ZI-QDNN hyperparameters were fixed to method-class defaults, apart from one follow-up expansion sweep on ZI-MAF that did not close the gap; a full NAS-style search could find better configurations than the ones I tested. Longitudinal accuracy claims are architectural rather than empirical in this paper; the evaluation of identity-preserving calibration across simulated years is deferred to a companion paper. - -# Code and data availability {#sec-availability} - -All code is open-source under the MIT license at `https://github.com/PolicyEngine/microplex-us` (commit hash of the submitted version will be noted in the camera-ready). The benchmark harness, scripts, and Quarto source for this paper are in that repository. Supporting infrastructure in `microplex` core (`https://github.com/PolicyEngine/microplex`) is also open-source. - -The Enhanced CPS 2024 dataset used as the evaluation substrate is the `enhanced_cps_2024.h5` HDF5 file published by PolicyEngine on Hugging Face (`https://huggingface.co/policyengine/policyengine-us-data`). The file is freely downloadable without credentials and is ~43 MB on disk. The specific revision used for all benchmarks in this paper will be pinned to a Hugging Face dataset revision hash or mirrored to Zenodo in the camera-ready version. - -Rebuilding Enhanced CPS from scratch requires IRS PUF access, which is gated by data-use agreements; I do not reproduce this upstream construction in this paper.
A third party with the published HDF5 can reproduce every numerical result in the paper without additional data-access credentials. - -Reproduction environment for the results reported here: Python 3.14.0, macOS 14 (Darwin 25.3.0) on an Apple M3 with 48 GB unified memory. The benchmark harness is CPU-only (no GPU required); full stage-1 run at 77k × 50 scale across three methods completes in approximately six minutes. The `uv.lock` file pins all dependencies. - -# Disclosures {#sec-disclosures} - -I founded PolicyEngine, the non-profit organization that publishes the Enhanced CPS 2024 data product this paper uses as an evaluation substrate, and previously led the work reported in @ghenis2024ecps. The present research is conducted at PolicyEngine. PolicyEngine's Enhanced CPS is cited throughout as the incumbent public tool against which `microplex-us` is measured. I have no other competing interests to disclose. - -# Conclusion {#sec-conclusion} - -`microplex-us` is a spec-driven alternative to legacy construction pipelines for US tax-benefit microdata, built from four decisions that matter independently: donor-block specifications separated from imputer-backend implementation, chained quantile-regression-forest imputation across heterogeneous administrative and survey sources, identity-preserving gradient-descent chi-squared calibration as the production default, and sparse L0 record selection reserved for deployment subsampling rather than as a calibration mainline. None of the underlying mechanisms is foundationally new. What is new is the composition and the empirical evidence that follows from it. - -At 77,006 Enhanced CPS 2024 records across 50 target income and benefit variables, ZI-QRF dominates ZI-QDNN and ZI-MAF on PRDC coverage (0.928 vs. 0.707 vs. 0.106), at roughly $\frac{1}{6}$ the compute budget, with ordering preserved across three complementary sensitivity checks and across a hyperparameter expansion sweep on ZI-MAF. 
The result is consistent with QRF's known empirical-quantile fidelity on heavy-tailed conditional distributions, which is exactly the distributional structure tax microdata has. Practitioners choosing a synthesizer for US tax-benefit work at this scale have a clear default based on this evidence. - -The paper also documents a noise-injection defect in the upstream `microplex.eval.benchmark` base class and publishes corrected results. Benchmark numbers produced with the uncorrected base class before 2026-04-17 should be treated as lower bounds on PRDC coverage against real data. - -The evaluation is cross-sectional; longitudinal claims are architectural rather than empirical. The natural next step is to test identity-preserving calibration across simulated years using a matched longitudinal benchmark, and to extend the target-variable set to include downstream policy outputs (computed federal and state income tax liabilities, EITC and CTC disbursed amounts, SNAP and SSI program-rule-derived amounts) rather than the CPS-reported input variables benchmarked here. Both extensions are underway in companion work. - -# Acknowledgments {-} - -The empirical work benefited from access to public data products maintained by the US Census Bureau (CPS ASEC, ACS), the Internal Revenue Service (Statistics of Income Public Use File), the Federal Reserve Board (SCF), and the Social Security Administration (SIPP). Specific data loading and entity-table construction reference code from the open-source `policyengine-us-data` project is cited in the methods section where used; this paper is independent research not conducted in collaboration with PolicyEngine. 
- -# References {-} diff --git a/paper/literature-review.qmd b/paper/literature-review.qmd deleted file mode 100644 index 4bd269e4..00000000 --- a/paper/literature-review.qmd +++ /dev/null @@ -1,119 +0,0 @@ ---- -title: "Literature review for `microplex-us`" -author: - - name: Max Ghenis - affiliation: PolicyEngine - email: max@policyengine.org -date: last-modified -bibliography: references.bib -format: - html: - toc: true - toc-depth: 3 - number-sections: true ---- - -This document surveys the literature that frames `microplex-us`'s contributions. It is written to be cited by the main paper, and to be useful as a standalone reading map. Sections follow the four research threads the project sits across: synthetic tabular data, survey calibration, evaluation metrics, and US tax microsimulation. - -## Synthetic tabular data: methods and benchmarks - -### Generator lineage - -The modern tabular-synthesis literature starts with the Synthetic Data Vault (@patki2016sdv) and copula-based generators, then moves to `synthpop` (@nowok2016synthpop), which establishes the CART-based sequential approach that has proven surprisingly durable. Deep-generative methods arrive with CTGAN and TVAE (@xu2019modeling), which remain the most-cited baseline neural synthesizers. Diffusion enters tabular synthesis with TabDDPM (@kotelnikov2023tabddpm). Language-model-based synthesis emerges with GReaT (@borisov2023great) and REaLTabFormer (@solatorio2023realtabformer). TabSyn (@zhang2024tabsyn) runs score-based diffusion in a learned latent space and reports competitive benchmark performance. Foundation-model approaches for tabular data now include TabPFN-v2 (@hollmann2025tabpfn), whose primary contribution is prediction rather than synthesis but which spawned a synthesis variant (TabPFGen) that has not yet appeared in a peer-reviewed venue. - -### Benchmark frameworks - -Two benchmarking frameworks now dominate: `Synthcity` (@qian2023synthcity) and SDMetrics.
Benchmarks aggregate three metric families: - -- Statistical fidelity: column-wise Kolmogorov-Smirnov and total-variation distances, pairwise correlation differences. -- Sample-level / support-based: Precision, Recall, Density, Coverage (PRDC; @naeem2020prdc), and the sample-level α-precision and β-recall of @alaa2022precision. -- Downstream utility: Train-on-Synthetic / Test-on-Real (TSTR), typically with a boosted-tree classifier or regressor on held-out real data. - -### Tabular synth on US economic microdata - -Published head-to-head benchmarks on real US tax or income microdata are scarce. @little2025synth compares synthpop, DataSynthesizer, CTGAN, and TVAE on census microdata in four countries and finds CART-based synthpop dominates utility, with CTGAN/TVAE substantially weaker on pairwise dependence. @bowen2022puf document a synthetic supplemental PUF built on IRS Statistics of Income data using sequential CART, framed as a privacy-preserving release for restricted data. - -No published head-to-head comparison of quantile regression forests (QRF; @meinshausen2006qrf) or ZI-QRF against modern deep generators (CTGAN, TabDDPM, GReaT, TabSyn) on real US income microdata appears to exist. This is the gap the cross-section benchmark in this paper fills. - -### Known scaling failure modes - -@kotelnikov2023tabddpm report stable performance up to ~100 features but do not publish a clean scaling ablation. Published survey work (including @drechsler2024synthetic) notes that GANs exhibit mode collapse on high-cardinality categoricals, that CTGAN/TVAE degrade on skewed long-tail continuous variables, and that one-hot encoding multiplies the effective dimensionality for wide categorical schemas. TabPFN-v2 has a native cap at 500 features. The PUF has 179 real columns — near or above the comfort zones of several methods. 
- -## Survey calibration: classical lineage and modern extensions - -### Canonical calibration - -The foundational paper is @deville1992calibration, which defines the calibration estimator as a constrained weight adjustment minimizing a distance function from design weights subject to linear moment constraints. The generalized raking extension in @deville1993raking handles categorical margins via iterative proportional fitting [@deming1940adjustment]. Modern practice extends this to range-restricted variants (bounded, logit, truncated-linear distance functions) which guarantee positive weights on every retained record — the property labeled *identity preservation* in the main paper. @devaud2019calibration provides the most current treatment of existence and feasibility conditions. Reviews by @haziza2017weights and @kott2016calibration map the current landscape. - -A related line is entropy balancing (@hainmueller2012entropy), which is mathematically close to calibration with a Kullback-Leibler distance and moment constraints. Entropy-balanced weights are always positive. - -### Sparse / L0 calibration - -L0 regularization entered machine learning via hard-concrete stochastic gates [@louizos2018l0], which made L0 differentiable and therefore compatible with gradient-based optimization. Applying this to survey calibration — effectively using L0 to select a sparse subset of records that hits a target set — is the mechanism implemented in the open-source PolicyEngine L0 package and its dependents. I could not locate an earlier paper formally treating L0-regularized survey calibration as a survey-statistics contribution. The technique's provenance is the deep-learning pruning literature; its application to microsim calibration appears to be novel to the PolicyEngine ecosystem. - -### Identity preservation as an under-named requirement - -"Identity-preserving calibration" is not a term of art in the survey statistics literature. 
The closest named property is "range-restricted calibration with positive lower bound" (e.g., logit or truncated-linear distance functions per @deville1992calibration). In longitudinal microsim, identity is implicit: DYNASIM3 [@favreault2004dynasim], MINT [@smith2013mint], and CBOLT [@cbo2018cbolt] all use dynamic-ageing or static-ageing with alignment to external totals, never dropping records. LIAM2 [@dementen2014liam2] similarly keeps full population records. The main paper argues for explicit recognition of identity preservation as an architectural requirement at the cross-sectional imputation and calibration layer, rather than as an implicit consequence of a particular ageing strategy, because the cross-sectional artifact is the input substrate to longitudinal simulation. - -### Chained multi-source QRF imputation - -The chained-equations framework for imputation is canonical MICE [@vanbuuren2011mice]. Extending it to use random forests as the per-variable draw model is explored in @doove2014chainedrf; related tools include `missForest` [@stekhoven2012missforest]. Using QRF specifically [@meinshausen2006qrf] for the per-variable draw in a chained microdata synthesis / imputation pipeline — where each stage feeds the next stage's conditioning set — is a natural combination of published components, but no single paper appears to name it as a method in its own right. It is best understood as a novel application of existing primitives rather than a fundamentally new algorithm. - -## Evaluation metrics: what works for tabular microdata - -### PRDC and its limitations - -@naeem2020prdc established precision/recall/density/coverage as the support-based quality quad, originally for image generators evaluated in Inception-embedding space. The approach is now widely applied to tabular data in raw-feature or standardized-feature space. - -Two documented failure modes matter in the present setting: - -1. 
**Outlier inflation of density and coverage.** @park2023probabilistic show that kNN-based support estimation is unreliable in the presence of outliers because the support manifold over-inflates around them. Income microdata with heavy tails (top-1 % employment income, net worth) is exactly the regime where this matters. -2. **High-dimensional concentration of distances.** @beyer1999nn and @aggarwal2001surprising demonstrate that in high-dimensional spaces, the ratio of maximum to minimum k-NN distance collapses toward 1, making nearest-neighbor-based metrics increasingly noise-dominated. The effect starts becoming non-trivial around 10–15 dimensions and is well-established by 50. - -These critiques motivate (a) reporting multiple metrics alongside PRDC rather than PRDC alone, and (b) testing whether PRDC orderings survive dimensionality reduction. - -### Alternatives - -@alaa2022precision introduce sample-level α-precision, β-recall, and authenticity, which are less fragile under outliers. TSTR is now the dominant primary metric in benchmark papers including @kotelnikov2023tabddpm and @zhang2024tabsyn. Detection-based metrics (classifier two-sample tests) are common; privacy metrics including distance-to-closest-record and membership-inference attacks form a parallel axis. - -### Rare-subpopulation preservation - -No canonical metric exists for rare-subgroup preservation. @stadler2022groundhog document that synthesizers systematically drop outlier records under differential privacy, with implications for minority-cell representation. Sub-group TSTR or conditional-marginal TV distance are the field's current ad-hoc solutions. A principled metric appears to remain an open problem. - -## US tax-benefit microsimulation - -### The ecosystem - -@toder2024microsim is the current umbrella review. Active US tax microsimulation models include: - -- TAXSIM (@feenberg1993taxsim), NBER, the long-standing public tool. -- Tax-Calculator / PSL Models (@debacker2019taxcalc). 
-- The Urban-Brookings Tax Policy Center microsimulation model. -- CBO's tax microsimulation (@cbo2018taxmodel). -- The Budget Lab at Yale (active since 2024). -- PolicyEngine-US-Data (Enhanced CPS), first published as @ghenis2024ecps. - -Each ships with its own approach to augmenting Census data with tax-administrative detail. @bowen2022puf is the current reference point for synthetic PUF methodology at IRS SOI; the technique is sequential CART with privacy-motivated noise. - -### Longitudinal models - -DYNASIM3 (@favreault2004dynasim), MINT (@smith2013mint), and CBOLT (@cbo2018cbolt) are the three long-running US longitudinal microsims; all are government-linked and use static-ageing with external alignment. The international family (LIAM2 and MIDAS; @dementen2014liam2, with survey in @odonoghue2001dynamicsurvey) provides the open-source reference implementations. - -### Top-income augmentation precedents - -Augmenting Survey of Consumer Finances data with Forbes-style top-wealth records is established practice in distributional national accounts [@piketty2018dina; @saez2016wealth]. Porting this augmentation pattern into a tax microsimulation dataset is, as far as I can tell, novel to the PolicyEngine-US-Data lineage; `microplex-us` adopts the approach without methodological innovation. - -### Small-area estimation - -@fay1979herriot is the foundational paper for area-level small-area estimation; @rao2015sae is the modern textbook reference. Applications to tax microdata at the county / congressional-district scale remain a research frontier — IRS SOI publishes direct rather than smoothed estimates, and the Fay-Herriot framework has not been formally ported into a published tax microsimulation pipeline. - -## Synthesis - -The `microplex-us` project contributes in four places where the literature is thin: - -1. A head-to-head comparison of QRF-family and neural synthesizers on real US tax microdata at realistic scale. 
No prior published work covers this cell directly. -2. An explicit formulation of identity preservation as an architectural requirement for cross-section-to-longitudinal pipelines, with concrete implementation via `microcalibrate`-style gradient-descent chi-squared calibration. -3. A composition of chained QRF imputation with `microcalibrate` calibration that has no single-paper precedent, though each component is published. -4. A spec-driven donor integration runtime that explicitly separates donor-block contracts from backend implementation. - -The main paper reports empirical results supporting (1) and documents the architectural and software design behind (2)–(4). This paper does not claim foundational methodological novelty; it claims that the composition and the empirical finding together advance the state of practice for US tax-benefit microdata construction. diff --git a/paper/references.bib b/paper/references.bib deleted file mode 100644 index e35f3aa5..00000000 --- a/paper/references.bib +++ /dev/null @@ -1,509 +0,0 @@ -% ----------------------------------------------------------------------------- -% Core references — synthetic tabular data synthesis & evaluation -% ----------------------------------------------------------------------------- - -@inproceedings{patki2016sdv, - title = {The Synthetic Data Vault}, - author = {Patki, Neha and Wedge, Roy and Veeramachaneni, Kalyan}, - booktitle = {2016 IEEE International Conference on Data Science and Advanced Analytics (DSAA)}, - year = {2016}, - url = {https://dspace.mit.edu/handle/1721.1/109616} -} - -@article{nowok2016synthpop, - title = {synthpop: Bespoke Creation of Synthetic Data in {R}}, - author = {Nowok, Beata and Raab, Gillian M. 
and Dibben, Chris}, - journal = {Journal of Statistical Software}, - volume = {74}, - number = {11}, - year = {2016}, - doi = {10.18637/jss.v074.i11} -} - -@inproceedings{xu2019modeling, - title = {Modeling Tabular Data using Conditional {GAN}}, - author = {Xu, Lei and Skoularidou, Maria and Cuesta-Infante, Alfredo and - Veeramachaneni, Kalyan}, - booktitle = {Advances in Neural Information Processing Systems}, - volume = {32}, - year = {2019}, - eprint = {1907.00503}, - archivePrefix = {arXiv} -} - -@inproceedings{naeem2020prdc, - title = {Reliable Fidelity and Diversity Metrics for Generative Models}, - author = {Naeem, Muhammad Ferjad and Oh, Seong Joon and Uh, Youngjung and - Choi, Yunjey and Yoo, Jaejun}, - booktitle = {International Conference on Machine Learning}, - year = {2020}, - eprint = {2002.09797}, - archivePrefix = {arXiv} -} - -@inproceedings{kotelnikov2023tabddpm, - title = {{TabDDPM}: Modelling Tabular Data with Diffusion Models}, - author = {Kotelnikov, Akim and Baranchuk, Dmitry and Rubachev, Ivan and - Babenko, Artem}, - booktitle = {International Conference on Machine Learning}, - year = {2023}, - eprint = {2209.15421}, - archivePrefix = {arXiv} -} - -@inproceedings{borisov2023great, - title = {Language Models are Realistic Tabular Data Generators}, - author = {Borisov, Vadim and Sessler, Kathrin and Leemann, Tobias and - Pawelczyk, Martin and Kasneci, Gjergji}, - booktitle = {International Conference on Learning Representations}, - year = {2023}, - eprint = {2210.06280}, - archivePrefix = {arXiv} -} - -@article{solatorio2023realtabformer, - title = {{REaLTabFormer}: Generating Realistic Relational and Tabular Data - using Transformers}, - author = {Solatorio, Aivin V. 
and Dupriez, Olivier}, - journal = {arXiv preprint}, - year = {2023}, - eprint = {2302.02041} -} - -@inproceedings{qian2023synthcity, - title = {Synthcity: a Benchmark Framework for Diverse Use Cases of Tabular - Synthetic Data}, - author = {Qian, Zhaozhi and Davis, Rob and van der Schaar, Mihaela}, - booktitle = {Advances in Neural Information Processing Systems (Datasets and - Benchmarks Track)}, - year = {2023}, - url = {https://proceedings.neurips.cc/paper_files/paper/2023/hash/09723c9f291f6056fd1885081859c186-Abstract-Datasets_and_Benchmarks.html} -} - -@inproceedings{zhang2024tabsyn, - title = {Mixed-Type Tabular Data Synthesis with Score-based Diffusion in - Latent Space}, - author = {Zhang, Hengrui and Zhang, Jiani and Srinivasan, Balasubramaniam - and Shen, Zhengyuan and Qin, Xiao and Faloutsos, Christos and - Rangwala, Huzefa and Karypis, George}, - booktitle = {International Conference on Learning Representations}, - year = {2024}, - eprint = {2310.09656}, - archivePrefix = {arXiv} -} - -@article{hollmann2025tabpfn, - title = {Accurate predictions on small data with a tabular foundation model}, - author = {Hollmann, Noah and M{\"u}ller, Samuel and Purucker, Lennart and - Krishnakumar, Arjun and K{\"o}rfer, Max and Hoo, Shi Bin and - Schirrmeister, Robin Tibor and Hutter, Frank}, - journal = {Nature}, - volume = {637}, - number = {8045}, - year = {2025}, - doi = {10.1038/s41586-024-08328-6} -} - -@inproceedings{alaa2022precision, - title = {How Faithful is your Synthetic Data? 
Sample-level Metrics for - Evaluating and Auditing Generative Models}, - author = {Alaa, Ahmed and van Breugel, Boris and Saveliev, Evgeny and - van der Schaar, Mihaela}, - booktitle = {International Conference on Machine Learning}, - year = {2022}, - eprint = {2102.08921}, - archivePrefix = {arXiv} -} - -@inproceedings{park2023probabilistic, - title = {Probabilistic Precision and Recall Towards Reliable Evaluation of - Generative Models}, - author = {Park, Jaehyun and Kim, Sangyeong}, - booktitle = {International Conference on Computer Vision}, - year = {2023}, - eprint = {2309.01590}, - archivePrefix = {arXiv} -} - -% ----------------------------------------------------------------------------- -% High-dimensional k-NN critique -% ----------------------------------------------------------------------------- - -@inproceedings{beyer1999nn, - title = {When Is "Nearest Neighbor" Meaningful?}, - author = {Beyer, Kevin S. and Goldstein, Jonathan and Ramakrishnan, Raghu - and Shaft, Uri}, - booktitle = {International Conference on Database Theory (ICDT)}, - year = {1999}, - doi = {10.1007/3-540-49257-7_15} -} - -@inproceedings{aggarwal2001surprising, - title = {On the Surprising Behavior of Distance Metrics in High - Dimensional Space}, - author = {Aggarwal, Charu C. 
and Hinneburg, Alexander and Keim, Daniel A.}, - booktitle = {International Conference on Database Theory (ICDT)}, - year = {2001}, - doi = {10.1007/3-540-44503-X_27} -} - -% ----------------------------------------------------------------------------- -% Quantile regression forests -% ----------------------------------------------------------------------------- - -@article{meinshausen2006qrf, - title = {Quantile Regression Forests}, - author = {Meinshausen, Nicolai}, - journal = {Journal of Machine Learning Research}, - volume = {7}, - year = {2006}, - pages = {983--999} -} - -% ----------------------------------------------------------------------------- -% Survey calibration — classical and modern -% ----------------------------------------------------------------------------- - -@article{deville1992calibration, - title = {Calibration Estimators in Survey Sampling}, - author = {Deville, Jean-Claude and S{\"a}rndal, Carl-Erik}, - journal = {Journal of the American Statistical Association}, - volume = {87}, - number = {418}, - year = {1992}, - pages = {376--382}, - doi = {10.1080/01621459.1992.10475217} -} - -@article{deville1993raking, - title = {Generalized Raking Procedures in Survey Sampling}, - author = {Deville, Jean-Claude and S{\"a}rndal, Carl-Erik and Sautory, Olivier}, - journal = {Journal of the American Statistical Association}, - volume = {88}, - number = {423}, - year = {1993}, - pages = {1013--1020}, - doi = {10.1080/01621459.1993.10476369} -} - -@article{hainmueller2012entropy, - title = {Entropy Balancing for Causal Effects: A Multivariate Reweighting - Method to Produce Balanced Samples in Observational Studies}, - author = {Hainmueller, Jens}, - journal = {Political Analysis}, - volume = {20}, - number = {1}, - year = {2012}, - pages = {25--46}, - doi = {10.1093/pan/mpr025} -} - -@article{devaud2019calibration, - title = {{Deville and Särndal's} calibration: revisiting a 25-years-old - successful optimization problem}, - author = {Devaud, David 
and Till{\'e}, Yves}, - journal = {TEST}, - volume = {28}, - number = {4}, - year = {2019}, - pages = {1033--1065}, - doi = {10.1007/s11749-019-00681-3} -} - -@article{haziza2017weights, - title = {Construction of Weights in Surveys: A Review}, - author = {Haziza, David and Beaumont, Jean-Fran{\c{c}}ois}, - journal = {Statistical Science}, - volume = {32}, - number = {2}, - year = {2017}, - pages = {206--226}, - doi = {10.1214/16-STS608} -} - -@article{kott2016calibration, - title = {Calibration Weighting in Survey Sampling}, - author = {Kott, Phillip S.}, - journal = {WIREs Computational Statistics}, - volume = {8}, - number = {1}, - year = {2016}, - doi = {10.1002/wics.1374} -} - -@article{deming1940adjustment, - title = {On a Least Squares Adjustment of a Sampled Frequency Table When - the Expected Marginal Totals Are Known}, - author = {Deming, W. Edwards and Stephan, Frederick F.}, - journal = {The Annals of Mathematical Statistics}, - volume = {11}, - number = {4}, - year = {1940}, - pages = {427--444} -} - -% ----------------------------------------------------------------------------- -% L0 regularization & sparse calibration -% ----------------------------------------------------------------------------- - -@inproceedings{louizos2018l0, - title = {Learning Sparse Neural Networks through {$L_0$} Regularization}, - author = {Louizos, Christos and Welling, Max and Kingma, Diederik P.}, - booktitle = {International Conference on Learning Representations}, - year = {2018}, - eprint = {1712.01312}, - archivePrefix = {arXiv} -} - -% ----------------------------------------------------------------------------- -% Statistical matching & chained imputation -% ----------------------------------------------------------------------------- - -@article{vanbuuren2011mice, - title = {{MICE}: Multivariate Imputation by Chained Equations in {R}}, - author = {van Buuren, Stef and Groothuis-Oudshoorn, Karin}, - journal = {Journal of Statistical Software}, - volume = {45}, - 
number = {3}, - year = {2011}, - doi = {10.18637/jss.v045.i03} -} - -@article{doove2014chainedrf, - title = {Recursive partitioning for missing data imputation in the presence - of interaction effects}, - author = {Doove, Lisa L. and van Buuren, Stef and Dusseldorp, Elise}, - journal = {Computational Statistics \& Data Analysis}, - volume = {72}, - year = {2014}, - doi = {10.1016/j.csda.2013.10.025} -} - -@article{stekhoven2012missforest, - title = {{MissForest} --- non-parametric missing value imputation for - mixed-type data}, - author = {Stekhoven, Daniel J. and B{\"u}hlmann, Peter}, - journal = {Bioinformatics}, - volume = {28}, - number = {1}, - year = {2012}, - doi = {10.1093/bioinformatics/btr597} -} - -% ----------------------------------------------------------------------------- -% US tax microsimulation ecosystem -% ----------------------------------------------------------------------------- - -@article{feenberg1993taxsim, - title = {An Introduction to the {TAXSIM} Model}, - author = {Feenberg, Daniel R. and Coutts, Elisabeth}, - journal = {Journal of Policy Analysis and Management}, - volume = {12}, - number = {1}, - year = {1993}, - pages = {189--194}, - doi = {10.2307/3325474} -} - -@article{debacker2019taxcalc, - title = {Integrating Microsimulation Models of Tax Policy into a {DGE} - Macroeconomic Model}, - author = {DeBacker, Jason and Evans, Richard W. 
and Phillips, Kerk L.}, - journal = {Public Finance Review}, - volume = {47}, - number = {2}, - year = {2019}, - pages = {207--275}, - doi = {10.1177/1091142117721638} -} - -@techreport{cbo2018taxmodel, - title = {An Overview of {CBO}'s Microsimulation Tax Model}, - author = {Harris, Ed}, - institution = {Congressional Budget Office}, - number = {54096}, - year = {2018}, - url = {https://www.cbo.gov/publication/54096} -} - -@article{toder2024microsim, - title = {The Use of Microsimulation Models to Inform {US} Tax Policymaking}, - author = {Toder, Eric}, - journal = {International Journal of Microsimulation}, - volume = {17}, - number = {3}, - year = {2024}, - pages = {1--20}, - doi = {10.34196/ijm.00314} -} - -@article{bowen2022puf, - title = {Synthetic Individual Income Tax Data: Promises and Challenges}, - author = {Bowen, Claire McKay and Bryant, Victoria and Burman, Leonard and - Khitatrakun, Surachai and McClelland, Robert and Stallworth, Philip - and Ueyama, Kyle and Williams, Aaron R.}, - journal = {National Tax Journal}, - volume = {75}, - number = {4}, - year = {2022}, - pages = {767--790}, - doi = {10.1086/722094} -} - -@misc{ghenis2024ecps, - title = {{PolicyEngine's} Enhanced Current Population Survey for - Tax-Benefit Microsimulation}, - author = {Ghenis, Max and Woodruff, Nikhil}, - howpublished = {117th Annual Conference on Taxation, National Tax Association, - Detroit, MI}, - year = {2024}, - note = {Session: Advances in Using Administrative Data to Measure - Income Distributions and the Effects of Tax Policies}, - url = {https://www.policyengine.org/us/research/nta-2024} -} - -% ----------------------------------------------------------------------------- -% Longitudinal microsimulation -% ----------------------------------------------------------------------------- - -@techreport{favreault2004dynasim, - title = {A Primer on the Dynamic Simulation of Income Model - ({DYNASIM3})}, - author = {Favreault, Melissa M. 
and Smith, Karen E.}, - institution = {Urban Institute Retirement Project}, - year = {2004}, - type = {Discussion Paper} -} - -@techreport{smith2013mint, - title = {A Primer on Modeling Income in the Near Term, Version 7 - ({MINT7})}, - author = {Smith, Karen E. and Favreault, Melissa M.}, - institution = {Urban Institute for Social Security Administration}, - year = {2013} -} - -@techreport{cbo2018cbolt, - title = {An Overview of {CBOLT}: The {Congressional Budget Office} - Long-Term Model}, - author = {{Congressional Budget Office}}, - institution = {Congressional Budget Office}, - number = {53667}, - year = {2018}, - url = {https://www.cbo.gov/publication/53667} -} - -@article{dementen2014liam2, - title = {{LIAM2}: A New Open Source Development Tool for Discrete-Time - Dynamic Microsimulation Models}, - author = {de Menten, Gaetan and Dekkers, Gijs and Bryon, Geert and - Liegeois, Philippe and O'Donoghue, Cathal}, - journal = {Journal of Artificial Societies and Social Simulation}, - volume = {17}, - number = {3}, - year = {2014}, - pages = {9}, - doi = {10.18564/jasss.2574} -} - -@article{odonoghue2001dynamicsurvey, - title = {Dynamic Microsimulation: A Methodological Survey}, - author = {O'Donoghue, Cathal}, - journal = {Brazilian Electronic Journal of Economics}, - volume = {4}, - number = {2}, - year = {2001} -} - -% ----------------------------------------------------------------------------- -% Distributional national accounts — Forbes / billionaire augmentation precedents -% ----------------------------------------------------------------------------- - -@article{piketty2018dina, - title = {Distributional National Accounts: Methods and Estimates for the - {United States}}, - author = {Piketty, Thomas and Saez, Emmanuel and Zucman, Gabriel}, - journal = {Quarterly Journal of Economics}, - volume = {133}, - number = {2}, - year = {2018}, - pages = {553--609}, - doi = {10.1093/qje/qjx043} -} - -@article{saez2016wealth, - title = {Wealth Inequality in the 
{United States} since {1913}: Evidence - from Capitalized Income Tax Data}, - author = {Saez, Emmanuel and Zucman, Gabriel}, - journal = {Quarterly Journal of Economics}, - volume = {131}, - number = {2}, - year = {2016}, - pages = {519--578}, - doi = {10.1093/qje/qjw004} -} - -% ----------------------------------------------------------------------------- -% Small-area estimation -% ----------------------------------------------------------------------------- - -@article{fay1979herriot, - title = {Estimates of Income for Small Places: An Application of - {James-Stein} Procedures to Census Data}, - author = {Fay, Robert E. and Herriot, Roger A.}, - journal = {Journal of the American Statistical Association}, - volume = {74}, - number = {366a}, - year = {1979}, - pages = {269--277}, - doi = {10.1080/01621459.1979.10482505} -} - -@book{rao2015sae, - title = {Small Area Estimation}, - author = {Rao, J. N. K. and Molina, Isabel}, - year = {2015}, - edition = {2}, - publisher = {Wiley} -} - -% ----------------------------------------------------------------------------- -% Synthetic data meta — review and critique -% ----------------------------------------------------------------------------- - -@article{drechsler2024synthetic, - title = {30 Years of Synthetic Data}, - author = {Drechsler, J{\"o}rg and Haensch, Anna-Carolina}, - journal = {Statistical Science}, - year = {2024} -} - -@article{ruggles2025synth, - title = {The shortcomings of synthetic census microdata}, - author = {Ruggles, Steven}, - journal = {Proceedings of the National Academy of Sciences}, - volume = {122}, - number = {11}, - year = {2025}, - doi = {10.1073/pnas.2424655122} -} - -@article{little2025synth, - title = {Synthetic Census Microdata Generation: A Comparative Study of - Synthesizers and Assessment of Disclosure Risk and Utility}, - author = {Little, Claire and Allmendinger, Richard and Elliot, Mark}, - journal = {Journal of Official Statistics}, - year = {2025}, - doi = 
{10.1177/0282423X241266523} -} - -% ----------------------------------------------------------------------------- -% Privacy / record-level fidelity -% ----------------------------------------------------------------------------- - -@inproceedings{stadler2022groundhog, - title = {Synthetic Data -- Anonymisation {Groundhog} Day}, - author = {Stadler, Theresa and Oprisanu, Bristena and Troncoso, Carmela}, - booktitle = {{USENIX} Security Symposium}, - year = {2022} -} diff --git a/pyproject.toml b/pyproject.toml index 3324f431..4a6e6dcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "microplex-us" version = "0.2.0" -description = "US-specific adapters, pipelines, and PolicyEngine integration for microplex" +description = "US declarative content package for Microplex" readme = "README.md" license = "MIT" authors = [ @@ -13,98 +13,14 @@ authors = [ ] requires-python = ">=3.13" dependencies = [ - "microplex[calibrate] @ git+https://github.com/PolicyEngine/microplex.git@490c717b36a5ef1721b01b7dceaddbc0372c6a0a", - "duckdb>=1.2", - "h5py>=3.10", - "requests>=2.31", -] - -[project.optional-dependencies] -dev = [ - "pytest>=7.0", - "ruff>=0.1", -] -docs = [ - "jupyter-book>=0.15,<0.16", - "standard-imghdr>=3.13; python_version >= '3.13'", -] -r2 = [ - "boto3>=1.34", -] -hf = [ - "huggingface_hub>=0.24", -] -policyengine = [ - "microimpute @ git+https://github.com/PolicyEngine/microimpute.git@90be828eb442c48ee86bb91bb83a75da4b0f0f89 ; python_full_version >= '3.12' and python_full_version < '3.15'", - "policyengine-us==1.715.2; python_version >= '3.11' and python_version < '3.15'", - "spm-calculator>=0.3.1", - # Standalone tax-unit construction engine (the extraction of eCPS's - # tax-unit logic), used by the PolicyEngine pipeline to reconstruct tax - # units from CPS-like person frames (issue #113). 
- "microunit>=0.1.0", + "microplex @ git+https://github.com/PolicyEngine/microplex.git@5a1ea5e107334f45e850678774efc0c613dce250", ] [project.urls] Repository = "https://github.com/PolicyEngine/microplex-us" -[project.scripts] -microplex-us-arch-target-coverage = "microplex_us.targets.arch:main_coverage" -microplex-us-arch-target-gaps = "microplex_us.targets.arch:main_gaps" -microplex-us-arch-target-parity = "microplex_us.targets.arch:main_parity" -microplex-us-arch-target-refresh = "microplex_us.targets.arch:main_refresh" -microplex-us-arch-target-smoke = "microplex_us.targets.arch:main_smoke" -microplex-us-build-aca-ptc-multipliers = "microplex_us.targets.aca_ptc:main" -microplex-us-backfill-pe-native-audit = "microplex_us.pipelines.backfill_pe_native_audit:main" -microplex-us-backfill-pe-native-scores = "microplex_us.pipelines.backfill_pe_native_scores:main" -microplex-us-check-export-columns = "microplex_us.pipelines.check_export_columns:main" -microplex-us-check-site-snapshot = "microplex_us.pipelines.check_site_snapshot:main" -microplex-us-compact-policyengine-dataset = "microplex_us.pipelines.compact_policyengine_dataset:main" -microplex-us-mp300k-artifact-gates = "microplex_us.pipelines.mp300k_artifact_gates:main" -microplex-us-package-mp300k-gate-inputs = "microplex_us.pipelines.mp300k_gate_inputs:main" -microplex-us-pe-dataset-readiness = "microplex_us.pipelines.pe_us_dataset_readiness:main" -microplex-us-dashboard = "microplex_us.pipelines.dashboard:main" -microplex-us-ecps-replacement-comparison = "microplex_us.pipelines.ecps_replacement_comparison:main" -microplex-us-mp-benchmark-manifest = "microplex_us.pipelines.mp_benchmark_manifest:main" -microplex-us-pe-native-calibration-benchmark = "microplex_us.pipelines.pe_native_calibration_benchmark:main" -microplex-us-pe-native-target-diagnostics = "microplex_us.pipelines.pe_native_scores:main_target_diagnostics" -microplex-us-publish-hf-artifacts = "microplex_us.pipelines.hf_artifacts:main" 
-microplex-us-smoke-hf-artifact = "microplex_us.pipelines.hf_artifacts:main_smoke" -microplex-us-r2-archive-artifact = "microplex_us.pipelines.r2_artifacts:main" -microplex-us-reweight-cd-age-targets = "microplex_us.pipelines.cd_age_reweighting:main" -microplex-us-score-pe-native-loss = "microplex_us.pipelines.pe_native_scores:main" -microplex-us-stage9-replay = "microplex_us.pipelines.stage9_replay:main" -microplex-us-write-transparency-sidecars = "microplex_us.pipelines.transparency_sidecars:main" -microplex-us-version-bump-benchmark = "microplex_us.pipelines.version_benchmark:main" - [tool.hatch.build.targets.wheel] packages = ["src/microplex_us"] [tool.hatch.metadata] allow-direct-references = true - -[tool.hatch.build.targets.wheel.force-include] -"src/microplex_us/pipelines/pe_native_scores.py" = "microplex_us/pipelines/pe_native_scores.py" -"src/microplex_us/pipelines/ecps_export_contract.json" = "microplex_us/pipelines/ecps_export_contract.json" -"src/microplex_us/pipelines/frozen_production_ecps_2024_benchmark_manifest.json" = "microplex_us/pipelines/frozen_production_ecps_2024_benchmark_manifest.json" -"src/microplex_us/specs/us-2024.yaml" = "microplex_us/specs/us-2024.yaml" - -[tool.pytest.ini_options] -testpaths = ["tests"] -python_files = ["test_*.py"] -addopts = "-v --tb=short" - -[tool.ruff] -line-length = 88 -target-version = "py310" - -[tool.ruff.lint] -select = ["E", "F", "I", "N", "W", "UP"] -ignore = [ - "E501", - "N803", - "N806", -] - -[tool.ruff.lint.per-file-ignores] -"examples/**/*.py" = ["E402"] -"tests/**/*.py" = ["E402", "N802"] diff --git a/reviews/2026-03-29-claude-state-program-review.md b/reviews/2026-03-29-claude-state-program-review.md deleted file mode 100644 index 64c9ffef..00000000 --- a/reviews/2026-03-29-claude-state-program-review.md +++ /dev/null @@ -1,78 +0,0 @@ -# Claude Review — 2026-03-29 - -Synthesized code review: US state-program accuracy work - -## Combined findings - -### Critical - -1. 
Calibration does not converge; the earlier "beating PE" claim was on unconverged weights. - - Artifact: `microplex-us/artifacts/tmp_state_programs_feasible_bootstrap_rerun_20260329.json` - - The review’s core point was that the entropy solver still had `converged: false`, with large remaining error, so the headline PE win was not yet a credible solved result. - -### High - -2. `min_active_households=1` lets degenerate constraints through. - - File: `microplex-us/src/microplex_us/pipelines/us.py` - - Recommendation: raise the floor to `5-10`. - -3. `has_medicaid` used the wrong support family. - - File: `microplex-us/src/microplex_us/variables.py` - - Recommendation: treat it as binary / zero-inflated rather than `BOUNDED_SHARE`. - -4. `ensure_target_support()` is a band-aid, not a structural support fix. - - File: `microplex-us/src/microplex_us/pipelines/us.py` - - Recommendation: do not mistake dtype/exemplar fixes for real calibration support. - -5. SNAP entity mismatch risk: `SPM_UNIT` spec vs household calibration path. - - Files: `microplex-us/src/microplex_us/variables.py`, `microplex-us/src/microplex_us/policyengine/harness.py` - - Recommendation: explicitly verify the SPM-unit-to-household projection path. - -### Medium - -6. All-zero transform fix had no warning. - - File: `microplex/src/microplex/transforms.py` - - Recommendation: emit a warning when the identity fallback is used. - -7. Condition-var auto-promotion was unconditional. - - File: `microplex-us/src/microplex_us/pipelines/us.py` - - Recommendation: avoid blindly promoting sparse or continuous proxies into the conditioning space. - -8. The review claimed there were effectively no project-level tests around these seams. - - This was directionally aimed at regression protection, but factually overstated; `microplex-us` does have substantial test coverage. - -9. Core transform fix only had Synthesizer-level coverage. 
- - File: `microplex/tests/test_synthesizer.py` - - Recommendation: add more direct transform-path coverage. - -### Low - -10. No warning when a scaffold is missing all support proxies. -11. `ZeroInflatedTransform.combine()` length mismatch guard is still absent. -12. Artifact manifest paths are local-filesystem-coupled. - -## Architectural risks - -1. Sparse state coverage is structural; calibration tuning alone does not create small-state support. -2. The corrected 102-constraint state-only path and the broader 3,611-constraint path can tell very different stories. -3. Without regression gates, condition-var and feasibility changes can silently flip the diagnosis again. - -## Top 3 next fixes - -1. Add a small-state oversampling floor in bootstrap/synthesis. -2. Raise `min_active_households` and warn when many constraints are dropped. -3. Add regression coverage for feasibility filtering, support filling, condition-var promotion, and harness slice stability. - -## Direct answers - -- Is the new diagnosis actually right? - - Partially. Calibration infeasibility was real, but the deeper issue is still sparse small-state support. - -- Is the corrected feasible state-only benchmark path sound and comparable to PE? - - It is a sound diagnostic slice, but not a full replacement for the broader canonical benchmark. - -- Do the new `n=2000` results support the claim that Microplex now beats PE on the corrected US state-program slice? - - Not convincingly at the time of review, because the solve was still unconverged. - -- What is the next highest-leverage fix? - - Improve small-state support, especially via sampling / support strategy, rather than only filtering constraints. 
diff --git a/reviews/2026-03-30-claude-broad-native-loss-checkpoint-review.md b/reviews/2026-03-30-claude-broad-native-loss-checkpoint-review.md deleted file mode 100644 index 81d45d79..00000000 --- a/reviews/2026-03-30-claude-broad-native-loss-checkpoint-review.md +++ /dev/null @@ -1,138 +0,0 @@ -# 2026-03-30 broad PE-native loss checkpoint review - -Reviewer: Claude Opus 4.6 -Scope: v2 clean broad PE-native result after deterministic CPS + rebuilt cache fixes - -## Artifacts reviewed - -- Clean v2 result: `artifacts/tmp_parity_inputs_broad_pe_native_20260330_v2.json` -- Earlier bad/stale result: `artifacts/tmp_parity_inputs_broad_pe_native_20260330.json` -- Provider repeatability: `artifacts/tmp_provider_repeatability_20260330.json` -- Pre-calibration repeatability: `artifacts/tmp_qrf_repeatability_precal_20260330.json` - -## Code reviewed - -- `src/microplex_us/data_sources/cps.py` -- `tests/test_cps_source_provider.py` -- `src/microplex_us/pipelines/pe_native_scores.py` -- `src/microplex_us/pipelines/us.py` -- `src/microplex_us/pipelines/performance.py` -- `src/microplex_us/policyengine/us.py` - -## Key numbers confirmed - -| Metric | Value | -|---|---| -| v2 candidate broad loss | 0.8754 | -| v1 (stale cache) candidate broad loss | 7.4331 | -| PE baseline broad loss | 0.0202 | -| v2 calibration converged | false | -| v2 constraints before feasibility | 3,611 | -| v2 constraints after feasibility | 1,255 | -| v2 constraints dropped | 2,356 (65.2%) | -| v2 kept scoring targets | 2,817 | - ---- - -## Finding 1 — SEVERITY: HIGH — Calibration-vs-scoring target mismatch dominates the loss - -The candidate is calibrated against 1,255 constraints but scored against 2,817 targets. The 1,562 unsupported targets are scored as if the candidate has zero mass for them. This is the structural reason the candidate's unweighted MSRE is ~0.887 across the board. 
- -The top three family deltas confirm this: - -| Family | Loss delta | n_targets | Candidate mean unweighted MSRE | -|---|---|---|---| -| `national_irs_other` | +0.255 | 401 | 0.841 | -| `state_agi_distribution` | +0.182 | 917 | 0.885 | -| `state_age_distribution` | +0.180 | 900 | 0.889 | - -These three account for 72% of the total loss delta. The near-uniform ~0.88 MSRE across all three families is diagnostic: the problem is not family-specific accuracy but blanket thin support/zero mass. - -**Impact**: Even if the calibration solver were perfect on its 1,255 constraints, the scored loss would still be dominated by the ~1,562 unsupported targets. This is the main engineering bottleneck. - -**Recommendation**: Increase source sample size to widen the support surface before tuning anything else. The build log shows `sample_n=1000` improved state-age support recall from 0.464 to 0.630 vs `sample_n=500`. - -## Finding 2 — SEVERITY: HIGH — Calibration never converges - -All saved artifacts across the entire build history show `converged=false` on the broad path. The v2 result has `mean_error=0.789` and `max_error=1.670`. - -Unconverged entropy weights are sensitive to solver internals (iteration count, step size, regularization). This means: -- The exact loss of 0.875 is not reproducible to better than ~0.02-0.03 even with fully deterministic inputs -- A/B comparisons between runs on the broad path are unreliable unless the delta exceeds the solver noise floor -- The `converged=false` flag makes it impossible to distinguish "the support surface is too thin" from "the solver ran out of iterations on a solvable problem" - -**Recommendation**: Diagnose whether the solver *can* converge on the 1,255 post-filter constraints by running with 10x iterations. If it still doesn't converge, the constraint set itself may be infeasible, and the feasibility filter needs tightening. 
- -## Finding 3 — SEVERITY: MEDIUM — Cache invalidation checks column presence, not derivation correctness - -`_processed_persons_have_household_geography` (`cps.py:786-791`) validates the processed cache by checking whether required columns exist and have non-null values. It does not check whether the *derivation logic* that produced those columns matches current code. - -This is the same class of bug that caused the 7.43 blow-up. The specific instance (missing columns) is fixed, but the pattern (stale derivation passing validation) is still latent. - -Example future trigger: if `is_disabled` derivation changes from any-of-6-flags to 3-of-6-flags, the stale cache will pass validation because the column exists, but contain incorrect values. - -**Recommendation**: Add a schema-version constant or derivation-hash to the processed cache filename (e.g., `cps_asec_2023_processed_v3.parquet`). Any loader logic change bumps the version, automatically invalidating stale caches. - -## Finding 4 — SEVERITY: LOW — `national_irs_other` is a heterogeneous bucket - -The single largest family delta (+0.255) is `national_irs_other`, which contains 401 targets across many distinct IRS dimensions: -- AGI bins by filing status (HOH, MFJ, Single) -- Income type totals (capital gains, partnership/S-corp, pension, qualified dividends) -- Count targets by AGI bracket - -The build log drilldown (`_BUILD_LOG.md:840-870`) confirms these are different failure modes: -- Hard-zero candidate mass on capital gains, partnership/S-corp, pension -- Missing high-AGI filer mass (no tax units above $1M AGI) -- HOH filing status bins with zero mass - -Treating this as one family obscures which sub-problems are fixable with current tools vs which require new source data. - -**Recommendation**: Split `national_irs_other` into sub-families (e.g., `national_irs_agi_by_filing`, `national_irs_income_type`, `national_irs_count`) in the family classifier to make diagnosis actionable. 
- -## Finding 5 — SEVERITY: LOW — Effective sample ratio is moderate but not alarming - -The v2 result shows: -- Household effective sample ratio: 0.404 (808 effective households from 2,000 rows) -- Person effective sample ratio: 0.431 (2,067 effective persons from 4,791 rows) -- No weight collapse suspected -- No tiny weights - -This is healthy enough for the current diagnostic phase. But as constraints increase (from fixing the support gap), the effective sample ratio will likely drop further. - ---- - -## Answers to the five review questions - -### Q1: Is the v2 result trustworthy enough for the next diagnosis step? - -**Yes, conditionally.** Determinism is confirmed at the provider and pre-calibration levels. The result is safe for identifying which families dominate the gap. It is not safe for claiming precision better than ~0.02-0.03 on the loss value itself due to unconverged calibration. - -### Q2: Was the 7.43 blow-up adequately explained? - -**Yes, fully.** Root cause chain: new CPS-derived PE inputs added -> stale processed cache served old data missing those columns -> zeros carried through to exported H5 -> PE targets saw zero mass -> `national_census_other` blew up by +6.58. Fix: extended `PERSON_CACHE_REQUIRED_COLUMNS`, rebuilt cache. No evidence of deeper bugs. - -### Q3: Are the top-3 attack surfaces correctly identified? - -**Yes.** `national_irs_other` (+0.255), `state_agi_distribution` (+0.182), and `state_age_distribution` (+0.180) account for 72% of the total delta. The nuance is that `national_irs_other` is heterogeneous and should be split for actionable diagnosis. - -### Q4: Is the code correct on determinism/cache? - -**The two specific bugs are fixed.** Latent seam: cache invalidation checks column presence but not derivation correctness (same bug class as the 7.43 blow-up, different trigger). - -### Q5: What should the next concrete engineering step be? - -**Priority-ordered:** - -1. 
**Increase source sample size** to 2000-3000 households. This is the steepest part of the support-recall curve and directly attacks the calibration-vs-scoring mismatch. - -2. **Diagnose calibration convergence** by running the solver with 10x iterations on the current 1,255 post-filter constraints. If it converges, the support gap is the bottleneck. If not, the constraint set is infeasible and the filter needs tightening. - -3. **Add a cache derivation version** to prevent the stale-cache class of bugs. - -4. **Split `national_irs_other`** in the family classifier for actionable sub-family diagnosis. - -**Do NOT pursue:** -- Sparse/L0 calibration on the broad path (634.0 loss, 3 orders of magnitude worse) -- Donor-imputer backend changes as a broad-loss lever (~0.035 total effect) -- State-floor oversampling in CPS subsampling (worsened loss in the smoke test) -- `n_synthetic > 3000` at current support levels (weight collapse documented at n=5000) diff --git a/reviews/2026-03-31-claude-direct-pe-native-optimizer-review.md b/reviews/2026-03-31-claude-direct-pe-native-optimizer-review.md deleted file mode 100644 index 39d08ce8..00000000 --- a/reviews/2026-03-31-claude-direct-pe-native-optimizer-review.md +++ /dev/null @@ -1,158 +0,0 @@ -# Direct PE-native optimizer review — 2026-03-31 - -## Scope - -Code review and architectural diagnosis of the new direct PE-native weight optimization path in `pe_native_optimization.py`, its integration into the performance harness, and interpretation of the first A/B result (0.0004 improvement on a 0.92 loss). 
- -Files reviewed: -- `src/microplex_us/pipelines/pe_native_optimization.py` -- `src/microplex_us/pipelines/performance.py` (harness integration) -- `src/microplex_us/pipelines/pe_native_scores.py` (scoring subprocess) -- `src/microplex_us/pipelines/__init__.py` (exports) -- `tests/pipelines/test_pe_native_optimization.py` -- `tests/pipelines/test_performance.py` -- Upstream: `policyengine_us_data/utils/loss.py`, `policyengine_us_data/calibration/unified_calibration.py` -- Artifacts: `tmp_pe_native_direct_opt_20260331.json`, raw candidate scores - ---- - -## Findings - -### 1. NO BUG — objective alignment is correct - -The optimizer's quadratic form `||M^T w - s||^2` is algebraically identical to the scorer's native loss. Derivation: - -The scorer (`pe_native_scores.py:182-185`) computes: -``` -estimate = w @ A -rel_error_j = ((estimate_j - t_j + 1) / (t_j + 1))^2 -loss = mean(inv_mean_norm * norm_j * rel_error_j) -``` - -The optimizer (`pe_native_optimization.py:74-85`) constructs: -``` -scaling_j = sqrt(inv_mean_norm * norm_j / T) / (t_j + 1) -M = A * scaling[newaxis, :] -s = (t - 1) * scaling -``` - -Expanding `||M^T w - s||^2`: -``` -= sum_j scaling_j^2 * (w^T A_j - t_j + 1)^2 -= sum_j (inv_mean_norm * norm_j / T) * ((w^T A_j - t_j + 1) / (t_j + 1))^2 -= (1/T) * inv_mean_norm * sum_j norm_j * rel_error_j -= scorer loss -``` - -Confirmed numerically: optimizer `initial_loss = 0.9233365911702254`, scorer raw loss `0.9233365911702252` — difference is `2e-16` (float64 noise). - -Both scripts use the same `build_loss_matrix()`, same `_ENHANCED_CPS_BAD_TARGETS`, same zero-mask threshold (`atol=0.1`), same national/state normalization. The objectives are provably identical. - -### 2. NO BUG — gradient, Lipschitz, and step size are correct - -Gradient of `f(w) = ||M^T w - s||^2` is `∇f = 2M(M^T w - s)`. 
Code at lines 254-256: -```python -residual = matrix.T @ weights - target # M^T w - s -gradient = 2.0 * (matrix @ residual) # 2M(M^T w - s) -``` - -Correct, including the L2 penalty term. - -Lipschitz constant via power iteration on `MM^T` yields `λ_max(MM^T)`. The full Lipschitz is `2λ_max + 2*l2_penalty`, matching the Hessian `2MM^T + 2λI`. Step size `1/L` gives guaranteed descent per projected-gradient iteration. All correct. - -### 3. NO BUG — simplex projection is correct - -Standard Michelot/Duchi O(n log n) projection onto `{x ≥ 0, Σx = total}`. Budget variant correctly restricts support via `argpartition`. Edge cases handled. - -### 4. NO BUG — H5 weight rewrite is correct - -Group-to-household mapping via `person_household_id × person_{group}_id` bridge tables is sound for PE entity structure. `setdefault` is safe because PE groups don't span households. Float64→float32 cast at write time is consistent with PE storage format. - -### 5. NO BUG — performance harness integration is correct - -`performance.py:812-846`: when `optimize_pe_native_loss=True`, the harness exports a candidate H5, runs the optimizer to produce a second H5 with optimized weights, and passes the optimized H5 to the scorer. The optimization metadata is attached to the scores dict under the `"optimization"` key. Wiring is correct. - -### 6. MINOR — weight-sum drift after projection iterations - -`optimized_weight_sum = 6,920,897` vs `initial_weight_sum = 6,920,834` — a drift of `63` (~9e-6 relative). Each simplex projection targets the same `total_weight`, but `np.maximum(clipped - theta, 0.0)` doesn't enforce the exact sum. The drift accumulates over 200 iterations and is practically negligible for the loss computation. - -**Fix (optional)**: add a single rescale after the final projection: -```python -weights *= total_weight / weights.sum() -``` - -### 7. 
MINOR — convergence reporting is misleading but not harmful - -`converged=false` after 200 iterations with total improvement of `0.0004` means per-step improvement averaged `~2e-6`, which exceeds `tol=1e-8`. The method is in a diminishing-returns regime, not a divergent one. More iterations would eventually trigger the convergence criterion but would not materially improve the loss. - -This is expected behavior for projected gradient descent on an overdetermined quadratic: the feasible minimum is close to the starting point, so each step makes only a tiny improvement, but the relative improvement `(current - candidate) / max(1, current)` stays above `tol` for many iterations. - -**Not a bug**, but the convergence report could be more informative. Consider logging the relative improvement trajectory or adding a `max_iter_exhausted` flag. - -### 8. MEDIUM — no end-to-end validation that rescored loss matches optimizer's internal loss - -The optimizer reports `optimized_loss` from its internal `objective()` call. The harness then rescores the rewritten H5 with the full PE-native scorer subprocess. These should match within float32/float64 tolerance, but there's no assertion validating this. If a future change causes the loss matrix extraction to diverge from the scoring path (e.g., different `build_loss_matrix` version, different target filtering), the optimizer would silently optimize a stale objective. - -**Recommended**: add a post-optimization assertion in `optimize_policyengine_us_native_loss_dataset` that compares `summary["optimized_loss"]` to the subprocess-reported `candidate_loss_before` from a re-extraction of the optimized H5. Or at minimum, log both values for manual comparison. - -### 9. 
NOT A BUG — test coverage is appropriate - -`test_pe_native_optimization.py` covers: -- Weight optimization reduces loss and respects budget constraint -- H5 weight rewrite propagates to person and group weight arrays -- Full end-to-end pipeline with monkeypatched subprocess - -`test_performance.py` covers: -- Harness rejects `optimize_pe_native_loss` without `evaluate_pe_native_loss` -- Optimizer parameters (budget, max_iter, l2_penalty, tol) pass through correctly -- Optimized H5 is what gets scored - -No gaps in the critical paths. - ---- - -## Question 1: Does the optimizer optimize the same objective as the scorer? - -**Yes, exactly.** Proven algebraically in Finding 1 and confirmed numerically (initial losses match within float64 noise). Both paths call `build_loss_matrix()` from the same `policyengine-us-data` checkout, apply the same bad-target/zero-target filtering, and use the same national/state normalization. - -## Question 2: Are there correctness bugs? - -**No serious bugs.** The scaled-matrix construction, projected gradient routine, H5 weight rewrite, and harness integration are all correct. Two items worth addressing: -- Weight-sum drift (~9e-6 relative) — minor (Finding 6); cosmetic, optional fix -- No cross-validation between optimizer's internal loss and rescored loss — medium (Finding 8); worth adding as a guard - -## Question 3: Does the tiny improvement (0.92334 → 0.92290) mean record support is the bottleneck? - -**Yes, this is strong evidence.** The argument: - -1. The optimizer directly minimizes the exact PE-native loss as a function of 2000 household weights, subject to non-negativity and sum constraints. -2. The lowest loss it reached in 200 iterations is 0.9229, only 0.05% better than the starting point of 0.9233. The run stopped at the iteration cap rather than at `tol`, but per Finding 7 it was already in a diminishing-returns regime, so further iterations would not change the picture. -3. This means the best achievable loss with these 2000 records is ~0.923 — the entropy calibrator was already near-optimal for this support. -4. PE's baseline achieves 0.020 with ~30,000 source-imputed records. -5. 
The gap is a factor of **46×**, and only 0.05% of it was attributable to the weight objective. - -The remaining 99.95% of the gap is structural: the 2000 households lack the support to span the ~2,817 target dimensions. This is consistent with all the build-log evidence: -- Feasibility filter drops 60-70% of calibration constraints -- State-age support recall is 0.49–0.63 (vs ~1.0 for PE) -- Many exact targets have literally zero candidate mass -- Scaling `sample_n` produces steeper loss improvements than any other lever tested - -The direct optimizer cleanly rules out "maybe entropy just has a bad weight objective" as a hypothesis. - -## Question 4: What should the next high-leverage step be? - -**Full-support selection path.** The direct weight optimizer has served its diagnostic purpose and confirmed the bottleneck. The next steps, in order of priority: - -1. **Full-support + budgeted household selection** — This is the path already started at `policyengine_selection_household_budget=29999` in the build log. Use the full CPS+PUF support (all ~30K+ source households without subsampling), preserve the full donor-integrated surface with `synthesis_backend='seed'`, then use the sparse selector to prune to a target household budget before final calibration. This directly addresses the support gap. - -2. **Move toward PE's L0 selection/calibration architecture** — PE-US-data's `unified_calibration.py` uses L0-regularized optimization to simultaneously select records and calibrate weights. The current microplex path does selection → calibration as separate stages. Unifying them (or at least using the same L0 regularizer for selection) would let the selector prefer households that jointly cover more target dimensions. This is the medium-term architectural convergence. - -3. **Do not invest further in direct weight optimization on small candidates** — The diagnostic value is exhausted. The optimizer proved that weight-objective mismatch accounts for <0.1% of the gap. 
Rerunning it on larger candidates would confirm the same conclusion at higher cost. - -4. **Keep the optimizer code** — It's clean, correct, and useful for future diagnostics (e.g., to measure how much a larger candidate's loss is weight-bounded vs support-bounded). - ---- - -## Summary - -The direct PE-native optimizer is mathematically correct, properly aligned with the scorer, and cleanly integrated into the harness. No serious bugs. The first A/B result (0.0004 improvement out of a 0.903 gap) definitively confirms that the bottleneck is record support/construction, not the weight objective. The next high-leverage move is the full-support + budgeted selection path already prototyped in the build log. diff --git a/reviews/PENDING_CLAUDE_REVIEW.md b/reviews/PENDING_CLAUDE_REVIEW.md deleted file mode 100644 index fc501143..00000000 --- a/reviews/PENDING_CLAUDE_REVIEW.md +++ /dev/null @@ -1,108 +0,0 @@ -# Pending Claude Review - -Please do a focused code review of the current US state-program accuracy work across: - -- `/Users/maxghenis/PolicyEngine/microplex-us` -- `/Users/maxghenis/PolicyEngine/microplex` - -Use agent teams if available in your environment. -Suggested split: -- one agent for `/Users/maxghenis/PolicyEngine/microplex-us` -- one agent for `/Users/maxghenis/PolicyEngine/microplex` -- one integrating agent to synthesize the calibration/benchmark conclusion - -## Read first - -- `/Users/maxghenis/PolicyEngine/microplex-us/AGENTS.md` -- `/Users/maxghenis/PolicyEngine/microplex-us/_WORKSPACE.md` -- `/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md` -- `/Users/maxghenis/PolicyEngine/microplex/AGENTS.md` -- `/Users/maxghenis/PolicyEngine/microplex/_WORKSPACE.md` -- `/Users/maxghenis/PolicyEngine/microplex/_BUILD_LOG.md` - -Then inspect recent changes with git diff/status and review the changed files and saved artifacts. - -## Review mindset - -- Findings first, ordered by severity. 
-- Prioritize bugs, behavioral regressions, benchmark-validity risks, abstraction mistakes, silent incompatibilities, and missing tests. -- Be skeptical and concrete. -- I want actionable review comments, not a broad summary. - -## Important recent changes - -- We investigated the US `state_programs_core` gap against PE. -- Earlier diagnosis leaned toward source/backbone support, but the recent diagnosis shifted toward calibration feasibility. -- `microplex-us` now has a calibration feasibility filter and better weight-collapse diagnostics in `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py`. -- Explicit semantic specs were added for `has_medicaid`, `public_assistance`, `ssi`, and `social_security` in `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/variables.py`. -- A core synthesizer bug for all-zero zero-inflated variables was fixed in `/Users/maxghenis/PolicyEngine/microplex/src/microplex/transforms.py`. -- A boolean-to-float support-fill bug was fixed in `ensure_target_support()` in `/Users/maxghenis/PolicyEngine/microplex-us/src/microplex_us/pipelines/us.py`. -- We ran corrected state-only reruns against the real PE-US-data calibration DB using: - - variables: `household_count`, `person_count` - - domains: `snap`, `medicaid_enrolled` - - geography: `state` -- Key artifacts: - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_state_programs_n2000_diagnostics_20260329.json` - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_cps_puf_rich_state_sweep_20260329.json` - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_state_programs_feasible_bootstrap_rerun_20260329.json` - - `/Users/maxghenis/PolicyEngine/microplex-us/artifacts/tmp_state_programs_feasible_synth_rerun_20260329.json` - -## Current headline result to verify - -On the corrected feasible state-only target estate, `n=2000` now appears to beat PE. 
- -Bootstrap rerun: -- Microplex MARE `0.7335` -- PE MARE `0.7386` -- Microplex composite `0.6770` -- PE composite `0.7704` - -Synthesizer rerun: -- Microplex MARE `0.6811` -- PE MARE `0.7386` -- Microplex composite `0.6481` -- PE composite `0.7704` -- target win rate `42.16%` - -## Focus especially on - -1. Whether the new diagnosis is actually correct: was the main blocker calibration infeasibility rather than source/backbone support? -2. Whether the new calibration feasibility filter in `us.py` is mathematically and operationally sound, or whether it is just hiding targets we should still be solving. -3. Whether the corrected state-only calibration scope is the right canonical target estate for this question, or whether it is too favorable, too narrow, or no longer comparable to PE. -4. Whether the bootstrap and synthesizer reruns are genuinely apples-to-apples against PE. -5. Whether the new proxy semantic specs are correct and sufficient. -6. Whether the all-zero zero-inflated transform fix in core is correct and safe. -7. Whether the `ensure_target_support()` bool/numeric coercion fix is correct or risks masking real support problems. -8. Whether there are missing tests that would let this diagnosis flip again incorrectly. -9. Whether the next correct operational step is to make this corrected state-only feasible calibration path part of the canonical US benchmark/version-benchmark flow. - -## Important context - -- We are intentionally trying to keep `microplex` generic and `microplex-us` thin where possible. -- We want to beat PE on real targets, but not by benchmarking an invalid or overly favorable target estate. -- PE rules remain the canonical runtime for program calculations. -- The question here is specifically whether we now have a sound US state-program benchmark path and a real result, not just a debugging artifact. - -## Please return - -1. Findings first, with severity and file/line references. -2. Then a short section on architectural risks. -3. 
Then the top 3 next fixes. -4. Then explicitly answer: - - Is the new diagnosis actually right? - - Is the corrected feasible state-only benchmark path sound and comparable to PE? - - Do the new `n=2000` results actually support the claim that Microplex now beats PE on the corrected US state-program slice? - - What is the next highest-leverage fix? - -## After the review - -1. Write the full review to: - - `/Users/maxghenis/PolicyEngine/microplex-us/reviews/2026-03-29-claude-state-program-review.md` -2. Append a concise summary to: - - `/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md` - -Keep the `_BUILD_LOG.md` append short: -- date -- scope reviewed -- top findings -- top 1-3 next fixes diff --git a/reviews/README.md b/reviews/README.md deleted file mode 100644 index 002bbf7c..00000000 --- a/reviews/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Reviews - -This directory is the durable handoff surface for Claude/Codex review work. - -## Steady-state workflow - -1. Codex writes the current review request to: - - `PENDING_CLAUDE_REVIEW.md` -2. The user gives Claude a short instruction such as: - - `Please execute the pending review request in /Users/maxghenis/PolicyEngine/microplex-us/reviews/PENDING_CLAUDE_REVIEW.md` -3. Claude writes the full review to a dated file in this directory. -4. Claude appends a short summary to: - - `/Users/maxghenis/PolicyEngine/microplex-us/_BUILD_LOG.md` - -## File roles - -- `PENDING_CLAUDE_REVIEW.md` - - current review request only -- `YYYY-MM-DD-*.md` - - full saved review outputs - -Keep `_BUILD_LOG.md` short. Full reviews belong here, not in the log. diff --git a/scripts/augment_targets_db_for_b2.py b/scripts/augment_targets_db_for_b2.py deleted file mode 100644 index b38e7cfe..00000000 --- a/scripts/augment_targets_db_for_b2.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Copy the calibration targets DB and add direct targets on SSI / CTC / ACA PTC. 
def main() -> int:
    """Copy a calibration targets DB and add direct national targets to the copy.

    Command-line arguments:
        --source     Path of the existing targets DB (copied, never modified).
        --output     Path of the augmented copy; parent directories are created.
        --variables  Benchmark names to add (default: ssi, ctc, aca_ptc).
        --period     Period stored on the inserted rows (default: 2024).

    For each variable, a row is inserted into ``targets`` with
    ``stratum_id=1`` and ``reform_id=0`` unless a row with the same
    variable, period, stratum and reform already exists.

    Returns:
        0 on success (used as the process exit code).

    Raises:
        KeyError: if a requested variable has no entry in
            ``DOWNSTREAM_BENCHMARKS_2024``. This is checked before the
            output file is written, so a typo leaves no stray copy behind.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", required=True, type=Path)
    parser.add_argument("--output", required=True, type=Path)
    parser.add_argument(
        "--variables",
        nargs="+",
        default=["ssi", "ctc", "aca_ptc"],
    )
    parser.add_argument("--period", default=2024, type=int)
    args = parser.parse_args()

    benchmarks_by_name = {spec.name: spec for spec in DOWNSTREAM_BENCHMARKS_2024}

    # Validate every requested variable up front: failing halfway through
    # would otherwise leave an un-augmented copy at --output.
    for variable in args.variables:
        if variable not in benchmarks_by_name:
            raise KeyError(f"No 2024 benchmark spec for {variable}")

    # The only benchmark table available here is the 2024 one, so any other
    # period receives 2024 values. Make that visible instead of silent.
    if args.period != 2024:
        print(
            f"[warn] benchmark values are 2024 totals; "
            f"inserting them for period {args.period}"
        )

    args.output.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(args.source, args.output)

    con = sqlite3.connect(args.output)
    try:
        cur = con.cursor()
        for variable in args.variables:
            spec = benchmarks_by_name[variable]
            cur.execute(
                "SELECT COUNT(*) FROM targets WHERE variable=? AND period=? "
                "AND stratum_id=1 AND reform_id=0",
                (variable, args.period),
            )
            if cur.fetchone()[0] > 0:
                print(
                    f"[skip] {variable} already has a national "
                    f"{args.period} target"
                )
                continue
            cur.execute(
                "INSERT INTO targets "
                "(variable, period, stratum_id, reform_id, value, active, source, notes) "
                "VALUES (?, ?, 1, 0, ?, 1, ?, ?)",
                (
                    variable,
                    args.period,
                    float(spec.benchmark),
                    spec.source,
                    f"B2 follow-up direct target for {variable}",
                ),
            )
            print(
                f"[add ] {variable} @ {args.period} national: "
                f"${spec.benchmark / 1e9:.1f}B ({spec.source})"
            )
        con.commit()
    finally:
        # Close even when a query fails; uncommitted inserts are discarded.
        con.close()
    print(f"\nWrote augmented DB to {args.output}")
    return 0
def _usable_target_cols(
    holdout: pd.DataFrame,
    synthetic: pd.DataFrame,
    target_cols: tuple[str, ...],
) -> list[str]:
    """Return the target columns present in both frames, in ``target_cols`` order."""
    return [
        col
        for col in target_cols
        if col in synthetic.columns and col in holdout.columns
    ]


def build_target_constraints(
    holdout: pd.DataFrame,
    synthetic: pd.DataFrame,
    target_cols: tuple[str, ...],
) -> tuple[LinearConstraint, ...]:
    """Build one total-sum calibration constraint per usable target column.

    For every column found in both frames, the constraint target is the
    unweighted sum of ``holdout[col]`` and the coefficients are the
    per-record values of ``synthetic[col]``. A weight vector satisfies the
    constraint when ``(weights * coefficients).sum()`` equals that target.

    Columns missing from either frame are skipped, so the result can be
    shorter than ``target_cols``.
    """
    return tuple(
        LinearConstraint(
            name=f"sum_{col}",
            coefficients=synthetic[col].to_numpy(dtype=float),
            target=float(holdout[col].sum()),
        )
        for col in _usable_target_cols(holdout, synthetic, target_cols)
    )


def evaluate_aggregates(
    holdout: pd.DataFrame,
    synthetic: pd.DataFrame,
    weights: np.ndarray,
    target_cols: tuple[str, ...],
) -> dict[str, dict[str, float]]:
    """Compare weighted synthetic totals against holdout totals per column.

    Args:
        holdout: Reference frame; its column sums are the "real" totals.
        synthetic: Synthetic records whose values are weighted.
        weights: One weight per row of ``synthetic``.
        target_cols: Columns to evaluate; those missing from either frame
            are skipped.

    Returns:
        ``{col: {"real_total", "weighted_synth_total", "relative_error"}}``.
        The relative error is ``|synth - real| / max(|real|, 1.0)``; the
        floor of 1.0 keeps it finite when the real total is zero.
    """
    report: dict[str, dict[str, float]] = {}
    for col in _usable_target_cols(holdout, synthetic, target_cols):
        real_total = float(holdout[col].sum())
        synth_total = float((synthetic[col].to_numpy(dtype=float) * weights).sum())
        report[col] = {
            "real_total": real_total,
            "weighted_synth_total": synth_total,
            "relative_error": abs(synth_total - real_total) / max(abs(real_total), 1.0),
        }
    return report


def main(argv: list[str] | None = None) -> int:
    """Fit each synthesizer, calibrate its output to holdout totals, report errors.

    For every requested method the script fits on the training split,
    generates ``len(train)`` synthetic rows, builds total-sum constraints
    against the holdout split, runs ``MicrocalibrateAdapter.fit_transform``
    and records per-target relative error before and after calibration.
    Results are written as JSON to ``--output`` and summarised on stdout.

    Args:
        argv: Argument list for ``argparse``; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on completion (used as the process exit code).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--n-rows", type=int, default=20_000)
    parser.add_argument(
        "--methods", nargs="+", default=["ZI-QRF", "ZI-MAF", "ZI-QDNN"]
    )
    parser.add_argument("--calibration-epochs", type=int, default=100)
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("artifacts/calibrate_on_synthesizer.json"),
    )
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )

    base = stage1_config()
    cfg = ScaleUpStageConfig(
        stage="calibrate_on_synth",
        n_rows=args.n_rows,
        methods=tuple(args.methods),
        condition_cols=DEFAULT_CONDITION_COLS,
        target_cols=DEFAULT_TARGET_COLS,
        holdout_frac=0.2,
        seed=args.seed,
        k=5,
        data_path=base.data_path,
        year=base.year,
        rare_cell_checks=(),
        prdc_max_samples=15_000,
    )
    runner = ScaleUpRunner(cfg)
    df = runner.load_frame()
    train, holdout = runner.split(df)
    LOGGER.info(
        "loaded %d rows; train=%d holdout=%d", len(df), len(train), len(holdout)
    )

    results = []
    for method_name in args.methods:
        LOGGER.info("== %s ==", method_name)
        if method_name not in METHOD_REGISTRY:
            LOGGER.warning("unknown method %r, skipping", method_name)
            continue
        method = METHOD_REGISTRY[method_name]()

        # perf_counter is monotonic, so durations cannot go negative if the
        # system clock is adjusted mid-run.
        t0 = time.perf_counter()
        method.fit(sources={"ecps": train.copy()}, shared_cols=list(DEFAULT_CONDITION_COLS))
        fit_s = time.perf_counter() - t0

        t0 = time.perf_counter()
        synthetic = method.generate(len(train), seed=args.seed)
        gen_s = time.perf_counter() - t0
        LOGGER.info(" fit=%.1fs gen=%.1fs n_synth=%d", fit_s, gen_s, len(synthetic))

        # Missing columns are dropped from both the constraints and the error
        # means, which would make methods incomparable. Surface it.
        usable = _usable_target_cols(holdout, synthetic, DEFAULT_TARGET_COLS)
        dropped = [col for col in DEFAULT_TARGET_COLS if col not in usable]
        if dropped:
            LOGGER.warning(
                " %d target column(s) missing from holdout or synthetic; "
                "excluded from constraints and error means: %s",
                len(dropped),
                dropped,
            )

        constraints = build_target_constraints(
            holdout, synthetic, DEFAULT_TARGET_COLS
        )
        LOGGER.info(" %d calibration constraints", len(constraints))
        if not constraints:
            # Nothing to calibrate against; the mean error would be NaN.
            LOGGER.warning(
                " no usable target columns for %s, skipping calibration",
                method_name,
            )
            continue

        synthetic = synthetic.copy()
        synthetic["weight"] = 1.0

        # Rescale initial weights so synth totals sum to holdout-scale before
        # calibration. Otherwise gradient descent has to travel a long way.
        # Only the first target column with positive totals sets the scale.
        for col in DEFAULT_TARGET_COLS:
            if col not in holdout.columns or col not in synthetic.columns:
                continue
            r_sum = float(holdout[col].sum())
            s_sum = float(synthetic[col].sum())
            if r_sum > 0 and s_sum > 0:
                synthetic["weight"] = synthetic["weight"] * (r_sum / s_sum)
                break

        pre_weights = synthetic["weight"].to_numpy(dtype=float)
        pre = evaluate_aggregates(holdout, synthetic, pre_weights, DEFAULT_TARGET_COLS)

        adapter = MicrocalibrateAdapter(
            MicrocalibrateAdapterConfig(
                epochs=args.calibration_epochs,
                learning_rate=1e-3,
                noise_level=0.0,
                seed=args.seed,
            )
        )
        t0 = time.perf_counter()
        calibrated = adapter.fit_transform(
            synthetic,
            marginal_targets={},
            weight_col="weight",
            linear_constraints=constraints,
        )
        cal_s = time.perf_counter() - t0

        post_weights = calibrated["weight"].to_numpy(dtype=float)
        post = evaluate_aggregates(
            holdout, calibrated, post_weights, DEFAULT_TARGET_COLS
        )
        validation = adapter.validate()

        pre_mean_err = float(
            np.mean([v["relative_error"] for v in pre.values()])
        )
        post_mean_err = float(
            np.mean([v["relative_error"] for v in post.values()])
        )
        LOGGER.info(
            " pre-cal mean rel err = %.4f; post-cal mean rel err = %.4f; cal=%.1fs",
            pre_mean_err,
            post_mean_err,
            cal_s,
        )

        results.append(
            {
                "method": method_name,
                "n_train": int(len(train)),
                "n_holdout": int(len(holdout)),
                "n_synthetic": int(len(synthetic)),
                "n_constraints": int(len(constraints)),
                "fit_wall_seconds": fit_s,
                "generate_wall_seconds": gen_s,
                "calibration_wall_seconds": cal_s,
                "pre_cal_mean_rel_err": pre_mean_err,
                "post_cal_mean_rel_err": post_mean_err,
                "calibration_max_error": validation["max_error"],
                "calibration_converged": validation["converged"],
                "pre_cal_per_target": pre,
                "post_cal_per_target": post,
                "calibrated_weights_summary": {
                    "min": float(post_weights.min()),
                    "max": float(post_weights.max()),
                    "mean": float(post_weights.mean()),
                    "std": float(post_weights.std()),
                    "zero_fraction": float((post_weights == 0).mean()),
                },
            }
        )

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(results, indent=2, default=str))

    print()
    print("== Pre / post mean-relative-error per method ==")
    for r in sorted(results, key=lambda x: x["post_cal_mean_rel_err"]):
        print(
            f" {r['method']:8s}: pre={r['pre_cal_mean_rel_err']:.4f} "
            f"post={r['post_cal_mean_rel_err']:.4f} "
            f"max={r['calibration_max_error']:.4f} "
            f"cal={r['calibration_wall_seconds']:.1f}s"
        )

    return 0
class Autoencoder(nn.Module):
    """Tiny autoencoder for dimensionality reduction on tabular features.

    The encoder maps ``n_features`` inputs through two ReLU hidden layers of
    width ``hidden`` to a ``latent_dim`` bottleneck; the decoder mirrors it
    back to ``n_features`` outputs.
    """

    def __init__(self, n_features: int, latent_dim: int = 16, hidden: int = 64) -> None:
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_features),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Reconstruct ``x`` by encoding and then decoding it."""
        return self.decoder(self.encoder(x))

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Return the latent representation of ``x``."""
        return self.encoder(x)


def fit_autoencoder(
    x: np.ndarray,
    latent_dim: int = 16,
    epochs: int = 200,
    lr: float = 1e-3,
    seed: int = 42,
) -> Autoencoder:
    """Fit an autoencoder on standardized features.

    Args:
        x: 2-D feature array; ``x.shape[1]`` sets the input width.
        latent_dim: Size of the bottleneck layer.
        epochs: Number of full passes over ``x``.
        lr: Adam learning rate.
        seed: Seeds both the weight initialisation and the mini-batch
            shuffling, so repeated calls with the same inputs produce the
            same model. Defaults to 42, the value previously hard-coded
            for shuffling.

    Returns:
        The trained model, switched to eval mode.
    """
    n_features = x.shape[1]

    # Seed the initial weights without disturbing the caller's global RNG
    # stream: fork_rng restores the previous RNG state on exit.
    with torch.random.fork_rng():
        torch.manual_seed(seed)
        model = Autoencoder(n_features=n_features, latent_dim=latent_dim)

    x_t = torch.tensor(x, dtype=torch.float32)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    batch_size = 256
    ds = torch.utils.data.TensorDataset(x_t)
    g = torch.Generator()
    g.manual_seed(seed)
    loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True, generator=g)

    model.train()
    for epoch in range(epochs):
        total = 0.0
        for (batch,) in loader:
            optimizer.zero_grad()
            recon = model(batch)
            loss = ((recon - batch) ** 2).mean()
            loss.backward()
            optimizer.step()
            # Weight by batch size so the logged value is a per-row mean even
            # when the last batch is short.
            total += loss.item() * len(batch)
        if (epoch + 1) % 50 == 0:
            LOGGER.info(" AE epoch %d loss=%.4f", epoch + 1, total / len(x))
    model.eval()
    return model


def encode(model: Autoencoder, x: np.ndarray) -> np.ndarray:
    """Embed ``x`` with ``model`` and return the latent codes as a NumPy array.

    Runs without gradient tracking; ``x`` is converted to float32 first.
    """
    with torch.no_grad():
        return model.encode(torch.tensor(x, dtype=torch.float32)).numpy()


def compute_prdc_both_spaces(
    real: pd.DataFrame,
    synthetic: pd.DataFrame,
    encoder: Autoencoder,
    scaler: StandardScaler,
    k: int = 5,
    max_samples: int = 15_000,
    seed: int = 42,
) -> dict:
    """Compute PRDC in the standardized raw space and in the latent space.

    Args:
        real: Reference records.
        synthetic: Generated records; only columns shared with ``real`` are used.
        encoder: Fitted autoencoder used for the latent-space comparison.
        scaler: Fitted scaler applied to both frames before either comparison.
        k: ``nearest_k`` passed to ``compute_prdc``.
        max_samples: Each side is subsampled without replacement to at most
            this many rows.
        seed: Seed for the subsampling RNG.

    Returns:
        ``{"raw": {...}, "embed": {...}}``, each a dict mapping the metric
        names returned by ``compute_prdc`` to floats.

    Raises:
        ValueError: if the number of shared columns differs from the number
            of features the scaler was fitted on.
    """
    rng = np.random.default_rng(seed)
    cols = [c for c in real.columns if c in synthetic.columns]

    # A synthesizer that omits a column would otherwise fail inside the
    # scaler with a message that does not mention columns at all.
    expected = getattr(scaler, "n_features_in_", None)
    if expected is not None and expected != len(cols):
        raise ValueError(
            f"scaler was fitted on {expected} features but real and synthetic "
            f"share only {len(cols)} columns"
        )

    r = real[cols].to_numpy(dtype=np.float64)
    s = synthetic[cols].to_numpy(dtype=np.float64)
    # Subsample real first, then synthetic, to keep the RNG draw order stable.
    if len(r) > max_samples:
        r = r[rng.choice(len(r), size=max_samples, replace=False)]
    if len(s) > max_samples:
        s = s[rng.choice(len(s), size=max_samples, replace=False)]

    raw_r = scaler.transform(r)
    raw_s = scaler.transform(s)
    raw_metrics = compute_prdc(raw_r, raw_s, nearest_k=k)

    emb_r = encode(encoder, raw_r.astype(np.float32))
    emb_s = encode(encoder, raw_s.astype(np.float32))
    emb_metrics = compute_prdc(emb_r, emb_s, nearest_k=k)

    return {
        "raw": {name: float(value) for name, value in raw_metrics.items()},
        "embed": {name: float(value) for name, value in emb_metrics.items()},
    }


def build_method(name: str):
    """Instantiate the synthesizer registered under ``name``.

    Supported names are ``"ZI-QRF"``, ``"ZI-MAF"`` and ``"ZI-QDNN"``; any
    other name raises ``KeyError``.
    """
    registry = {
        "ZI-QRF": ZIQRFMethod,
        "ZI-MAF": ZIMAFMethod,
        "ZI-QDNN": ZIQDNNMethod,
    }
    return registry[name]()
parser.add_argument("--latent-dim", type=int, default=16) - parser.add_argument("--ae-epochs", type=int, default=200) - args = parser.parse_args(argv) - - logging.basicConfig( - level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s" - ) - - base = stage1_config() - cfg = ScaleUpStageConfig( - stage="embedding_prdc", - n_rows=args.n_rows, - methods=tuple(args.methods), - condition_cols=DEFAULT_CONDITION_COLS, - target_cols=DEFAULT_TARGET_COLS, - holdout_frac=0.2, - seed=args.seed, - k=5, - data_path=base.data_path, - year=base.year, - rare_cell_checks=(), - prdc_max_samples=15_000, - ) - - runner = ScaleUpRunner(cfg) - df = runner.load_frame() - train, holdout = runner.split(df) - LOGGER.info( - "loaded: train=%d holdout=%d cols=%d", len(train), len(holdout), len(df.columns) - ) - - scaler = StandardScaler().fit(holdout.to_numpy(dtype=np.float64)) - - LOGGER.info("fitting autoencoder on holdout...") - t0 = time.time() - encoder = fit_autoencoder( - scaler.transform(holdout.to_numpy(dtype=np.float64)).astype(np.float32), - latent_dim=args.latent_dim, - epochs=args.ae_epochs, - ) - LOGGER.info(" autoencoder fit=%.1fs", time.time() - t0) - - results = [] - for method_name in args.methods: - LOGGER.info("== %s ==", method_name) - method = build_method(method_name) - t0 = time.time() - method.fit(sources={"ecps": train.copy()}, shared_cols=list(DEFAULT_CONDITION_COLS)) - fit_s = time.time() - t0 - - t0 = time.time() - synth = method.generate(len(train), seed=args.seed) - gen_s = time.time() - t0 - - metrics = compute_prdc_both_spaces( - holdout, synth, encoder, scaler, k=5, seed=args.seed - ) - LOGGER.info( - " raw: prec=%.3f dens=%.3f cov=%.3f", - metrics["raw"]["precision"], - metrics["raw"]["density"], - metrics["raw"]["coverage"], - ) - LOGGER.info( - " embed: prec=%.3f dens=%.3f cov=%.3f (fit=%.1fs gen=%.1fs)", - metrics["embed"]["precision"], - metrics["embed"]["density"], - metrics["embed"]["coverage"], - fit_s, - gen_s, - ) - results.append( 
- { - "method": method_name, - "fit_wall_seconds": fit_s, - "generate_wall_seconds": gen_s, - **metrics, - } - ) - - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(json.dumps(results, indent=2, default=str)) - - print() - print("== Raw-feature PRDC (50-dim) ==") - for r in sorted(results, key=lambda x: -x["raw"]["coverage"]): - print( - f" {r['method']:8s}: cov={r['raw']['coverage']:.3f} " - f"prec={r['raw']['precision']:.3f} dens={r['raw']['density']:.3f}" - ) - print() - print(f"== Learned-embedding PRDC ({args.latent_dim}-dim) ==") - for r in sorted(results, key=lambda x: -x["embed"]["coverage"]): - print( - f" {r['method']:8s}: cov={r['embed']['coverage']:.3f} " - f"prec={r['embed']['precision']:.3f} dens={r['embed']['density']:.3f}" - ) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/scripts/install.sh b/scripts/install.sh deleted file mode 100755 index 31c70378..00000000 --- a/scripts/install.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -PYTHON_VERSION="${MICROPLEX_US_PYTHON_VERSION:-3.14}" -INTEL_ENV_NAME="microplex-us-intel" - -usage() { - cat <<'USAGE' -Usage: ./scripts/install.sh [--prod|--dev|--dev-intel-mac] [--dry-run] - -Install modes: - --prod Install the production PolicyEngine runtime with uv. - --dev Install development and PolicyEngine dependencies with uv. - --dev-intel-mac Install the Intel macOS development environment via conda-forge. - -Options: - --dry-run Print commands instead of running them. - --help Show this help. - -Production macOS installs require Apple Silicon (arm64). Intel macOS is -development/testing-only; use --dev-intel-mac there. -USAGE -} - -repo_root() { - cd "$(dirname "${BASH_SOURCE[0]}")/.." 
# Print the path of the Python interpreter inside the Intel conda environment.
#
# Reads the globals CONDA_EXE, DRY_RUN and INTEL_ENV_NAME.
#   - CONDA_EXE containing a slash is used as a path as-is.
#   - Otherwise CONDA_EXE is looked up on PATH with `command -v`.
#   - If the lookup fails during a dry run, a "<env-name-python>" placeholder
#     is printed so the dry-run output stays readable.
#   - If the lookup fails outside a dry run, an error is written to stderr and
#     the function returns 1. Previously the failed lookup was simply repeated,
#     leaving conda_path empty and producing a path based on the current
#     directory.
#
# The conda root is taken to be the parent of the directory that holds the
# conda executable; the interpreter path is <root>/envs/<env-name>/bin/python.
resolve_intel_env_python() {
  local conda_path
  local conda_root
  if [[ "$CONDA_EXE" == */* ]]; then
    conda_path="$CONDA_EXE"
  elif command -v "$CONDA_EXE" >/dev/null 2>&1; then
    conda_path="$(command -v "$CONDA_EXE")"
  elif [[ "$DRY_RUN" == "1" ]]; then
    printf "<%s-python>\n" "$INTEL_ENV_NAME"
    return
  else
    printf "Could not find conda executable: %s\n" "$CONDA_EXE" >&2
    return 1
  fi
  # Fail instead of printing a bogus path when the directory does not exist.
  conda_root="$(cd "$(dirname "$conda_path")/.." && pwd)" || return 1
  printf "%s/envs/%s/bin/python\n" "$conda_root" "$INTEL_ENV_NAME"
}
-MESSAGE - exit 2 - fi -} - -reject_intel_mac_runtime() { - if [[ "$UNAME_S" == "Darwin" && "$UNAME_M" == "x86_64" ]]; then - intel_mac_message - exit 2 - fi -} - -MODE="prod" -DRY_RUN="0" - -while [[ $# -gt 0 ]]; do - case "$1" in - --prod) - MODE="prod" - ;; - --dev) - MODE="dev" - ;; - --dev-intel-mac) - MODE="dev-intel-mac" - ;; - --dry-run) - DRY_RUN="1" - ;; - --help|-h) - usage - exit 0 - ;; - *) - printf "Unknown option: %s\n\n" "$1" >&2 - usage >&2 - exit 2 - ;; - esac - shift -done - -REPO_ROOT="$(repo_root)" -ENV_FILE="$REPO_ROOT/envs/macos-intel-conda-forge.yml" -CONDA_EXE="${CONDA_EXE:-conda}" -UNAME_S="$(detect_uname_s)" -UNAME_M="$(detect_uname_m)" - -cd "$REPO_ROOT" - -case "$MODE" in - prod) - reject_intel_mac_runtime - run_cmd uv sync --python "$PYTHON_VERSION" --extra policyengine - ;; - dev) - reject_intel_mac_runtime - run_cmd uv sync --python "$PYTHON_VERSION" --extra dev --extra policyengine - ;; - dev-intel-mac) - require_intel_mac - run_cmd "$CONDA_EXE" env update --file "$ENV_FILE" --prune - INTEL_ENV_PYTHON="$(resolve_intel_env_python)" - run_cmd "$INTEL_ENV_PYTHON" -m pip install \ - --upgrade-strategy only-if-needed -e ".[dev,policyengine]" - run_cmd "$INTEL_ENV_PYTHON" -c \ - "import platform, torch; print(f'microplex-us Intel dev env ready: {platform.machine()} torch {torch.__version__}')" - ;; -esac diff --git a/scripts/isolate_calibration_memory.py b/scripts/isolate_calibration_memory.py deleted file mode 100644 index 1106123f..00000000 --- a/scripts/isolate_calibration_memory.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Isolate the calibration stage and profile its peak memory. - -The v7 (microcalibrate) and v8 (pe_l0) pipelines both OOM'd at the -calibration step with ~172–197 GB of compressed memory on a 48 GB -workstation. PE-US-data's production setup runs the same L0 fit on a -T4 GPU (16 GB VRAM) successfully, which strongly suggests our -pipeline has a leak or duplication an order of magnitude larger than -the legitimate workload. 
def build_synthetic_problem(
    n_records: int,
    n_constraints: int,
    density: float,
    seed: int = 42,
) -> tuple[sp.csr_matrix, np.ndarray, np.ndarray, list[str]]:
    """Build a synthetic sparse calibration problem.

    Draws ``int(n_constraints * n_records * density)`` random
    ``(row, col)`` positions and gives each a value drawn uniformly from
    ``[0.5, 1.5)``. Positions are drawn independently, so the same cell
    can be hit more than once; ``csr_matrix`` sums such duplicates, which
    means the stored ``nnz`` can be lower than requested and a cell value
    can exceed 1.5.

    Args:
        n_records: Number of matrix columns (one per record); must be >= 1.
        n_constraints: Number of matrix rows (one per target); must be >= 1.
        density: Requested fraction of non-zero cells, in ``[0, 1]``.
        seed: Seed for ``numpy.random.default_rng``; the same seed
            reproduces the same problem.

    Returns:
        ``(X, targets, weights, target_names)`` where ``X`` is a
        ``(n_constraints, n_records)`` float64 CSR matrix, ``weights`` are
        initial record weights drawn from ``[0.5, 2.0)``, ``targets`` are
        ``X @ weights`` with each entry scaled by a factor drawn from
        ``[0.8, 1.2)``, and ``target_names`` are ``"t0"``, ``"t1"``, ...

    Raises:
        ValueError: If a dimension is below 1 or ``density`` is outside
            ``[0, 1]`` (NaN included).
    """
    if n_records < 1 or n_constraints < 1:
        raise ValueError(
            "n_records and n_constraints must be >= 1, got "
            f"n_records={n_records}, n_constraints={n_constraints}"
        )
    # Written as a positive range check so that NaN is rejected too.
    if not 0.0 <= density <= 1.0:
        raise ValueError(f"density must be within [0, 1], got {density}")

    rng = np.random.default_rng(seed)
    total = n_constraints * n_records
    nnz = int(total * density)
    # The order of the draws below is part of the seeded output; keep it.
    rows = rng.integers(0, n_constraints, size=nnz)
    cols = rng.integers(0, n_records, size=nnz)
    data = rng.uniform(0.5, 1.5, size=nnz).astype(np.float64)
    X = sp.csr_matrix(
        (data, (rows, cols)),
        shape=(n_constraints, n_records),
        dtype=np.float64,
    )
    weights = rng.uniform(0.5, 2.0, size=n_records).astype(np.float64)
    estimated = X @ weights
    # Perturb each target by ±20% so the calibration has real work to do.
    targets = estimated * rng.uniform(0.8, 1.2, size=n_constraints)
    target_names = [f"t{i}" for i in range(n_constraints)]
    return X, targets, weights, target_names
def main(argv: list[str] | None = None) -> int:
    """Run the isolated calibration fit and print a per-stage report.

    Builds the synthetic problem with ``build_synthetic_problem``, runs
    ``fit_l0`` on it, forces a garbage collection, and after each step
    records elapsed time and peak RSS through ``_timestamp_stage``. A
    summary table and the final peak RSS are printed at the end.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on completion.
    """
    parser = argparse.ArgumentParser(description=__doc__ or "")
    parser.add_argument("--n-records", type=int, default=100_000)
    parser.add_argument("--n-constraints", type=int, default=500)
    parser.add_argument("--density", type=float, default=0.05)
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--lambda-l0", type=float, default=1e-4)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    print(
        f"Configuration: n_records={args.n_records:,} "
        f"n_constraints={args.n_constraints:,} density={args.density} "
        f"epochs={args.epochs} device={args.device}",
        flush=True,
    )

    stages: list[Stage] = []

    t0 = time.perf_counter()
    X, targets, weights, names = build_synthetic_problem(
        n_records=args.n_records,
        n_constraints=args.n_constraints,
        density=args.density,
        seed=args.seed,
    )
    stages.append(_timestamp_stage("build CSR + targets + weights", t0))
    # 12 bytes per stored entry: presumably an 8-byte float64 value plus a
    # 4-byte column index — TODO confirm the index dtype for very large nnz.
    print(
        f" CSR shape {X.shape}, nnz={X.nnz:,} "
        f"({X.nnz * 12 / 1024**3:.2f} GB raw storage estimate)",
        flush=True,
    )

    t0 = time.perf_counter()
    fit_l0(
        X_sparse=X,
        targets=targets,
        initial_weights=weights,
        target_names=names,
        epochs=args.epochs,
        device=args.device,
        lambda_l0=args.lambda_l0,
    )
    stages.append(_timestamp_stage("fit_l0_weights complete", t0))

    # Start the timer before collecting so this stage reports how long the
    # collection took; a timer started afterwards would always show ~0 s.
    t0 = time.perf_counter()
    gc.collect()
    stages.append(_timestamp_stage("after gc.collect", t0))

    print("\n--- summary ---")
    for s in stages:
        print(f" {s.name:<40} {s.elapsed_s:>8.1f}s peak={s.peak_rss_gb:>6.2f} GB")
    print(f"\nFinal peak RSS: {_peak_rss_gb():.2f} GB")
    return 0
-PERSON_TO_GROUP_LINK = { - "tax_unit": "person_tax_unit_id", - "spm_unit": "person_spm_unit_id", - "family": "person_family_id", - "marital_unit": "person_marital_unit_id", -} -STRUCTURAL_VARIABLE_ENTITIES = { - "household_id": "household", - "household_weight": "household", - "person_id": "person", - "person_household_id": "person", - "person_weight": "person", - "tax_unit_id": "tax_unit", - "person_tax_unit_id": "person", - "tax_unit_weight": "tax_unit", - "spm_unit_id": "spm_unit", - "person_spm_unit_id": "person", - "spm_unit_weight": "spm_unit", - "family_id": "family", - "person_family_id": "person", - "family_weight": "family", - "marital_unit_id": "marital_unit", - "person_marital_unit_id": "person", - "marital_unit_weight": "marital_unit", -} - - -def _load_all_arrays(h5_path: Path, period_key: str) -> dict[str, np.ndarray]: - with h5py.File(h5_path, "r") as f: - out = {} - for key in f.keys(): - if period_key in f[key]: - out[key] = np.asarray(f[key][period_key]) - return out - - -def _load_policyengine_variable_entities() -> dict[str, str]: - try: - from policyengine_us import ( - system as policyengine_system_module, # noqa: PLC0415 - ) - except ImportError: - return {} - - tax_benefit_system = getattr(policyengine_system_module, "system", None) - if tax_benefit_system is None: - return {} - variables = getattr(tax_benefit_system, "variables", {}) - entity_map: dict[str, str] = {} - for name, metadata in variables.items(): - entity_key = getattr(getattr(metadata, "entity", None), "key", None) - if entity_key is not None: - entity_map[str(name)] = str(entity_key) - return entity_map - - -def _entity_of( - variable: str, - arrays: dict[str, np.ndarray], - *, - variable_entities: dict[str, str] | None = None, -) -> str: - """Classify a variable, preferring PE metadata over fragile length matching.""" - explicit_entity = STRUCTURAL_VARIABLE_ENTITIES.get(variable) - if explicit_entity is not None: - return explicit_entity - if variable_entities is not None 
and variable in variable_entities: - return variable_entities[variable] - n = len(arrays[variable]) - entity_lengths = { - entity: len(arrays[id_col]) - for entity, id_col in ENTITY_ID_COLUMNS.items() - if id_col in arrays - } - matches = [entity for entity, length in entity_lengths.items() if length == n] - if len(matches) == 1: - return matches[0] - if len(matches) > 1: - raise ValueError( - f"Ambiguous entity for variable {variable!r}: matched {matches} by length" - ) - return "unknown" - - -def _build_entity_masks( - arrays: dict[str, np.ndarray], chunk_hh_ids: np.ndarray -) -> dict[str, np.ndarray]: - """Produce boolean masks into each entity array for the households in ``chunk_hh_ids``.""" - hh_id = arrays["household_id"] - masks: dict[str, np.ndarray] = {} - masks["household"] = np.isin(hh_id, chunk_hh_ids) - person_hh = arrays["person_household_id"] - person_mask = np.isin(person_hh, chunk_hh_ids) - masks["person"] = person_mask - for entity, link_col in PERSON_TO_GROUP_LINK.items(): - id_col = ENTITY_ID_COLUMNS[entity] - if link_col not in arrays or id_col not in arrays: - continue - group_ids_in_chunk = np.unique(arrays[link_col][person_mask]) - masks[entity] = np.isin(arrays[id_col], group_ids_in_chunk) - return masks - - -def _write_chunk_h5( - arrays: dict[str, np.ndarray], - entity_masks: dict[str, np.ndarray], - period_key: str, - tmp_path: Path, - *, - variable_entities: dict[str, str] | None = None, -) -> None: - """Write a subset h5 keeping only rows matching each variable's entity mask.""" - with h5py.File(tmp_path, "w") as f: - for variable, values in arrays.items(): - entity = _entity_of( - variable, - arrays, - variable_entities=variable_entities, - ) - mask = entity_masks.get(entity) - if mask is None or len(values) != len(mask): - continue - group = f.create_group(variable) - group.create_dataset(period_key, data=values[mask]) - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--dataset", required=True, 
type=Path) - parser.add_argument("--variable", required=True, type=str) - parser.add_argument("--period", default=2024, type=int) - parser.add_argument("--batch-size", default=50_000, type=int) - parser.add_argument("--output", required=True, type=Path) - args = parser.parse_args() - - period_key = str(args.period) - print(f"[{time.strftime('%H:%M:%S')}] loading all arrays from {args.dataset}", flush=True) - arrays = _load_all_arrays(args.dataset, period_key) - variable_entities = _load_policyengine_variable_entities() - print( - f"[{time.strftime('%H:%M:%S')}] loaded {len(arrays)} variables", - flush=True, - ) - - hh_ids = arrays[HOUSEHOLD_ID] - n_hh = len(hh_ids) - print(f"[{time.strftime('%H:%M:%S')}] {n_hh} households; batch_size={args.batch_size}", flush=True) - - total = 0.0 - n_batches = (n_hh + args.batch_size - 1) // args.batch_size - - from policyengine_us import Microsimulation # noqa: PLC0415 - - from microplex_us.validation.downstream import ( # noqa: PLC0415 - compute_downstream_weighted_aggregate, - ) - - for batch_idx in range(n_batches): - start = batch_idx * args.batch_size - end = min(start + args.batch_size, n_hh) - chunk_hh_ids = hh_ids[start:end] - - entity_masks = _build_entity_masks(arrays, chunk_hh_ids) - - with tempfile.TemporaryDirectory() as tmp: - tmp_path = Path(tmp) / "chunk.h5" - _write_chunk_h5( - arrays, - entity_masks, - period_key, - tmp_path, - variable_entities=variable_entities, - ) - - t0 = time.time() - sim = Microsimulation(dataset=str(tmp_path)) - chunk_sum = compute_downstream_weighted_aggregate( - sim, - args.variable, - args.period, - ) - total += chunk_sum - elapsed = time.time() - t0 - - print( - f"[{time.strftime('%H:%M:%S')}] batch {batch_idx+1}/{n_batches} " - f"(households {start}-{end}): ${chunk_sum/1e9:.3f}B " - f"cumulative=${total/1e9:.3f}B ({elapsed:.1f}s)", - flush=True, - ) - - print( - f"\n[{time.strftime('%H:%M:%S')}] {args.variable} total = ${total/1e9:.2f}B", - flush=True, - ) - 
args.output.parent.mkdir(parents=True, exist_ok=True) - raw_agg_path = args.output.with_suffix(".raw.json") - raw_aggs = ( - json.loads(raw_agg_path.read_text()) if raw_agg_path.exists() else {} - ) - raw_aggs[args.variable] = total - raw_agg_path.write_text(json.dumps(raw_aggs, indent=2)) - - from microplex_us.validation.downstream import ( # noqa: PLC0415 - DOWNSTREAM_BENCHMARKS_2024, - compute_downstream_comparison, - ) - - comparison = compute_downstream_comparison(raw_aggs, DOWNSTREAM_BENCHMARKS_2024) - report = {name: rec.to_dict() for name, rec in comparison.items()} - args.output.write_text(json.dumps(report, indent=2)) - print(f"[{time.strftime('%H:%M:%S')}] wrote {args.output}", flush=True) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/run_b2_validation.py b/scripts/run_b2_validation.py deleted file mode 100644 index 380dfe17..00000000 --- a/scripts/run_b2_validation.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Run B2 downstream validation on a calibrated PE-US h5. - -One variable at a time, flushing progress and intermediate output to -disk so a partial run leaves usable state. Uses the -``microplex_us.validation.downstream`` module for the benchmark set. 
-""" - -from __future__ import annotations - -import argparse -import json -import sys -import time -from pathlib import Path - -from microplex_us.validation.downstream import ( - DOWNSTREAM_BENCHMARKS_2024, - compute_downstream_comparison, - compute_downstream_weighted_aggregate, -) - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--dataset", required=True, type=Path) - parser.add_argument("--output", required=True, type=Path) - parser.add_argument("--period", default=2024, type=int) - args = parser.parse_args() - - print(f"[{time.strftime('%H:%M:%S')}] loading Microsimulation from {args.dataset}", flush=True) - from policyengine_us import Microsimulation - - sim = Microsimulation(dataset=str(args.dataset)) - print(f"[{time.strftime('%H:%M:%S')}] loaded", flush=True) - - variables = [spec.name for spec in DOWNSTREAM_BENCHMARKS_2024] - aggregates: dict[str, float] = {} - - args.output.parent.mkdir(parents=True, exist_ok=True) - intermediate_path = args.output.with_suffix(".partial.json") - - for variable in variables: - t0 = time.time() - print(f"[{time.strftime('%H:%M:%S')}] computing {variable} ...", flush=True) - try: - total = compute_downstream_weighted_aggregate(sim, variable, args.period) - except Exception as exc: - print(f" {variable}: FAILED ({exc})", flush=True) - aggregates[variable] = float("nan") - else: - aggregates[variable] = total - elapsed = time.time() - t0 - print( - f" {variable}: ${total/1e9:,.2f}B (in {elapsed:.1f}s)", - flush=True, - ) - # Flush partial state to disk after each variable so an OOM - # kill after N variables still leaves N results on disk. 
- intermediate_path.write_text(json.dumps(aggregates, indent=2)) - - comparison = compute_downstream_comparison(aggregates, DOWNSTREAM_BENCHMARKS_2024) - report = {name: rec.to_dict() for name, rec in comparison.items()} - args.output.write_text(json.dumps(report, indent=2)) - intermediate_path.unlink(missing_ok=True) - - print(f"\n[{time.strftime('%H:%M:%S')}] B2 validation complete", flush=True) - print(f"Wrote {args.output}", flush=True) - - print(f"\n{'variable':<12s} {'computed':>12s} {'benchmark':>12s} {'rel_error':>10s}") - for name, rec in sorted(comparison.items()): - rel = rec.rel_error - rel_str = f"{rel*100:+.1f}%" if rel is not None else "N/A" - print( - f"{name:<12s} ${rec.computed/1e9:>9.2f}B " - f"${rec.benchmark/1e9:>9.2f}B {rel_str:>10s}", - flush=True, - ) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/run_b2_validation_single_var.py b/scripts/run_b2_validation_single_var.py deleted file mode 100644 index d67abf11..00000000 --- a/scripts/run_b2_validation_single_var.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Compute one B2 downstream aggregate in a fresh process. - -Fresh-per-variable keeps the peak memory of each variable independent -so one heavy variable (e.g. income_tax) OOM-killing doesn't wipe out -progress on the others. Append-writes to the output JSON. 
-""" - -from __future__ import annotations - -import argparse -import json -import sys -import time -from pathlib import Path - -from microplex_us.validation.downstream import ( - DOWNSTREAM_BENCHMARKS_2024, - compute_downstream_comparison, - compute_downstream_weighted_aggregate, -) - - -def main() -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--dataset", required=True, type=Path) - parser.add_argument("--output", required=True, type=Path) - parser.add_argument("--variable", required=True, type=str) - parser.add_argument("--period", default=2024, type=int) - args = parser.parse_args() - - print(f"[{time.strftime('%H:%M:%S')}] loading Microsimulation", flush=True) - from policyengine_us import Microsimulation - - sim = Microsimulation(dataset=str(args.dataset)) - print(f"[{time.strftime('%H:%M:%S')}] loaded — computing {args.variable}", flush=True) - t0 = time.time() - total = compute_downstream_weighted_aggregate(sim, args.variable, args.period) - elapsed = time.time() - t0 - print( - f"[{time.strftime('%H:%M:%S')}] {args.variable} = ${total/1e9:.2f}B " - f"(in {elapsed:.1f}s)", - flush=True, - ) - - args.output.parent.mkdir(parents=True, exist_ok=True) - # Re-read intermediate file if present (accumulates across runs). 
- raw_agg_path = args.output.with_suffix(".raw.json") - raw_aggs = ( - json.loads(raw_agg_path.read_text()) if raw_agg_path.exists() else {} - ) - raw_aggs[args.variable] = total - raw_agg_path.write_text(json.dumps(raw_aggs, indent=2)) - - comparison = compute_downstream_comparison(raw_aggs, DOWNSTREAM_BENCHMARKS_2024) - report = {name: rec.to_dict() for name, rec in comparison.items()} - args.output.write_text(json.dumps(report, indent=2)) - print(f"[{time.strftime('%H:%M:%S')}] updated {args.output}", flush=True) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/sync_policyengine_theme.py b/scripts/sync_policyengine_theme.py deleted file mode 100644 index e04b5639..00000000 --- a/scripts/sync_policyengine_theme.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Sync exported PolicyEngine design tokens into browser-readable CSS variables.""" - -from __future__ import annotations - -import argparse -import re -import sys -from pathlib import Path - - -def default_source(repo_root: Path) -> Path: - """Return the first local PolicyEngine token export next to this repo.""" - - candidates = ( - repo_root.parent / "policyengine.org" / "packages" / "config" / "theme.css", - repo_root.parent / "policyengine" / "packages" / "config" / "theme.css", - repo_root.parent / "policyengine" / "apps" / "web" / "src" / "app" / "globals.css", - ) - for candidate in candidates: - if candidate.exists(): - return candidate - searched = ", ".join(str(path) for path in candidates) - raise FileNotFoundError(f"Could not find exported PolicyEngine theme. 
def render_browser_tokens(
    source_text: str,
    *,
    source_path: Path,
    repo_root: Path,
) -> str:
    """Convert a Tailwind v4 @theme block into CSS custom properties.

    Takes the contents of the first ``@theme { ... }`` block in
    ``source_text`` and re-emits them inside a ``:root { ... }`` rule,
    preceded by a generated-file header comment that names the source.

    Args:
        source_text: Full text of the exported theme stylesheet.
        source_path: Where ``source_text`` was read from; used in the header
            comment and in the error message.
        repo_root: Root of this repository. The header shows ``source_path``
            relative to ``repo_root.parent`` when that is possible, and the
            unmodified ``source_path`` otherwise.

    Returns:
        The rendered stylesheet text, ending with a newline.

    Raises:
        ValueError: If ``source_text`` contains no ``@theme`` block.
    """
    # A positional capture group is used for the block contents. The match is
    # non-greedy, so the block ends at the first closing brace.
    # NOTE(review): a theme containing nested braces would be truncated there
    # — confirm the exported file never nests rules inside @theme.
    match = re.search(r"@theme\s*\{(.*?)\}", source_text, flags=re.DOTALL)
    if not match:
        raise ValueError(f"No @theme block found in {source_path}")
    body = match.group(1).strip()
    try:
        display_source = source_path.relative_to(repo_root.parent)
    except ValueError:
        # relative_to raises when source_path is not under repo_root.parent.
        display_source = source_path
    return (
        "/* Generated from the exported PolicyEngine design tokens.\n"
        f" Source: {display_source}\n"
        " Re-run: python scripts/sync_policyengine_theme.py\n"
        "*/\n"
        ":root {\n"
        f"{body}\n"
        "}\n"
    )
-"""Isolated per-column ZI classifier evaluation. - -Answers the diagnostic question behind the 5-way ZI-QDNN coverage tie: if we -strip the downstream draw network out of the loop and evaluate only the -zero/non-zero classifier's own calibration and discrimination, do the five -candidates still look equivalent? - -Protocol --------- - -- Same data as the coverage benchmark: enhanced_cps_2024, 77,006 persons, 14 - conditioning columns, 36 target columns, seed 42. -- Same outer 80/20 train/holdout split used by ScaleUpRunner. -- For each target column with training-set zero-fraction >= 10% (the upstream - ZI trigger) and at least 10 zero + 10 non-zero training rows, further split - training 80/20 (seed 42) into fit / val. -- Label is (~at_min).astype(int), matching `_MultiSourceBase.fit`. -- Fit each of 5 classifiers on (X_fit, label_fit), predict P(y>0) on X_val. -- Report: log-loss, Brier, ECE (10 equal-width bins), ROC-AUC, fit seconds. - -Aggregation ------------ - -For each classifier, report column-count-weighted mean and median across the -eligible target columns. The RF default should be the baseline everything else -is compared against, since it is what the coverage benchmark locked in. 
-""" - -from __future__ import annotations - -import argparse -import json -import logging -import time -from collections.abc import Callable -from pathlib import Path -from typing import Any - -import numpy as np -from sklearn.metrics import brier_score_loss, log_loss, roc_auc_score - -from microplex_us.bakeoff.local_methods import ( - _dnn_factory, - _hgb_factory, - _logistic_factory, - _rf_calibrated_factory, -) -from microplex_us.bakeoff.scale_up import ( - DEFAULT_CONDITION_COLS, - DEFAULT_ENHANCED_CPS_PATH, - DEFAULT_TARGET_COLS, - _load_enhanced_cps, -) - -LOGGER = logging.getLogger(__name__) - - -def _rf_default_factory(): - from sklearn.ensemble import RandomForestClassifier - - return RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1) - - -CLASSIFIERS: dict[str, Callable[[], Any]] = { - "RF_default": _rf_default_factory, - "Logistic": _logistic_factory, - "HistGB": _hgb_factory, - "RF_calibrated": _rf_calibrated_factory, - "DNN": _dnn_factory, -} - - -def _expected_calibration_error( - y_true: np.ndarray, p_hat: np.ndarray, n_bins: int = 10 -) -> float: - """Equal-width ECE: sum over bins of (n_bin/N) * |acc - conf|.""" - edges = np.linspace(0.0, 1.0, n_bins + 1) - ece = 0.0 - n = len(y_true) - for i in range(n_bins): - lo, hi = edges[i], edges[i + 1] - if i == n_bins - 1: - mask = (p_hat >= lo) & (p_hat <= hi) - else: - mask = (p_hat >= lo) & (p_hat < hi) - if not mask.any(): - continue - bin_conf = float(p_hat[mask].mean()) - bin_acc = float(y_true[mask].mean()) - ece += (mask.sum() / n) * abs(bin_conf - bin_acc) - return float(ece) - - -def _positive_class_proba(clf: Any, X: np.ndarray) -> np.ndarray: - """Return P(y == 1 | x) regardless of how the classifier orders classes.""" - proba = clf.predict_proba(X) - classes = np.asarray(clf.classes_) - pos_idx = int(np.where(classes == 1)[0][0]) - return proba[:, pos_idx] - - -def evaluate_column( - col: str, - X_fit: np.ndarray, - y_fit_label: np.ndarray, - X_val: np.ndarray, - y_val_label: 
np.ndarray, -) -> dict[str, dict[str, float]]: - """Fit every classifier on (X_fit, y_fit_label); score on val.""" - results: dict[str, dict[str, float]] = {} - for name, factory in CLASSIFIERS.items(): - clf = factory() - t0 = time.perf_counter() - clf.fit(X_fit, y_fit_label) - fit_s = time.perf_counter() - t0 - p_hat = _positive_class_proba(clf, X_val) - p_hat = np.clip(p_hat, 1e-6, 1 - 1e-6) - ll = float(log_loss(y_val_label, p_hat, labels=[0, 1])) - brier = float(brier_score_loss(y_val_label, p_hat)) - ece = _expected_calibration_error(y_val_label, p_hat, n_bins=10) - try: - auc = float(roc_auc_score(y_val_label, p_hat)) - except ValueError: - auc = float("nan") - results[name] = { - "log_loss": ll, - "brier": brier, - "ece": ece, - "auc": auc, - "fit_s": fit_s, - } - return results - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser(description=__doc__ or "") - parser.add_argument( - "--data-path", type=Path, default=DEFAULT_ENHANCED_CPS_PATH - ) - parser.add_argument("--year", default="2024") - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--holdout-frac", type=float, default=0.2) - parser.add_argument("--inner-val-frac", type=float, default=0.2) - parser.add_argument("--zero-threshold", type=float, default=0.1) - parser.add_argument( - "--output", - type=Path, - default=Path( - "/Users/maxghenis/PolicyEngine/microplex-us/artifacts/" - "zi_classifier_isolated_eval.json" - ), - ) - parser.add_argument("--log-level", default="INFO") - args = parser.parse_args(argv) - logging.basicConfig( - level=getattr(logging, args.log_level), - format="%(asctime)s %(levelname)s %(name)s: %(message)s", - ) - - columns = list(DEFAULT_CONDITION_COLS) + list(DEFAULT_TARGET_COLS) - df = _load_enhanced_cps(args.data_path, args.year, columns) - df = df.astype(np.float32) - LOGGER.info("loaded %d rows x %d cols", len(df), len(df.columns)) - - rng = np.random.default_rng(args.seed) - idx = rng.permutation(len(df)) - cut 
= int(len(df) * (1.0 - args.holdout_frac)) - train = df.iloc[idx[:cut]].reset_index(drop=True) - LOGGER.info("outer split: %d train rows (holdout discarded, not needed here)", len(train)) - - inner_rng = np.random.default_rng(args.seed + 1) - inner_idx = inner_rng.permutation(len(train)) - inner_cut = int(len(train) * (1.0 - args.inner_val_frac)) - fit_idx, val_idx = inner_idx[:inner_cut], inner_idx[inner_cut:] - LOGGER.info("inner split: %d fit / %d val", len(fit_idx), len(val_idx)) - - cond = list(DEFAULT_CONDITION_COLS) - X_train_all = train[cond].to_numpy() - X_fit_all = X_train_all[fit_idx] - X_val_all = X_train_all[val_idx] - - per_col: dict[str, Any] = {} - eligible: list[str] = [] - skipped: list[dict[str, Any]] = [] - - for col in DEFAULT_TARGET_COLS: - y = train[col].to_numpy() - min_val = float(np.nanmin(y)) - at_min = np.isclose(y, min_val, atol=1e-6) - zero_frac = float(at_min.mean()) - label = (~at_min).astype(int) - - fit_label = label[fit_idx] - val_label = label[val_idx] - n_zero_fit = int((fit_label == 0).sum()) - n_pos_fit = int((fit_label == 1).sum()) - n_zero_val = int((val_label == 0).sum()) - n_pos_val = int((val_label == 1).sum()) - - if zero_frac < args.zero_threshold: - skipped.append( - {"col": col, "reason": "below_zero_threshold", "zero_frac": zero_frac} - ) - continue - if n_zero_fit < 10 or n_pos_fit < 10: - skipped.append( - { - "col": col, - "reason": "insufficient_class_counts_fit", - "n_zero_fit": n_zero_fit, - "n_pos_fit": n_pos_fit, - } - ) - continue - if n_zero_val < 1 or n_pos_val < 1: - skipped.append( - { - "col": col, - "reason": "insufficient_class_counts_val", - "n_zero_val": n_zero_val, - "n_pos_val": n_pos_val, - } - ) - continue - - LOGGER.info( - "== %s == zero_frac=%.3f fit=%d/%d val=%d/%d (zero/pos)", - col, - zero_frac, - n_zero_fit, - n_pos_fit, - n_zero_val, - n_pos_val, - ) - - col_result = evaluate_column( - col=col, - X_fit=X_fit_all, - y_fit_label=fit_label, - X_val=X_val_all, - y_val_label=val_label, - ) - 
- per_col[col] = { - "zero_frac_train": zero_frac, - "min_val": min_val, - "n_zero_fit": n_zero_fit, - "n_pos_fit": n_pos_fit, - "n_zero_val": n_zero_val, - "n_pos_val": n_pos_val, - "classifiers": col_result, - } - eligible.append(col) - - summary = " ".join( - f"{clf}=ll{m['log_loss']:.4f}/auc{m['auc']:.3f}" - for clf, m in col_result.items() - ) - LOGGER.info(" %s", summary) - - # Aggregate across eligible columns - aggregate: dict[str, dict[str, float]] = {} - for clf in CLASSIFIERS: - rows = [per_col[c]["classifiers"][clf] for c in eligible] - if not rows: - continue - agg = { - "log_loss_mean": float(np.mean([r["log_loss"] for r in rows])), - "log_loss_median": float(np.median([r["log_loss"] for r in rows])), - "brier_mean": float(np.mean([r["brier"] for r in rows])), - "ece_mean": float(np.mean([r["ece"] for r in rows])), - "auc_mean": float(np.nanmean([r["auc"] for r in rows])), - "auc_median": float(np.nanmedian([r["auc"] for r in rows])), - "fit_s_total": float(np.sum([r["fit_s"] for r in rows])), - } - aggregate[clf] = agg - - out = { - "config": { - "data_path": str(args.data_path), - "year": args.year, - "seed": args.seed, - "holdout_frac": args.holdout_frac, - "inner_val_frac": args.inner_val_frac, - "zero_threshold": args.zero_threshold, - "n_train_rows": len(train), - "n_fit_rows": len(fit_idx), - "n_val_rows": len(val_idx), - "condition_cols": list(DEFAULT_CONDITION_COLS), - "target_cols": list(DEFAULT_TARGET_COLS), - "eligible_cols": eligible, - "skipped": skipped, - }, - "per_column": per_col, - "aggregate": aggregate, - } - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(json.dumps(out, indent=2, default=str)) - LOGGER.info("wrote %s", args.output) - - print() - print(f"Eligible columns (zero_frac >= {args.zero_threshold}): {len(eligible)}") - print(f"Skipped columns: {len(skipped)}") - print() - print( - f"{'classifier':>15} {'log_loss':>9} {'log_loss_med':>12} " - f"{'brier':>7} {'ece':>7} {'auc':>6} 
{'auc_med':>7} {'total_fit_s':>11}" - ) - ordered = sorted(aggregate.items(), key=lambda kv: kv[1]["log_loss_mean"]) - for clf, agg in ordered: - print( - f"{clf:>15} {agg['log_loss_mean']:9.4f} {agg['log_loss_median']:12.4f} " - f"{agg['brier_mean']:7.4f} {agg['ece_mean']:7.4f} " - f"{agg['auc_mean']:6.3f} {agg['auc_median']:7.3f} " - f"{agg['fit_s_total']:11.1f}" - ) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/__init__.py b/src/microplex_us/__init__.py deleted file mode 100644 index 44409b80..00000000 --- a/src/microplex_us/__init__.py +++ /dev/null @@ -1,415 +0,0 @@ -"""US-specific adapters, targets, and pipelines for microplex.""" - -# ruff: noqa: E402, I001 - -from importlib import import_module -from typing import Any - -from microplex.targets import TargetSet, TargetSpec - -from microplex_us.calibration_harness import ( - CalibrationHarness, - CalibrationResult, - run_pe_parity_suite, -) -from microplex_us.cps_synthetic import ( - CPSSummaryStats, - CPSSyntheticGenerator, - validate_synthetic, -) -from microplex_us.data import ( - create_sample_data, - get_data_info, - load_cps_asec, - load_cps_for_synthesis, -) - -try: - from microplex_us.geography import ( - BLOCK_LEN, - COUNTY_LEN, - STATE_LEN, - TRACT_LEN, - BlockGeography, - derive_geographies, - load_block_probabilities, - normalize_us_state_fips, - ) -except ImportError: - BLOCK_LEN = None - COUNTY_LEN = None - STATE_LEN = None - TRACT_LEN = None - BlockGeography = None - derive_geographies = None - load_block_probabilities = None - normalize_us_state_fips = None -from microplex_us.hierarchical import prepare_cps_for_hierarchical -from microplex_us.pe_targets import ( - PETargets, - create_calibration_targets, - get_pe_targets, -) - -_PIPELINE_EXPORTS = ( - "DEFAULT_ATOMIC_AGE_BINS", - "DEFAULT_ATOMIC_AGE_LABELS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS", - "FrontierMetric", - 
"PEUSDataRebuildCheckpointEvidenceResult", - "PEUSDataRebuildCheckpointResult", - "PEUSDataRebuildProgram", - "PEUSDataRebuildStage", - "PEUSDataRebuildStatus", - "USHouseholdTargetReweightingResult", - "USMicroplexArtifactPaths", - "USMicroplexBuildConfig", - "USMicroplexBuildResult", - "USMicroplexExperimentReport", - "USMicroplexExperimentResult", - "USMicroplexPerformanceHarnessConfig", - "USMicroplexPerformanceHarnessRequest", - "USMicroplexPerformanceHarnessResult", - "USMicroplexPerformanceSession", - "USMicroplexPipeline", - "USMicroplexReducedBenchmarkHarnessConfig", - "USMicroplexReducedBenchmarkHarnessResult", - "USMicroplexReducedBenchmarkReport", - "USMicroplexReducedBenchmarkSpec", - "USMicroplexReducedCalibrationReport", - "USMicroplexReducedDimensionSpec", - "USMicroplexReducedMeasureSpec", - "USMicroplexReducedMultiCalibrationReport", - "USMicroplexRunRegistryEntry", - "USMicroplexSourceExperimentSpec", - "USMicroplexTargets", - "USMicroplexVersionedBuildArtifacts", - "backfill_us_pe_native_audit_bundle", - "backfill_us_pe_native_audit_bundles", - "backfill_us_pe_native_audit_root", - "backfill_us_pe_native_scores_bundle", - "backfill_us_pe_native_scores_bundles", - "backfill_us_pe_native_scores_root", - "append_us_microplex_run_index_entry", - "append_us_microplex_run_registry_entry", - "attach_policyengine_us_data_rebuild_checkpoint_evidence", - "build_and_save_versioned_us_microplex", - "build_and_save_versioned_us_microplex_from_data_dir", - "build_and_save_versioned_us_microplex_from_source_provider", - "build_and_save_versioned_us_microplex_from_source_providers", - "build_policyengine_us_data_rebuild_markdown", - "build_policyengine_us_data_rebuild_native_audit", - "build_policyengine_us_data_rebuild_pipeline", - "build_us_microplex", - "build_us_microplex_run_registry_entry", - "build_us_microplex_site_snapshot", - "calibrate_and_evaluate_us_reduced_benchmark_specs", - "calibrate_and_evaluate_us_reduced_benchmarks", - 
"compare_us_microplex_target_delta_rows", - "default_policyengine_us_data_rebuild_checkpoint_config", - "default_policyengine_us_data_rebuild_config", - "default_policyengine_us_data_rebuild_program", - "default_policyengine_us_data_rebuild_queries", - "default_policyengine_us_data_rebuild_source_providers", - "default_us_atomic_rung0_benchmarks", - "default_us_atomic_rung1_benchmarks", - "default_us_atomic_rung2_calibration", - "default_us_atomic_rung3_calibration", - "default_us_atomic_rung4_calibration", - "default_us_atomic_rung5_calibration", - "default_us_source_mix_experiments", - "evaluate_us_reduced_benchmark", - "discover_us_candidate_artifact_dirs", - "list_us_microplex_target_delta_rows", - "load_us_microplex_run_registry", - "rebuild_us_microplex_run_index", - "reduced_benchmark_specs_to_calibration_targets", - "reduced_benchmark_to_calibration_targets", - "resolve_us_microplex_frontier_artifact_dir", - "resolve_us_microplex_run_index_path", - "reweight_us_household_targets", - "run_policyengine_us_data_rebuild_checkpoint", - "run_us_microplex_performance_harness", - "run_us_microplex_reduced_benchmark_harness", - "run_us_microplex_source_experiments", - "save_us_microplex_artifacts", - "save_versioned_us_microplex_artifacts", - "save_versioned_us_microplex_build_result", - "select_us_microplex_frontier_entry", - "select_us_microplex_frontier_index_row", - "warm_us_microplex_parity_cache", - "write_policyengine_us_data_rebuild_native_audit", - "write_us_microplex_site_snapshot", - "rebuild_us_pe_native_run_registry", -) - -_POLICYENGINE_EXPORTS = ( - "PolicyEngineUSComparisonCache", - "PolicyEngineUSConstraint", - "PolicyEngineUSDBTarget", - "PolicyEngineUSDBTargetProvider", - "PolicyEngineUSEntityTableBundle", - "PolicyEngineUSMicrosimulationAdapter", - "PolicyEngineUSQuantityTarget", - "PolicyEngineUSSimulationTargetCompiler", - "PolicyEngineUSTargetComparisonReport", - "PolicyEngineUSTargetEvaluation", - "PolicyEngineUSTargetEvaluationReport", - 
"PolicyEngineUSVariableBinding", - "build_policyengine_us_export_column_names", - "build_policyengine_us_time_period_arrays", - "compare_policyengine_us_target_query_to_baseline", - "compile_policyengine_us_household_linear_constraints", - "compute_policyengine_us_definition_hash", - "detect_policyengine_pseudo_inputs", - "evaluate_policyengine_us_target_query", - "evaluate_policyengine_us_target_set", - "filter_supported_policyengine_us_targets", - "infer_policyengine_us_variable_bindings", - "load_policyengine_us_entity_tables", - "materialize_policyengine_us_variables", - "policyengine_us_formula_variables_for_targets", - "policyengine_us_variables_to_materialize", - "project_frame_to_time_period_arrays", - "write_policyengine_us_time_period_dataset", -) -_SOURCE_REGISTRY_EXPORTS = ( - "DEFAULT_SOURCE_VARIABLE_POLICIES", - "PUF_SOURCE_VARIABLE_POLICY", - "SourceVariablePolicy", - "SourceVariablePolicySpec", - "resolve_source_variable_capabilities", -) -from microplex_us.target_registry import ( - TargetCategory, - TargetGroup, - TargetLevel, - TargetRegistry, - get_registry, - print_registry_summary, -) - -_TARGETS_EXPORTS = ( - "POLICYENGINE_US_COUNT_ENTITIES", - "policyengine_db_target_to_canonical_spec", - "policyengine_db_targets_to_canonical_set", -) -from microplex_us.unified_calibration import ( - CalibrationTarget, - UnifiedCalibrator, - calibrate_to_pe_targets, -) - -try: - from microplex_us.validation import ( - AGI_BRACKETS, - FILING_STATUSES, - BaselineComparison, - MetricComparison, - SOITargets, - ValidationResult, - compute_baseline_comparison, - compute_validation_metrics, - export_comparison_json, - get_soi_years, - load_soi_targets, - validate_against_soi, - ) -except ImportError: - AGI_BRACKETS = None - FILING_STATUSES = None - BaselineComparison = None - MetricComparison = None - SOITargets = None - ValidationResult = None - compute_baseline_comparison = None - compute_validation_metrics = None - export_comparison_json = None - get_soi_years 
= None - load_soi_targets = None - validate_against_soi = None - -_LAZY_EXPORT_MODULES: dict[str, str] = { - **dict.fromkeys(_PIPELINE_EXPORTS, "microplex_us.pipelines"), - **dict.fromkeys(_POLICYENGINE_EXPORTS, "microplex_us.policyengine"), - **dict.fromkeys(_SOURCE_REGISTRY_EXPORTS, "microplex_us.source_registry"), - **dict.fromkeys(_TARGETS_EXPORTS, "microplex_us.targets"), -} - - -def __getattr__(name: str) -> Any: - """Resolve optional heavyweight convenience exports on first access.""" - module_name = _LAZY_EXPORT_MODULES.get(name) - if module_name is None: - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - value = getattr(import_module(module_name), name) - globals()[name] = value - return value - - -__all__ = [ - "CalibrationHarness", - "CalibrationResult", - "run_pe_parity_suite", - "PETargets", - "create_calibration_targets", - "get_pe_targets", - "USMicroplexArtifactPaths", - "FrontierMetric", - "PEUSDataRebuildCheckpointEvidenceResult", - "PEUSDataRebuildCheckpointResult", - "PEUSDataRebuildProgram", - "PEUSDataRebuildStage", - "PEUSDataRebuildStatus", - "USMicroplexRunRegistryEntry", - "USMicroplexBuildConfig", - "USMicroplexBuildResult", - "USMicroplexPipeline", - "attach_policyengine_us_data_rebuild_checkpoint_evidence", - "build_policyengine_us_data_rebuild_pipeline", - "build_policyengine_us_data_rebuild_native_audit", - "default_policyengine_us_data_rebuild_checkpoint_config", - "default_policyengine_us_data_rebuild_config", - "default_policyengine_us_data_rebuild_queries", - "default_policyengine_us_data_rebuild_source_providers", - "append_us_microplex_run_index_entry", - "append_us_microplex_run_registry_entry", - "compare_us_microplex_target_delta_rows", - "USMicroplexTargets", - "USMicroplexVersionedBuildArtifacts", - "USMicroplexExperimentReport", - "USMicroplexExperimentResult", - "USMicroplexSourceExperimentSpec", - "USMicroplexPerformanceHarnessConfig", - "USMicroplexPerformanceHarnessRequest", - 
"USMicroplexPerformanceHarnessResult", - "USMicroplexPerformanceSession", - "USMicroplexReducedCalibrationReport", - "USMicroplexReducedMultiCalibrationReport", - "USMicroplexReducedBenchmarkHarnessConfig", - "USMicroplexReducedBenchmarkHarnessResult", - "USMicroplexReducedBenchmarkReport", - "USMicroplexReducedBenchmarkSpec", - "USMicroplexReducedDimensionSpec", - "USMicroplexReducedMeasureSpec", - "USHouseholdTargetReweightingResult", - "DEFAULT_ATOMIC_AGE_BINS", - "DEFAULT_ATOMIC_AGE_LABELS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS", - "default_us_atomic_rung0_benchmarks", - "default_us_atomic_rung1_benchmarks", - "default_us_atomic_rung2_calibration", - "default_us_atomic_rung3_calibration", - "default_us_atomic_rung4_calibration", - "default_us_atomic_rung5_calibration", - "default_policyengine_us_data_rebuild_program", - "default_us_source_mix_experiments", - "build_and_save_versioned_us_microplex", - "build_and_save_versioned_us_microplex_from_data_dir", - "build_and_save_versioned_us_microplex_from_source_provider", - "build_and_save_versioned_us_microplex_from_source_providers", - "build_policyengine_us_data_rebuild_markdown", - "build_us_microplex", - "build_us_microplex_run_registry_entry", - "load_us_microplex_run_registry", - "list_us_microplex_target_delta_rows", - "rebuild_us_microplex_run_index", - "calibrate_and_evaluate_us_reduced_benchmark_specs", - "calibrate_and_evaluate_us_reduced_benchmarks", - "reweight_us_household_targets", - "resolve_us_microplex_frontier_artifact_dir", - "resolve_us_microplex_run_index_path", - "evaluate_us_reduced_benchmark", - "reduced_benchmark_to_calibration_targets", - "reduced_benchmark_specs_to_calibration_targets", - "run_policyengine_us_data_rebuild_checkpoint", - "write_policyengine_us_data_rebuild_native_audit", - "run_us_microplex_performance_harness", - "run_us_microplex_reduced_benchmark_harness", - "run_us_microplex_source_experiments", - 
"save_us_microplex_artifacts", - "save_versioned_us_microplex_build_result", - "build_us_microplex_site_snapshot", - "select_us_microplex_frontier_index_row", - "save_versioned_us_microplex_artifacts", - "select_us_microplex_frontier_entry", - "write_us_microplex_site_snapshot", - "warm_us_microplex_parity_cache", - "PolicyEngineUSConstraint", - "PolicyEngineUSComparisonCache", - "PolicyEngineUSDBTarget", - "PolicyEngineUSDBTargetProvider", - "PolicyEngineUSEntityTableBundle", - "PolicyEngineUSMicrosimulationAdapter", - "PolicyEngineUSQuantityTarget", - "PolicyEngineUSSimulationTargetCompiler", - "PolicyEngineUSTargetComparisonReport", - "PolicyEngineUSTargetEvaluation", - "PolicyEngineUSTargetEvaluationReport", - "PolicyEngineUSVariableBinding", - "build_policyengine_us_export_column_names", - "build_policyengine_us_time_period_arrays", - "compare_policyengine_us_target_query_to_baseline", - "compile_policyengine_us_household_linear_constraints", - "compute_policyengine_us_definition_hash", - "detect_policyengine_pseudo_inputs", - "evaluate_policyengine_us_target_query", - "evaluate_policyengine_us_target_set", - "filter_supported_policyengine_us_targets", - "infer_policyengine_us_variable_bindings", - "load_policyengine_us_entity_tables", - "materialize_policyengine_us_variables", - "policyengine_us_formula_variables_for_targets", - "policyengine_us_variables_to_materialize", - "project_frame_to_time_period_arrays", - "write_policyengine_us_time_period_dataset", - "SourceVariablePolicy", - "SourceVariablePolicySpec", - "PUF_SOURCE_VARIABLE_POLICY", - "DEFAULT_SOURCE_VARIABLE_POLICIES", - "resolve_source_variable_capabilities", - "CalibrationTarget", - "UnifiedCalibrator", - "calibrate_to_pe_targets", - "CPSSummaryStats", - "CPSSyntheticGenerator", - "validate_synthetic", - "load_cps_asec", - "load_cps_for_synthesis", - "create_sample_data", - "get_data_info", - "BlockGeography", - "load_block_probabilities", - "derive_geographies", - "normalize_us_state_fips", - 
"prepare_cps_for_hierarchical", - "STATE_LEN", - "COUNTY_LEN", - "TRACT_LEN", - "BLOCK_LEN", - "TargetRegistry", - "TargetSpec", - "TargetSet", - "TargetCategory", - "TargetLevel", - "TargetGroup", - "get_registry", - "print_registry_summary", - "POLICYENGINE_US_COUNT_ENTITIES", - "policyengine_db_target_to_canonical_spec", - "policyengine_db_targets_to_canonical_set", - "AGI_BRACKETS", - "FILING_STATUSES", - "SOITargets", - "get_soi_years", - "load_soi_targets", - "compute_validation_metrics", - "ValidationResult", - "validate_against_soi", - "MetricComparison", - "BaselineComparison", - "compute_baseline_comparison", - "export_comparison_json", -] diff --git a/src/microplex_us/asset_reconciliation.py b/src/microplex_us/asset_reconciliation.py deleted file mode 100644 index 3fdc23dd..00000000 --- a/src/microplex_us/asset_reconciliation.py +++ /dev/null @@ -1,228 +0,0 @@ -"""SCF net-worth component reconciliation. - -This module ports the self-contained numpy net-worth reconciliation logic -from policyengine-us-data's enhanced-CPS (eCPS) asset imputation so that the -microplex-us SCF donor block can reconcile its 19 imputed balance-sheet -component leaves to a direct SCF ``net_worth`` anchor, exactly as eCPS does. - -Mirrored verbatim (signs, fallbacks, scaling) from -``policyengine_us_data/utils/asset_imputation.py`` on -``PolicyEngine/policyengine-us-data`` ``upstream/main``: - - * ``SCF_NET_WORTH_COMPONENT_TARGETS`` -> L40-66 - * ``SCF_NET_WORTH_COMPONENT_VARIABLES`` -> L57 - * ``SCF_OTHER_ASSET_COMPONENT`` / ``..._DEBT_COMPONENT`` -> L58-59 - * ``NET_WORTH_COMPONENT_SIGNS`` -> L77-86 - * ``compute_net_worth_from_components`` -> L373-398 (def) - * ``rebalance_scf_net_worth_components`` -> L401-490 (def) - -Storage convention (matches eCPS): every component leaf is stored as a -**positive magnitude** -- including the debt leaves. 
The ``-1`` debt sign in -``NET_WORTH_COMPONENT_SIGNS`` is applied only when reconstructing net worth -(``compute_net_worth_from_components``) and when rebalancing -(``rebalance_scf_net_worth_components``), never to the stored leaf values. The -SCF raw debt columns (mrthel, ccbal, edn_inst, ...) are themselves stored as -non-negative balances, so no sign flip happens at load time either. -""" - -from __future__ import annotations - -from collections.abc import Mapping, Sequence - -import numpy as np - -# Leaf -> raw SCF summary-extract source column(s). Single source column per -# leaf; mirrors SCF_NET_WORTH_COMPONENT_TARGETS in eCPS asset_imputation.py -# L40-66. Kept here as documentation / a single source of truth for tests. -SCF_NET_WORTH_COMPONENT_TARGETS: dict[str, tuple[str, ...]] = { - "scf_certificates_of_deposit": ("cds",), - "scf_savings_bonds": ("savbnd",), - "scf_retirement_assets": ("retqliq",), - "scf_cash_value_life_insurance": ("cashli",), - "scf_other_managed_assets": ("othma",), - "scf_other_financial_assets": ("othfin",), - "scf_primary_residence_value": ("houses",), - "scf_other_residential_real_estate": ("oresre",), - "scf_nonresidential_real_estate_equity": ("nnresre",), - "scf_business_equity": ("bus",), - "scf_other_nonfinancial_assets": ("othnfin",), - "scf_mortgage_debt": ("mrthel",), - "scf_other_residential_debt": ("resdbt",), - "scf_other_lines_of_credit": ("othloc",), - "scf_credit_card_debt": ("ccbal",), - "scf_vehicle_installment_debt": ("veh_inst",), - "scf_student_loan_debt": ("edn_inst",), - "scf_other_installment_debt": ("oth_inst",), - "scf_other_debt": ("odebt",), -} -SCF_NET_WORTH_COMPONENT_VARIABLES: tuple[str, ...] = tuple( - SCF_NET_WORTH_COMPONENT_TARGETS -) - -# Asset / debt fallback sinks used when an entire side of the balance sheet is -# unexpectedly empty for a household. Mirrors eCPS L58-59. 
-SCF_OTHER_ASSET_COMPONENT = "scf_other_financial_assets" -SCF_OTHER_DEBT_COMPONENT = "scf_other_debt" - -# Signed component map. Debt leaves enter net worth with -1; every asset leaf is -# +1 by default. Mirrors eCPS NET_WORTH_COMPONENT_SIGNS L77-86. ``auto_loan_balance`` -# is included because it is a SIPP/SCF-blended debt leaf that also subtracts. -NET_WORTH_COMPONENT_SIGNS: dict[str, float] = { - "auto_loan_balance": -1.0, - "scf_mortgage_debt": -1.0, - "scf_other_residential_debt": -1.0, - "scf_other_lines_of_credit": -1.0, - "scf_credit_card_debt": -1.0, - "scf_vehicle_installment_debt": -1.0, - "scf_student_loan_debt": -1.0, - "scf_other_installment_debt": -1.0, - "scf_other_debt": -1.0, -} - -# SIPP/SCF-blended policy leaves that the rebalance must preserve (it only -# rescales the SCF-only components). Mirrors eCPS protected_variables default -# (SIPP_LIQUID_ASSET_VARIABLES + SIPP_VEHICLE_ASSET_VARIABLES). -PROTECTED_BLENDED_COMPONENT_VARIABLES: tuple[str, ...] = ( - "bank_account_assets", - "stock_assets", - "bond_assets", - "household_vehicles_value", -) - - -def compute_net_worth_from_components( - *, - components: Mapping[str, Sequence[float]], - component_signs: Mapping[str, float] = NET_WORTH_COMPONENT_SIGNS, -) -> np.ndarray: - """Compute household net worth from signed balance-sheet components. - - Verbatim port of eCPS ``compute_net_worth_from_components`` - (asset_imputation.py L373-398). - """ - iterator = iter(components.items()) - try: - first_variable, first_values = next(iterator) - except StopIteration: - return np.array([], dtype=np.float32) - - first_values = np.asarray(first_values, dtype=np.float32) - component_total = (component_signs.get(first_variable, 1.0) * first_values).astype( - np.float32 - ) - - for variable, values in iterator: - values = np.asarray(values, dtype=np.float32) - if values.shape != component_total.shape: - raise ValueError( - f"{variable} has shape {values.shape}, but expected " - f"{component_total.shape}." 
- ) - component_total += component_signs.get(variable, 1.0) * values - - return component_total.astype(np.float32) - - -def rebalance_scf_net_worth_components( - *, - components: Mapping[str, Sequence[float]], - target_net_worth: Sequence[float], - adjustable_variables: Sequence[str] = SCF_NET_WORTH_COMPONENT_VARIABLES, - protected_variables: Sequence[str] = PROTECTED_BLENDED_COMPONENT_VARIABLES, - component_signs: Mapping[str, float] = NET_WORTH_COMPONENT_SIGNS, -) -> dict[str, np.ndarray]: - """Rebalance SCF-only leaves so the component formula matches net worth. - - Component QRFs are fit sequentially but still predict each leaf separately, - so their sum can drift from the direct SCF net worth distribution. Preserve - the final SIPP/SCF-blended policy leaves and proportionally scale SCF-only - same-sign leaves to the direct SCF net worth anchor. - - Verbatim port of eCPS ``rebalance_scf_net_worth_components`` - (asset_imputation.py L401-490). - """ - adjusted = { - variable: np.asarray(values, dtype=np.float32).copy() - for variable, values in components.items() - } - if not adjusted: - return adjusted - - target_net_worth = np.asarray(target_net_worth, dtype=np.float32) - first_shape = target_net_worth.shape - for variable, values in adjusted.items(): - if values.shape != first_shape: - raise ValueError( - f"{variable} has shape {values.shape}, but target_net_worth " - f"has shape {first_shape}." 
- ) - - protected_variables = set(protected_variables) - adjustable_variables = tuple( - variable - for variable in adjustable_variables - if variable in adjusted and variable not in protected_variables - ) - if not adjustable_variables: - return adjusted - - fixed_total = np.zeros_like(target_net_worth, dtype=np.float32) - for variable, values in adjusted.items(): - if variable not in adjustable_variables: - fixed_total += component_signs.get(variable, 1.0) * values - - asset_variables = [ - variable - for variable in adjustable_variables - if component_signs.get(variable, 1.0) >= 0 - ] - debt_variables = [ - variable - for variable in adjustable_variables - if component_signs.get(variable, 1.0) < 0 - ] - - asset_total = np.zeros_like(target_net_worth, dtype=np.float32) - for variable in asset_variables: - asset_total += adjusted[variable] - - debt_total = np.zeros_like(target_net_worth, dtype=np.float32) - for variable in debt_variables: - debt_total += adjusted[variable] - - desired_adjustable_total = target_net_worth - fixed_total - positive_target = desired_adjustable_total >= 0 - - required_assets = np.maximum(desired_adjustable_total + debt_total, 0) - asset_scale = np.divide( - required_assets, - asset_total, - out=np.ones_like(required_assets, dtype=np.float32), - where=(asset_total > 0) & positive_target, - ) - for variable in asset_variables: - adjusted[variable][positive_target] *= asset_scale[positive_target] - - needs_asset_fallback = positive_target & (asset_total <= 0) & (required_assets > 0) - if needs_asset_fallback.any() and SCF_OTHER_ASSET_COMPONENT in adjusted: - adjusted[SCF_OTHER_ASSET_COMPONENT][needs_asset_fallback] = required_assets[ - needs_asset_fallback - ] - - required_debts = np.maximum(asset_total - desired_adjustable_total, 0) - debt_scale = np.divide( - required_debts, - debt_total, - out=np.ones_like(required_debts, dtype=np.float32), - where=(debt_total > 0) & ~positive_target, - ) - for variable in debt_variables: - 
adjusted[variable][~positive_target] *= debt_scale[~positive_target] - - needs_debt_fallback = (~positive_target) & (debt_total <= 0) & (required_debts > 0) - if needs_debt_fallback.any() and SCF_OTHER_DEBT_COMPONENT in adjusted: - adjusted[SCF_OTHER_DEBT_COMPONENT][needs_debt_fallback] = required_debts[ - needs_debt_fallback - ] - - return adjusted diff --git a/src/microplex_us/bakeoff/__init__.py b/src/microplex_us/bakeoff/__init__.py deleted file mode 100644 index c1b1db90..00000000 --- a/src/microplex_us/bakeoff/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Scale-up benchmark harness for synthesizer comparison. - -Implements the stage-1/2/3 scale-up protocol from -`docs/synthesizer-benchmark-scale-up.md`: load real enhanced_cps_2024, -sub-sample to the stage's row count, fit each specified synthesizer on the -conditioning + target column set, and report PRDC coverage, training wall -time, peak RSS, and rare-cell preservation. - -Use from the CLI: - - uv run python -m microplex_us.bakeoff.scale_up \\ - --stage stage1 \\ - --methods ZI-QRF ZI-MAF ZI-QDNN \\ - --output artifacts/scale_up_stage1.json - -or programmatically: - - from microplex_us.bakeoff import ScaleUpRunner, stage1_config - runner = ScaleUpRunner(stage1_config()) - results = runner.run() -""" - -from microplex_us.bakeoff.scale_up import ( - ScaleUpResult, - ScaleUpRunner, - ScaleUpStageConfig, - DEFAULT_CONDITION_COLS, - DEFAULT_TARGET_COLS, - stage1_config, - stage2_config, - stage3_config, -) - -__all__ = [ - "ScaleUpResult", - "ScaleUpRunner", - "ScaleUpStageConfig", - "DEFAULT_CONDITION_COLS", - "DEFAULT_TARGET_COLS", - "stage1_config", - "stage2_config", - "stage3_config", -] diff --git a/src/microplex_us/bakeoff/__main__.py b/src/microplex_us/bakeoff/__main__.py deleted file mode 100644 index de59867a..00000000 --- a/src/microplex_us/bakeoff/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Entry point for `python -m microplex_us.bakeoff`.""" - -from microplex_us.bakeoff.scale_up import main - 
-if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/bakeoff/local_methods.py b/src/microplex_us/bakeoff/local_methods.py deleted file mode 100644 index 0b488aca..00000000 --- a/src/microplex_us/bakeoff/local_methods.py +++ /dev/null @@ -1,293 +0,0 @@ -"""Local synthesizer methods for the bakeoff harness. - -These extend the `microplex.eval.benchmark` set without modifying the -upstream library. Methods defined here follow the same `_MultiSourceBase` -protocol so they slot into `ScaleUpRunner.fit_and_generate` unchanged. - -Current contents: - -- `CARTMethod`: synthpop-style CART per-column imputation. Each target - column gets a decision tree fit on the shared conditioning variables; - at generation time, the tree routes each synthetic record to a leaf, - and the predicted value is drawn uniformly from the training-set - values that landed in that leaf. This matches the default draw in - `synthpop`'s `syn.cart` (Nowok, Raab, and Dibben, 2016). - -- `ZICARTMethod`: zero-inflated variant that uses a random-forest - classifier for P(y > 0 | x) on columns where the training-set zero - fraction exceeds 10 %, then applies `CARTMethod` on the non-zero - subset. Mirrors `ZIQRFMethod`'s structure. -""" - -from __future__ import annotations - -from typing import Any - -import numpy as np -from microplex.eval.benchmark import _MultiSourceBase -from sklearn.tree import DecisionTreeRegressor - - -class CARTMethod(_MultiSourceBase): - """Synthpop-style CART per-column synthesis. - - Each column gets a `DecisionTreeRegressor` fit on the shared - conditioning variables. At generation time, each record is routed - to a leaf via `tree.apply`, and the synthetic value is sampled - uniformly from the training-set outcomes that landed in that leaf. - This reproduces `synthpop`'s default CART draw. 
- """ - - name = "CART" - - def __init__( - self, - max_depth: int | None = None, - min_samples_leaf: int = 5, - random_state: int = 42, - **kwargs: Any, - ) -> None: - super().__init__(zero_inflated=False) - self.max_depth = max_depth - self.min_samples_leaf = min_samples_leaf - self.random_state = random_state - - def _fit_column(self, col: str, X: np.ndarray, y: np.ndarray) -> None: - tree = DecisionTreeRegressor( - max_depth=self.max_depth, - min_samples_leaf=self.min_samples_leaf, - random_state=self.random_state, - ) - tree.fit(X, y) - leaf_ids = tree.apply(X) - leaf_to_values: dict[int, np.ndarray] = {} - for lid, val in zip(leaf_ids.tolist(), y.tolist(), strict=False): - leaf_to_values.setdefault(lid, []).append(val) - for lid, vals in leaf_to_values.items(): - leaf_to_values[lid] = np.asarray(vals, dtype=float) - self._col_models[col] = { - "tree": tree, - "leaf_to_values": leaf_to_values, - "fallback_value": float(np.median(y)) if len(y) > 0 else 0.0, - } - - def _generate_column( - self, - col: str, - X: np.ndarray, - rng: np.random.RandomState, - ) -> np.ndarray: - model = self._col_models[col] - tree = model["tree"] - leaf_to_values = model["leaf_to_values"] - fallback = model["fallback_value"] - leaf_ids = tree.apply(X) - out = np.empty(len(X), dtype=float) - for i, lid in enumerate(leaf_ids.tolist()): - vals = leaf_to_values.get(lid) - if vals is None or len(vals) == 0: - out[i] = fallback - else: - out[i] = float(vals[rng.randint(len(vals))]) - return out - - -class ZICARTMethod(CARTMethod): - """Zero-Inflated CART: random-forest zero classifier + CART leaf draw.""" - - name = "ZI-CART" - - def __init__(self, **kwargs: Any) -> None: - super().__init__(**kwargs) - self.zero_inflated = True - - -# --- Alternative zero-inflation classifiers (QDNN family) ---------------- - -def _patch_zi_classifier(method_instance: Any, classifier_factory: Any) -> None: - """Monkey-patch a ZI method's fit so the zero-classifier is a custom one. 
- - The upstream `_MultiSourceBase.fit` hardcodes - `RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)`. - This helper re-wraps `fit` so the zero-classifier is built by - `classifier_factory()` instead. All other fit/generate behavior is - preserved. - """ - import numpy as np - import pandas as pd - - original_fit = method_instance.fit.__func__ - - def patched_fit(self, sources, shared_cols): - self.shared_cols_ = list(shared_cols) - all_cols = set(shared_cols) - for survey_name, df in sources.items(): - for col in df.columns: - if col not in all_cols: - all_cols.add(col) - self.col_to_survey_[col] = survey_name - self.all_cols_ = list(all_cols) - - shared_dfs = [] - for survey_name, df in sources.items(): - available = [c for c in shared_cols if c in df.columns] - if len(available) == len(shared_cols): - shared_dfs.append(df[shared_cols].copy()) - self.shared_data_ = ( - pd.concat(shared_dfs, ignore_index=True) - if shared_dfs - else list(sources.values())[0][shared_cols].copy() - ) - - for col in self.all_cols_: - if col in shared_cols: - continue - survey_name = self.col_to_survey_[col] - survey_df = sources[survey_name] - available_shared = [c for c in shared_cols if c in survey_df.columns] - X = survey_df[available_shared].values - y = survey_df[col].values - - min_val = float(np.nanmin(y)) - at_min = np.isclose(y, min_val, atol=1e-6) - zero_frac = at_min.sum() / len(y) - self._col_stats[col] = {"min": min_val, "zero_frac": zero_frac} - - if ( - self.zero_inflated - and zero_frac >= self.zero_threshold - and at_min.sum() >= 10 - ): - labels = (~at_min).astype(int) - unique_labels = np.unique(labels) - if len(unique_labels) < 2: - # Degenerate column — all zeros or all non-zeros in - # training. Fall back to a constant classifier to avoid - # sklearn's single-class error. 
- constant_prob = float(unique_labels[0]) - - class _Constant: - classes_ = np.array([0, 1]) - - def predict_proba(self, X): - n = len(X) - return np.column_stack( - [np.full(n, 1.0 - constant_prob), - np.full(n, constant_prob)] - ) - - self._zero_classifiers[col] = _Constant() - else: - clf = classifier_factory() - clf.fit(X, labels) - self._zero_classifiers[col] = clf - if (~at_min).sum() >= 10: - self._fit_column(col, X[~at_min], y[~at_min]) - else: - self._fit_column(col, X, y) - return self - - method_instance.fit = patched_fit.__get__(method_instance, type(method_instance)) - - -def _make_zi_variant(base_name: str, classifier_factory: Any): - """Create a method class that uses a custom zero-classifier.""" - from microplex.eval.benchmark import ZIQDNNMethod - - base_classes = {"ZI-QDNN": ZIQDNNMethod} - if base_name not in base_classes: - raise ValueError(f"Unsupported base method for ZI variant: {base_name}") - base_cls = base_classes[base_name] - - class _Variant(base_cls): # type: ignore[misc, valid-type] - def __init__(self, **kwargs: Any) -> None: - super().__init__(**kwargs) - _patch_zi_classifier(self, classifier_factory) - - return _Variant - - -def _rf_calibrated_factory(): - from sklearn.calibration import CalibratedClassifierCV - from sklearn.ensemble import RandomForestClassifier - - rf = RandomForestClassifier( - n_estimators=50, random_state=42, n_jobs=-1 - ) - return CalibratedClassifierCV(rf, method="isotonic", cv=3) - - -def _logistic_factory(): - from sklearn.linear_model import LogisticRegression - - return LogisticRegression(max_iter=500, n_jobs=-1) - - -def _hgb_factory(): - from sklearn.ensemble import HistGradientBoostingClassifier - - return HistGradientBoostingClassifier(random_state=42) - - -def _dnn_factory(): - """A small-MLP zero-classifier for parity with the ZI-QDNN draw network. - - Uses sklearn's MLPClassifier (hidden: 64, 32; ReLU; Adam; max_iter=100). - Probabilities are via softmax on the output head. 
Not pre-calibrated; - combine with isotonic wrapping if calibration matters. - """ - from sklearn.neural_network import MLPClassifier - from sklearn.pipeline import Pipeline - from sklearn.preprocessing import StandardScaler - - return Pipeline([ - ("scaler", StandardScaler()), - ( - "mlp", - MLPClassifier( - hidden_layer_sizes=(64, 32), - activation="relu", - solver="adam", - max_iter=100, - random_state=42, - early_stopping=True, - ), - ), - ]) - - -def zi_qdnn_variant_factory(variant: str): - """Return a ZIQDNNMethod subclass with a swapped zero-classifier.""" - if variant == "logistic": - return _make_zi_variant("ZI-QDNN", _logistic_factory) - if variant == "hgb": - return _make_zi_variant("ZI-QDNN", _hgb_factory) - if variant == "calibrated": - return _make_zi_variant("ZI-QDNN", _rf_calibrated_factory) - if variant == "dnn": - return _make_zi_variant("ZI-QDNN", _dnn_factory) - raise ValueError(f"Unknown ZI variant: {variant}") - - -# Concrete ZI-QDNN variant with a histogram gradient boosting zero-classifier. -# This is the `microplex-us` default for ZI-QDNN: on the 77k x 50 Enhanced CPS -# isolated per-column log-loss evaluation (26 ZI-eligible columns, seed 42), -# HistGB Pareto-dominates the upstream RF default on log-loss (0.225 vs 0.310), -# Brier (0.071 vs 0.081), ECE (0.005 vs 0.039), and ROC-AUC (0.809 vs 0.737). -# See `docs/zi-factorial.md` for the full comparison. -# -# PRDC coverage on the same config is insensitive to the swap (0.7017 vs -# 0.7081); the downstream QDNN draw swamps the classifier-level gap. The -# default is chosen on intrinsic classifier quality, not on measured -# synthesis gains. The upstream RF-backed ZIQDNNMethod is still registered -# under "ZI-QDNN-RF" in `scale_up.py` for regression testing. 
-ZIQDNNHistGBMethod = _make_zi_variant("ZI-QDNN", _hgb_factory) -ZIQDNNHistGBMethod.name = "ZI-QDNN" - - -__all__ = [ - "CARTMethod", - "ZICARTMethod", - "ZIQDNNHistGBMethod", - "zi_qdnn_variant_factory", -] diff --git a/src/microplex_us/bakeoff/scale_up.py b/src/microplex_us/bakeoff/scale_up.py deleted file mode 100644 index e3539116..00000000 --- a/src/microplex_us/bakeoff/scale_up.py +++ /dev/null @@ -1,857 +0,0 @@ -"""Synthesizer scale-up benchmark harness. - -Stages per `docs/synthesizer-benchmark-scale-up.md`: - -- stage1: 100,000 rows x 50 columns of real enhanced_cps_2024 data -- stage2: 1,000,000 rows x 50 columns (via row replication or a larger source) -- stage3: 3,373,378 rows x 155 columns (v6 seed-ready shape — requires - regenerating the seed from donor integration; out of scope for this harness) - -The harness is deliberately narrow: - -- Single data source (enhanced_cps_2024). -- Fixed pool of synthesizer methods via `microplex.eval.benchmark.*Method`. -- PRDC coverage + wall time + peak RSS + rare-cell preservation. -- One result row per (method, stage, seed). - -Wider comparisons (CTGAN, TVAE, external tabular models) are left to -follow-up harnesses. Multi-source fusion is NOT exercised here — the v6 -pipeline's multi-source donor integration happens upstream of this eval. 
-""" - -from __future__ import annotations - -import argparse -import json -import logging -import resource -import time -from dataclasses import asdict, dataclass, field -from pathlib import Path -from typing import Any - -import h5py -import numpy as np -import pandas as pd - -try: - from prdc import compute_prdc # noqa: F401 (probed at run time) -except ImportError: # pragma: no cover - optional dep - compute_prdc = None - -LOGGER = logging.getLogger(__name__) - -DEFAULT_ENHANCED_CPS_PATH = ( - Path.home() - / "PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5" -) - - -# Curated default conditioning variables — demographics + household structure. -# Chosen to be numeric, low-cardinality, and genuinely shared across typical -# microsimulation use cases. Kept to 14 to leave room for 36 target variables -# under a 50-column stage-1 cap. -DEFAULT_CONDITION_COLS: tuple[str, ...] = ( - "age", - "is_female", - "is_hispanic", - "cps_race", - "is_disabled", - "is_blind", - "is_military", - "is_full_time_college_student", - "is_separated", - "state_fips", # broadcast from household - "has_esi", - "has_marketplace_health_coverage", - "own_children_in_household", - "pre_tax_contributions", -) - - -# Curated default target variables — income components, wealth, benefits. -# Chosen to span zero-inflated (most benefits, capital gains), continuous -# heavy-tailed (employment income, interest), and derived (net_worth). -DEFAULT_TARGET_COLS: tuple[str, ...] 
= ( - # Labor income (2) - "employment_income_last_year", - "self_employment_income_last_year", - # Interest + dividends (4) - "taxable_interest_income", - "tax_exempt_interest_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - # Capital gains (2) - "long_term_capital_gains", - "short_term_capital_gains", - # Retirement income (4) - "taxable_pension_income", - "tax_exempt_pension_income", - "taxable_ira_distributions", - "social_security", - # Social Security split (3) - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - # Other income (5) - "rental_income", - "farm_income", - "unemployment_compensation", - "alimony_income", - "miscellaneous_income", - # Wealth (5) - "bank_account_assets", - "bond_assets", - "stock_assets", - "net_worth", - "auto_loan_balance", - # Benefits / transfers (11) - "snap_reported", - "housing_assistance", - "ssi_reported", - "tanf_reported", - "disability_benefits", - "workers_compensation", - "veterans_benefits", - "child_support_received", - "child_support_expense", - "real_estate_taxes", - "health_savings_account_ald", -) - - -@dataclass(frozen=True) -class ScaleUpStageConfig: - """One stage of the synthesizer scale-up protocol.""" - - stage: str - n_rows: int | None # None means "use all available" - methods: tuple[str, ...] - condition_cols: tuple[str, ...] = DEFAULT_CONDITION_COLS - target_cols: tuple[str, ...] = DEFAULT_TARGET_COLS - holdout_frac: float = 0.2 - seed: int = 42 - k: int = 5 # PRDC nearest-neighbor k - n_generate: int | None = None # None => match training-set size - prdc_max_samples: int = 20_000 - method_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Per-method hyperparameter overrides. - - Keys are the method registry names (`"ZI-QRF"`, `"ZI-MAF"`, - `"ZI-QDNN"`, ...); values are dicts of kwargs forwarded to the - method's constructor. Empty dict means "use method class defaults". 
- - Example: - method_kwargs={"ZI-MAF": {"n_layers": 8, "hidden_dim": 128, "epochs": 200}} - """ - """Cap on real and synth sample sizes fed to PRDC. - - The `prdc` library materializes full pairwise distance matrices - (O(n_real * n_synth * n_features)). With n_real = 15k and n_synth = - 61k and 50 features, that's ~7 GB per matrix — enough to OOM-kill - the process on a 48 GB workstation once multiple copies exist. The - metric is stable well below this scale: PRDC coverage on 15k real - vs 15k synthetic is essentially the same as 15k real vs 61k - synthetic. Cap keeps the evaluation tractable and consistent across - stages. - """ - data_path: Path = field(default=DEFAULT_ENHANCED_CPS_PATH) - year: str = "2024" - rare_cell_checks: tuple[dict[str, Any], ...] = field( - default_factory=lambda: ( - { - "name": "elderly_self_employed", - "mask": lambda df: (df["age"] >= 62) - & (df["self_employment_income_last_year"] > 0), - }, - { - "name": "young_dividend", - "mask": lambda df: (df["age"] < 30) - & (df["qualified_dividend_income"] > 0), - }, - { - "name": "disabled_ssdi", - "mask": lambda df: (df["is_disabled"] == 1) - & (df["social_security_disability"] > 0), - }, - { - "name": "top_1pct_employment", - "mask": lambda df: df["employment_income_last_year"] - >= df["employment_income_last_year"].quantile(0.99), - }, - ) - ) - - @property - def all_cols(self) -> list[str]: - # preserve order: conditioning first, then targets - seen: set[str] = set() - out: list[str] = [] - for c in list(self.condition_cols) + list(self.target_cols): - if c not in seen: - seen.add(c) - out.append(c) - return out - - -@dataclass -class ScaleUpResult: - """One (method, stage) outcome.""" - - stage: str - method: str - seed: int - n_train_rows: int - n_holdout_rows: int - n_cols: int - fit_wall_seconds: float - generate_wall_seconds: float - peak_rss_gb_during_fit: float - precision: float - density: float - coverage: float - rare_cell_ratios: dict[str, float] - zero_rate_mae: float - 
zero_rate_per_column: dict[str, dict[str, float]] = field(default_factory=dict) - notes: str = "" - - def to_dict(self) -> dict[str, Any]: - return asdict(self) - - -def stage1_config(methods: tuple[str, ...] = ("ZI-QRF", "ZI-MAF", "ZI-QDNN")) -> ScaleUpStageConfig: - """Stage 1: ~100k rows x 50 cols on real enhanced_cps_2024. - - enhanced_cps_2024 has 77,006 rows — use all of them. The nominal - 100k-row target from the protocol doc isn't achievable with only this - source; use the full dataset and note the actual row count in the - result record. - """ - return ScaleUpStageConfig(stage="stage1", n_rows=None, methods=methods) - - -def stage2_config(methods: tuple[str, ...] = ("ZI-QRF", "ZI-MAF", "ZI-QDNN")) -> ScaleUpStageConfig: - """Stage 2: 1M rows x 50 cols. - - Requires a larger source than enhanced_cps_2024 (77k rows). Intended - future use once the v6 seed-like 3.4M-row frame is retrievable. - Running stage 2 against enhanced_cps_2024 replicates rows, which is - not the same thing — not recommended. - """ - return ScaleUpStageConfig(stage="stage2", n_rows=1_000_000, methods=methods) - - -def stage3_config(methods: tuple[str, ...] = ("ZI-QRF", "ZI-MAF", "ZI-QDNN")) -> ScaleUpStageConfig: - """Stage 3: full 3.4M-row x 155-col v6 seed-ready shape.""" - return ScaleUpStageConfig(stage="stage3", n_rows=3_373_378, methods=methods) - - -_ENTITY_LINK_COLUMNS: tuple[tuple[str, str, str], ...] = ( - # (entity_name, entity_id_column, person_link_column) - ("household", "household_id", "person_household_id"), - ("spm_unit", "spm_unit_id", "person_spm_unit_id"), - ("tax_unit", "tax_unit_id", "person_tax_unit_id"), - ("family", "family_id", "person_family_id"), - ("marital_unit", "marital_unit_id", "person_marital_unit_id"), -) - - -def _build_entity_lookups( - f: h5py.File, year: str -) -> tuple[int, dict[str, tuple[int, np.ndarray]]]: - """Return (person_n, {entity_name: (entity_n, person_to_entity_position)}). 
- - For each non-person entity, returns a length-`person_n` integer array that, - when used to index a length-`entity_n` variable, broadcasts the entity - value down to person level. - """ - if "person_id" not in f or year not in f["person_id"]: - raise KeyError( - f"person_id/{year} missing from enhanced_cps file. Can't determine " - "person count." - ) - person_n = int(f["person_id"][year].shape[0]) - - lookups: dict[str, tuple[int, np.ndarray]] = {} - for ent_name, eid_col, pid_col in _ENTITY_LINK_COLUMNS: - if eid_col not in f or year not in f[eid_col]: - continue - if pid_col not in f or year not in f[pid_col]: - continue - entity_ids = f[eid_col][year][:] - person_ent_ids = f[pid_col][year][:] - id_to_idx = {int(v): i for i, v in enumerate(entity_ids)} - try: - lookup = np.fromiter( - (id_to_idx[int(v)] for v in person_ent_ids), - dtype=np.int64, - count=len(person_ent_ids), - ) - except KeyError as exc: - raise ValueError( - f"entity {ent_name!r}: person's {pid_col} value {exc} not in " - f"{eid_col} — entity table inconsistent" - ) from exc - lookups[ent_name] = (int(len(entity_ids)), lookup) - return person_n, lookups - - -def _load_enhanced_cps( - data_path: Path, - year: str, - columns: list[str], -) -> pd.DataFrame: - """Load enhanced_cps columns, broadcasting non-person entities to person level. - - enhanced_cps_2024 stores variables at their native entity level (person, - household, tax_unit, spm_unit, family, marital_unit). To land a flat - person-level DataFrame, this helper uses the `person__id` → - `_id` linkage to project parent-entity values down. - """ - if not data_path.exists(): - raise FileNotFoundError( - f"enhanced_cps_{year} not found at {data_path}. " - "Set `data_path` explicitly in ScaleUpStageConfig." - ) - - with h5py.File(data_path, "r") as f: - available = set(f.keys()) - missing = [c for c in columns if c not in available] - if missing: - raise KeyError( - f"Columns not in enhanced_cps: {missing[:5]}{'...' 
if len(missing) > 5 else ''}" - ) - - person_n, entity_lookups = _build_entity_lookups(f, year) - - data: dict[str, np.ndarray] = {} - for col in columns: - grp = f[col] - if year not in grp: - raise KeyError(f"Column {col!r} has no {year!r} entry") - arr = grp[year][:] - if arr.shape[0] == person_n: - data[col] = arr - continue - # Broadcast via entity lookup - broadcast = None - for ent_name, (ent_n, lookup) in entity_lookups.items(): - if arr.shape[0] == ent_n: - broadcast = arr[lookup] - break - if broadcast is None: - available_sizes = {e: n for e, (n, _) in entity_lookups.items()} - available_sizes["person"] = person_n - raise ValueError( - f"Column {col!r} has {arr.shape[0]} rows but no matching " - f"entity linkage. Sizes available: {available_sizes}" - ) - data[col] = broadcast - - return pd.DataFrame(data) - - -def _peak_rss_gb() -> float: - """Current process's max resident set size in GB. - - Unit of `ru_maxrss` is platform-dependent: - - Linux: kilobytes - - macOS (Darwin): bytes - - FreeBSD: kilobytes (but verify) - - Cross-checked against psutil on macOS Python 3.14: ru_maxrss is in bytes - (e.g., 190_873_600 raw = 0.18 GB matches `psutil.Process().memory_info().rss`). - """ - import sys - - r = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - if sys.platform == "darwin": - bytes_rss = r - else: - # Linux and most BSDs: kilobytes - bytes_rss = r * 1024 - return bytes_rss / (1024**3) - - -def _compute_rare_cell_ratios( - real: pd.DataFrame, - synthetic: pd.DataFrame, - checks: tuple[dict[str, Any], ...], -) -> dict[str, float]: - """Per-check: synthetic count / real count in the rare cell. - - Matches the pattern in `microplex/benchmarks/results/sparse_coverage.csv`. - 1.0 means the synthetic preserves the rare cell at its real frequency; - 0.0 means the cell is annihilated. 
- """ - ratios: dict[str, float] = {} - for check in checks: - name = check["name"] - mask_fn = check["mask"] - try: - real_mask = mask_fn(real).fillna(False) - except (KeyError, AttributeError) as exc: - ratios[name] = float("nan") - LOGGER.warning( - "rare-cell check %r skipped (%s: %s)", name, type(exc).__name__, exc - ) - continue - try: - synth_mask = mask_fn(synthetic).fillna(False) - except (KeyError, AttributeError): - ratios[name] = float("nan") - continue - real_count = max(int(real_mask.sum()), 1) - synth_count = int(synth_mask.sum()) - ratios[name] = float(synth_count) / float(real_count) - return ratios - - -def _compute_zero_rate_mae(real: pd.DataFrame, synthetic: pd.DataFrame) -> float: - """Mean absolute error in per-column zero-rate across the common column set.""" - cols = [c for c in real.columns if c in synthetic.columns] - errs = [] - for c in cols: - r_zero = float((real[c] == 0).mean()) - s_zero = float((synthetic[c] == 0).mean()) - errs.append(abs(r_zero - s_zero)) - return float(np.mean(errs)) if errs else 0.0 - - -def _compute_zero_rate_per_column( - real: pd.DataFrame, synthetic: pd.DataFrame -) -> dict[str, dict[str, float]]: - """Per-column {real_zero_rate, synth_zero_rate, abs_diff} breakdown.""" - cols = [c for c in real.columns if c in synthetic.columns] - out: dict[str, dict[str, float]] = {} - for c in cols: - r_zero = float((real[c] == 0).mean()) - s_zero = float((synthetic[c] == 0).mean()) - out[c] = { - "real": r_zero, - "synth": s_zero, - "abs_diff": abs(r_zero - s_zero), - } - return out - - -def _compute_prdc( - real: pd.DataFrame, - synthetic: pd.DataFrame, - k: int, - max_samples: int = 20_000, - seed: int = 42, -) -> tuple[float, float, float]: - """Return (precision, density, coverage) via the `prdc` library. - - `max_samples` caps both `real` and `synthetic` sample sizes before - PRDC to keep the O(n_real * n_synth * n_features) distance matrices - within a 48 GB-workstation budget. 
- """ - if compute_prdc is None: - raise ImportError( - "PRDC requires the `prdc` package. " - "Install with: uv pip install prdc" - ) - - from sklearn.preprocessing import StandardScaler - - cols = [c for c in real.columns if c in synthetic.columns] - if not cols: - raise ValueError("No shared columns between real and synthetic for PRDC") - - rng = np.random.default_rng(seed) - if len(real) > max_samples: - real = real.iloc[rng.choice(len(real), size=max_samples, replace=False)] - if len(synthetic) > max_samples: - synthetic = synthetic.iloc[ - rng.choice(len(synthetic), size=max_samples, replace=False) - ] - - r = real[cols].to_numpy(dtype=np.float64) - s = synthetic[cols].to_numpy(dtype=np.float64) - - if len(r) < k + 1 or len(s) < k + 1: - return (0.0, 0.0, 0.0) - - scaler = StandardScaler() - r_scaled = scaler.fit_transform(r) - s_scaled = scaler.transform(s) - - metrics = compute_prdc(r_scaled, s_scaled, nearest_k=k) - return ( - float(metrics["precision"]), - float(metrics["density"]), - float(metrics["coverage"]), - ) - - -def _snap_categorical_shared_cols( - synthetic: pd.DataFrame, - train: pd.DataFrame, - shared_cols: list[str], -) -> pd.DataFrame: - """Snap categorical-looking shared-column synthetic values to training-pool values. - - `microplex.eval.benchmark._MultiSourceBase.generate` adds Gaussian noise - (sigma=0.1) to EVERY shared-column value before regenerating the - non-shared columns. This pollutes binary and categorical conditioning - variables (e.g., `is_military=1` becomes `1.04`; `cps_race=3` becomes - `2.97`, `state_fips=6` becomes `6.11`). - - Heuristic: a shared column is "categorical-looking" if every value in - the training pool is exactly integer-valued (up to float precision). - Those columns have every synthetic value snapped to its nearest - training-pool value. Continuous shared columns (non-integer training - values) keep the noise — it may legitimately add variation for them. 
- - Examples of columns this catches: all is_* flags, cps_race, state_fips, - own_children_in_household. - - Examples of columns left alone: age (if fractional), pre_tax_contributions. - """ - out = synthetic.copy() - for col in shared_cols: - if col not in out.columns or col not in train.columns: - continue - train_vals = train[col].to_numpy() - # Integer-valued iff every value equals its rounded version. - if not np.all(np.isclose(train_vals, np.round(train_vals), atol=1e-6)): - continue - uniques = np.sort(pd.unique(train_vals)) - synth_vals = out[col].to_numpy() - # For every synthetic value, find the nearest training-pool value. - idx = np.searchsorted(uniques, synth_vals) - idx = np.clip(idx, 0, len(uniques) - 1) - left = uniques[np.clip(idx - 1, 0, len(uniques) - 1)] - right = uniques[idx] - snapped = np.where( - np.abs(synth_vals - left) <= np.abs(synth_vals - right), - left, - right, - ) - out[col] = snapped.astype(train[col].dtype, copy=False) - return out - - -def _build_method(method_name: str, kwargs: dict[str, Any] | None = None) -> Any: - from microplex.eval.benchmark import ( - CTGANMethod, - MAFMethod, - QDNNMethod, - QRFMethod, - TVAEMethod, - ZIMAFMethod, - ZIQDNNMethod, - ZIQRFMethod, - ) - - from microplex_us.bakeoff.local_methods import ( - CARTMethod, - ZICARTMethod, - ZIQDNNHistGBMethod, - ) - - registry = { - "QRF": QRFMethod, - "ZI-QRF": ZIQRFMethod, - "QDNN": QDNNMethod, - # ZI-QDNN defaults to HistGB zero-classifier (microplex-us override). - # The upstream RF-backed variant is kept under "ZI-QDNN-RF" so prior - # benchmark artifacts (which were produced with RF) remain reproducible. - # See docs/zi-factorial.md for the rationale. - "ZI-QDNN": ZIQDNNHistGBMethod, - "ZI-QDNN-RF": ZIQDNNMethod, - "MAF": MAFMethod, - "ZI-MAF": ZIMAFMethod, - "CTGAN": CTGANMethod, - "TVAE": TVAEMethod, - "CART": CARTMethod, - "ZI-CART": ZICARTMethod, - } - if method_name not in registry: - raise ValueError( - f"Unknown method {method_name!r}. 
Known: {sorted(registry)}" - ) - return registry[method_name](**(kwargs or {})) - - -class ScaleUpRunner: - """Runs one stage of the scale-up protocol.""" - - def __init__(self, config: ScaleUpStageConfig) -> None: - self.config = config - self.logger = logging.getLogger(f"{__name__}.ScaleUpRunner") - - def load_frame(self) -> pd.DataFrame: - df = _load_enhanced_cps( - self.config.data_path, self.config.year, self.config.all_cols - ) - self.logger.info( - "loaded enhanced_cps: %d rows, %d cols", len(df), len(df.columns) - ) - # Cast to a single dtype so downstream DataFrame.values stays - # numeric-uniform (torch-based methods reject object arrays, which - # is what pandas produces when columns mix bool/int32/float32). - df = df.astype(np.float32) - if self.config.n_rows is not None and len(df) > self.config.n_rows: - rng = np.random.default_rng(self.config.seed) - idx = rng.choice(len(df), size=self.config.n_rows, replace=False) - df = df.iloc[idx].reset_index(drop=True) - self.logger.info("subsampled to %d rows", len(df)) - return df - - def split(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: - rng = np.random.default_rng(self.config.seed) - idx = rng.permutation(len(df)) - cut = int(len(df) * (1.0 - self.config.holdout_frac)) - train_idx, holdout_idx = idx[:cut], idx[cut:] - train = df.iloc[train_idx].reset_index(drop=True) - holdout = df.iloc[holdout_idx].reset_index(drop=True) - return train, holdout - - def fit_and_generate( - self, method_name: str, train: pd.DataFrame, n_generate: int - ) -> tuple[pd.DataFrame, dict[str, float]]: - """Fit method on `train` and generate `n_generate` synthetic records.""" - method = _build_method( - method_name, kwargs=self.config.method_kwargs.get(method_name) - ) - - # The benchmark methods take a multi-source dict; pass a single source. 
- sources = {"enhanced_cps_2024": train.copy()} - shared_cols = list(self.config.condition_cols) - - before_rss = _peak_rss_gb() - t_fit = time.perf_counter() - method.fit(sources=sources, shared_cols=shared_cols) - fit_wall = time.perf_counter() - t_fit - peak_fit_rss = max(_peak_rss_gb(), before_rss) - - t_gen = time.perf_counter() - synthetic = method.generate(n_generate, seed=self.config.seed) - gen_wall = time.perf_counter() - t_gen - - synthetic = _snap_categorical_shared_cols(synthetic, train, shared_cols) - - return synthetic, { - "fit_wall_seconds": fit_wall, - "generate_wall_seconds": gen_wall, - "peak_rss_gb_during_fit": peak_fit_rss, - } - - def run( - self, - incremental_path: Path | None = None, - ) -> list[ScaleUpResult]: - """Run every configured method on the loaded frame; return results. - - If `incremental_path` is given, each method's `ScaleUpResult` is - appended to that path as JSONL *as soon as it completes*. This - guarantees at least partial output if a later method crashes or - the host is interrupted. - """ - df = self.load_frame() - train, holdout = self.split(df) - n_generate = self.config.n_generate or len(train) - self.logger.info( - "split %d train / %d holdout; will generate %d synthetic", - len(train), - len(holdout), - n_generate, - ) - - if incremental_path is not None: - incremental_path.parent.mkdir(parents=True, exist_ok=True) - # Truncate any prior JSONL so this run's output is self-contained. 
- incremental_path.write_text("") - - results: list[ScaleUpResult] = [] - for method_name in self.config.methods: - self.logger.info("== fitting %s ==", method_name) - try: - synthetic, timing = self.fit_and_generate( - method_name, train, n_generate - ) - except Exception as exc: # pragma: no cover - self.logger.error("method %s failed: %s", method_name, exc) - result = ScaleUpResult( - stage=self.config.stage, - method=method_name, - seed=self.config.seed, - n_train_rows=len(train), - n_holdout_rows=len(holdout), - n_cols=len(df.columns), - fit_wall_seconds=0.0, - generate_wall_seconds=0.0, - peak_rss_gb_during_fit=0.0, - precision=0.0, - density=0.0, - coverage=0.0, - rare_cell_ratios={}, - zero_rate_mae=0.0, - notes=f"FAILED: {type(exc).__name__}: {exc}", - ) - results.append(result) - self._persist_incremental(incremental_path, result) - continue - - precision, density, coverage = _compute_prdc( - holdout, - synthetic, - k=self.config.k, - max_samples=self.config.prdc_max_samples, - seed=self.config.seed, - ) - rare = _compute_rare_cell_ratios( - holdout, synthetic, self.config.rare_cell_checks - ) - zero_mae = _compute_zero_rate_mae(holdout, synthetic) - zero_per_col = _compute_zero_rate_per_column(holdout, synthetic) - - result = ScaleUpResult( - stage=self.config.stage, - method=method_name, - seed=self.config.seed, - n_train_rows=len(train), - n_holdout_rows=len(holdout), - n_cols=len(df.columns), - fit_wall_seconds=timing["fit_wall_seconds"], - generate_wall_seconds=timing["generate_wall_seconds"], - peak_rss_gb_during_fit=timing["peak_rss_gb_during_fit"], - precision=precision, - density=density, - coverage=coverage, - rare_cell_ratios=rare, - zero_rate_mae=zero_mae, - zero_rate_per_column=zero_per_col, - notes="", - ) - results.append(result) - self._persist_incremental(incremental_path, result) - self.logger.info( - " %s: coverage=%.3f precision=%.3f density=%.3f fit=%.1fs gen=%.1fs peak_rss=%.2fGB", - method_name, - coverage, - precision, - density, - 
timing["fit_wall_seconds"], - timing["generate_wall_seconds"], - timing["peak_rss_gb_during_fit"], - ) - return results - - @staticmethod - def _persist_incremental( - path: Path | None, result: ScaleUpResult - ) -> None: - """Append one `ScaleUpResult` as a JSONL row (if path is set).""" - if path is None: - return - with path.open("a") as f: - f.write(json.dumps(result.to_dict(), default=str)) - f.write("\n") - - -def _results_to_dataframe(results: list[ScaleUpResult]) -> pd.DataFrame: - rows: list[dict[str, Any]] = [] - for r in results: - d = r.to_dict() - rare = d.pop("rare_cell_ratios") - for cell_name, ratio in rare.items(): - d[f"rare__{cell_name}"] = ratio - rows.append(d) - return pd.DataFrame(rows) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser(description=__doc__ or "scale-up runner") - parser.add_argument( - "--stage", - choices=["stage1", "stage2", "stage3"], - default="stage1", - ) - parser.add_argument( - "--methods", - nargs="+", - default=["ZI-QRF", "ZI-MAF", "ZI-QDNN"], - ) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument( - "--output", - type=Path, - default=Path("artifacts/scale_up_results.json"), - ) - parser.add_argument( - "--log-level", - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - ) - parser.add_argument( - "--incremental-jsonl", - type=Path, - default=None, - help=( - "Optional path to a JSONL file where each method's result is " - "appended as soon as it completes. Defaults to the final " - "--output path with '.partial.jsonl' appended." 
- ), - ) - args = parser.parse_args(argv) - - if args.incremental_jsonl is None: - args.incremental_jsonl = args.output.with_suffix( - args.output.suffix + ".partial.jsonl" - ) - - logging.basicConfig( - level=getattr(logging, args.log_level), - format="%(asctime)s %(levelname)s %(name)s: %(message)s", - ) - - stage_fn = {"stage1": stage1_config, "stage2": stage2_config, "stage3": stage3_config} - cfg = stage_fn[args.stage](methods=tuple(args.methods)) - cfg = ScaleUpStageConfig( - stage=cfg.stage, - n_rows=cfg.n_rows, - methods=tuple(args.methods), - condition_cols=cfg.condition_cols, - target_cols=cfg.target_cols, - holdout_frac=cfg.holdout_frac, - seed=args.seed, - k=cfg.k, - n_generate=cfg.n_generate, - data_path=cfg.data_path, - year=cfg.year, - rare_cell_checks=cfg.rare_cell_checks, - ) - - runner = ScaleUpRunner(cfg) - results = runner.run(incremental_path=args.incremental_jsonl) - - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text( - json.dumps( - { - "stage": cfg.stage, - "methods": list(cfg.methods), - "seed": cfg.seed, - "n_conditioning_cols": len(cfg.condition_cols), - "n_target_cols": len(cfg.target_cols), - "results": [r.to_dict() for r in results], - }, - indent=2, - default=str, - ) - ) - LOGGER.info("wrote %d results to %s", len(results), args.output) - - df = _results_to_dataframe(results) - print() - print(df.to_string(index=False)) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/calibration/__init__.py b/src/microplex_us/calibration/__init__.py deleted file mode 100644 index 1a8e6829..00000000 --- a/src/microplex_us/calibration/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Calibration backends for microplex-us. - -The mainline production calibrator is `MicrocalibrateAdapter`, which -wraps `microcalibrate`'s gradient-descent chi-squared solver. 
It is now -country-agnostic and lives in upstream `microplex.calibration` so every -country package (microplex-us, microplex-uk, etc.) shares one -identity-preserving calibrator. This module re-exports the adapter so -existing `from microplex_us.calibration import MicrocalibrateAdapter` -imports keep working. - -See `docs/calibrator-decision.md` for the rationale. -""" - -from microplex.calibration import ( - MicrocalibrateAdapter, - MicrocalibrateAdapterConfig, -) - -__all__ = [ - "MicrocalibrateAdapter", - "MicrocalibrateAdapterConfig", -] diff --git a/src/microplex_us/calibration_harness.py b/src/microplex_us/calibration_harness.py deleted file mode 100644 index 4066389b..00000000 --- a/src/microplex_us/calibration_harness.py +++ /dev/null @@ -1,498 +0,0 @@ -"""Calibration harness for PE parity experiments over canonical target specs.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any - -import numpy as np -import pandas as pd -from microplex.core import EntityType -from microplex.targets import ( - FilterOperator, - TargetAggregation, - TargetFilter, - TargetProvider, - TargetQuery, - TargetSpec, -) - -from microplex_us.target_registry import ( - TargetCategory, - TargetLevel, - TargetRegistry, - get_registry, - target_available_in_cps, - target_category, - target_group_name, - target_level, - target_requires_imputation, -) - - -@dataclass -class CalibrationResult: - """Result of a calibration run.""" - - weights: np.ndarray - targets_used: list[str] - errors: dict[str, float] - iterations: int - converged: bool - weight_stats: dict[str, float] - - @property - def mean_error(self) -> float: - return float(np.mean(list(self.errors.values()))) if self.errors else 0.0 - - @property - def max_error(self) -> float: - return max(self.errors.values()) if self.errors else 0.0 - - def summary(self) -> str: - """Generate summary string.""" - lines = [ - "Calibration Result:", - f" Targets: {len(self.targets_used)}", - f" 
Converged: {self.converged} ({self.iterations} iterations)", - f" Mean error: {self.mean_error:.2f}%", - f" Max error: {self.max_error:.2f}%", - f" Weight CV: {self.weight_stats.get('cv', 0):.2f}", - ] - return "\n".join(lines) - - -class CalibrationHarness: - """Harness for calibration experiments over one entity frame at a time.""" - - def __init__( - self, - registry: TargetRegistry | None = None, - *, - target_provider: TargetProvider | None = None, - ): - if target_provider is None: - self.registry = registry or get_registry() - self.target_provider = self.registry - else: - self.registry = registry - self.target_provider = target_provider - self._results: dict[str, CalibrationResult] = {} - - def select_targets( - self, - *, - categories: list[TargetCategory] | None = None, - levels: list[TargetLevel] | None = None, - groups: list[str] | None = None, - only_available: bool = False, - entity: EntityType | str | None = None, - period: int | str | None = None, - provider_filters: dict[str, Any] | None = None, - ) -> list[TargetSpec]: - """Select canonical targets from the configured provider.""" - query = TargetQuery( - period=period, - entity=entity, - provider_filters=dict(provider_filters or {}), - ) - targets = self.target_provider.load_target_set(query).targets - return [ - target - for target in targets - if _matches_us_target_filters( - target, - categories=categories, - levels=levels, - groups=groups, - only_available=only_available, - ) - ] - - def get_target_vector( - self, - df: pd.DataFrame, - targets: list[TargetSpec], - *, - entity: EntityType | str | None = None, - ) -> tuple[np.ndarray, np.ndarray, list[str]]: - """Build a design matrix and target vector from canonical targets.""" - resolved_entity = _resolve_entity(entity) - n_rows = len(df) - design_rows: list[np.ndarray] = [] - target_values: list[float] = [] - target_names: list[str] = [] - - for spec in targets: - if resolved_entity is not None and spec.entity is not resolved_entity: - 
continue - - if any(feature not in df.columns for feature in spec.required_features): - continue - - row = _build_constraint_row(df, spec) - design_rows.append(row) - target_values.append(spec.value) - target_names.append(spec.name) - - design_matrix = ( - np.column_stack(design_rows) if design_rows else np.zeros((n_rows, 0)) - ) - target_vector = np.array(target_values, dtype=float) - return design_matrix, target_vector, target_names - - def calibrate( - self, - df: pd.DataFrame, - targets: list[TargetSpec], - weight_col: str = "weight", - *, - entity: EntityType | str | None = None, - max_iter: int = 100, - tol: float = 1e-6, - bounds: tuple[float, float] = (0.01, 100.0), - verbose: bool = True, - ) -> CalibrationResult: - """Run IPF calibration against canonical targets for one entity frame.""" - if weight_col in df.columns: - weights = df[weight_col].to_numpy(dtype=float, copy=True) - else: - weights = np.ones(len(df), dtype=float) - - design_matrix, target_vec, names = self.get_target_vector( - df, - targets, - entity=entity, - ) - n_samples, n_targets = design_matrix.shape - - if verbose: - print(f"Calibrating {n_samples:,} samples to {n_targets} targets") - - if n_targets == 0: - return CalibrationResult( - weights=weights, - targets_used=[], - errors={}, - iterations=0, - converged=True, - weight_stats=_weight_stats(weights), - ) - - converged = False - for iteration in range(max_iter): - old_weights = weights.copy() - - for target_index in range(n_targets): - if target_vec[target_index] == 0: - continue - - current = np.sum(weights * design_matrix[:, target_index]) - if current <= 0: - continue - - factor = target_vec[target_index] / current - factor = np.clip(factor, bounds[0], bounds[1]) - mask = design_matrix[:, target_index] > 0 - weights[mask] *= factor - - max_change = np.max(np.abs(weights - old_weights) / (old_weights + 1e-10)) - if max_change < tol: - converged = True - if verbose: - print(f"Converged after {iteration + 1} iterations") - break - - 
errors: dict[str, float] = {} - if verbose: - print(f"\n{'Target':<40} {'Computed':>15} {'Target':>15} {'Error':>10}") - print("-" * 85) - - for target_index, name in enumerate(names): - computed = float(np.sum(weights * design_matrix[:, target_index])) - target = float(target_vec[target_index]) - if target != 0: - error = abs(computed - target) / abs(target) * 100 - else: - error = 0 if computed == 0 else 100 - errors[name] = min(error, 100.0) - - if verbose: - if abs(target) > 1e9: - computed_str = f"${computed / 1e9:.1f}B" - target_str = f"${target / 1e9:.1f}B" - elif abs(target) > 1e6: - computed_str = f"{computed / 1e6:.1f}M" - target_str = f"{target / 1e6:.1f}M" - else: - computed_str = f"{computed:,.0f}" - target_str = f"{target:,.0f}" - print(f"{name:<40} {computed_str:>15} {target_str:>15} {error:>9.1f}%") - - return CalibrationResult( - weights=weights, - targets_used=names, - errors=errors, - iterations=iteration + 1, - converged=converged, - weight_stats=_weight_stats(weights), - ) - - def run_experiment( - self, - df: pd.DataFrame, - name: str, - *, - categories: list[TargetCategory] | None = None, - levels: list[TargetLevel] | None = None, - groups: list[str] | None = None, - only_available: bool = False, - entity: EntityType | str | None = None, - period: int | str | None = None, - provider_filters: dict[str, Any] | None = None, - **calibrate_kwargs, - ) -> CalibrationResult: - """Run a calibration experiment over a filtered target subset.""" - selected = self.select_targets( - categories=categories, - levels=levels, - groups=groups, - only_available=only_available, - entity=entity, - period=period, - provider_filters=provider_filters, - ) - selected = [ - target - for target in selected - if not ( - target.value == 0 and target.aggregation is not TargetAggregation.COUNT - ) - ] - - print(f"\n=== Experiment: {name} ===") - print(f"Selected {len(selected)} targets") - - result = self.calibrate( - df, - selected, - entity=entity, - **calibrate_kwargs, 
- ) - self._results[name] = result - return result - - def compare_experiments(self) -> pd.DataFrame: - """Compare results across experiments.""" - records = [] - for name, result in self._results.items(): - records.append( - { - "experiment": name, - "n_targets": len(result.targets_used), - "converged": result.converged, - "iterations": result.iterations, - "mean_error": result.mean_error, - "max_error": result.max_error, - "weight_cv": result.weight_stats["cv"], - "weight_max": result.weight_stats["max"], - "zero_weights": result.weight_stats["zero_count"], - } - ) - return pd.DataFrame(records) - - def print_target_coverage( - self, - df: pd.DataFrame, - *, - entity: EntityType | str | None = None, - ) -> None: - """Print which canonical targets can be computed from the given frame.""" - print("=" * 70) - print("TARGET COVERAGE ANALYSIS") - print("=" * 70) - - all_targets = self.select_targets(entity=entity) - columns = set(df.columns) - - available: list[TargetSpec] = [] - missing_column: list[TargetSpec] = [] - needs_imputation: list[TargetSpec] = [] - - for target in all_targets: - if any(feature not in columns for feature in target.required_features): - missing_column.append(target) - elif target_requires_imputation(target): - needs_imputation.append(target) - else: - available.append(target) - - print(f"\nAvailable ({len(available)} targets):") - for category in TargetCategory: - count = sum(1 for target in available if target_category(target) is category) - if count: - print(f" {category.value}: {count}") - - print(f"\nMissing column ({len(missing_column)} targets):") - missing_features = { - feature - for target in missing_column - for feature in target.required_features - if feature not in columns - } - for feature in sorted(missing_features): - count = sum(1 for target in missing_column if feature in target.required_features) - print(f" {feature}: {count} targets") - - print(f"\nRequires imputation ({len(needs_imputation)} targets):") - for category in 
TargetCategory: - count = sum( - 1 for target in needs_imputation if target_category(target) is category - ) - if count: - print(f" {category.value}: {count}") - - -def run_pe_parity_suite( - df: pd.DataFrame, - weight_col: str = "weight", - *, - entity: EntityType | str = EntityType.PERSON, -) -> pd.DataFrame: - """Run the full PE parity calibration suite for a single entity frame.""" - harness = CalibrationHarness() - harness.print_target_coverage(df, entity=entity) - - print("\n" + "=" * 70) - print("RUNNING CALIBRATION EXPERIMENTS") - print("=" * 70) - - harness.run_experiment( - df, - "states_only", - groups=["state_population"], - entity=entity, - weight_col=weight_col, - verbose=True, - ) - harness.run_experiment( - df, - "income_available", - categories=[TargetCategory.INCOME], - only_available=True, - entity=entity, - weight_col=weight_col, - verbose=True, - ) - harness.run_experiment( - df, - "benefits_only", - groups=["benefit_programs"], - entity=entity, - weight_col=weight_col, - verbose=True, - ) - harness.run_experiment( - df, - "full_available", - groups=["state_population", "irs_soi_income", "benefit_programs"], - only_available=True, - entity=entity, - weight_col=weight_col, - verbose=True, - ) - harness.run_experiment( - df, - "all_targets", - only_available=False, - entity=entity, - weight_col=weight_col, - verbose=True, - ) - - print("\n" + "=" * 70) - print("EXPERIMENT COMPARISON") - print("=" * 70) - - comparison = harness.compare_experiments() - print(comparison.to_string(index=False)) - return comparison - - -def _resolve_entity(entity: EntityType | str | None) -> EntityType | None: - if entity is None or isinstance(entity, EntityType): - return entity - return EntityType(entity) - - -def _weight_stats(weights: np.ndarray) -> dict[str, float]: - mean_weight = float(np.mean(weights)) if len(weights) else 0.0 - std_weight = float(np.std(weights)) if len(weights) else 0.0 - return { - "mean": mean_weight, - "std": std_weight, - "cv": 
std_weight / mean_weight if mean_weight > 0 else 0.0, - "min": float(np.min(weights)) if len(weights) else 0.0, - "max": float(np.max(weights)) if len(weights) else 0.0, - "zero_count": int(np.sum(weights == 0)), - } - - -def _matches_us_target_filters( - target: TargetSpec, - *, - categories: list[TargetCategory] | None = None, - levels: list[TargetLevel] | None = None, - groups: list[str] | None = None, - only_available: bool = False, -) -> bool: - if categories and target_category(target) not in categories: - return False - if levels and target_level(target) not in levels: - return False - if groups and target_group_name(target) not in groups: - return False - if only_available and not target_available_in_cps(target): - return False - return True - - -def _build_constraint_row(df: pd.DataFrame, spec: TargetSpec) -> np.ndarray: - if spec.aggregation is TargetAggregation.MEAN: - raise NotImplementedError("Mean targets are not supported by this harness") - - mask = np.ones(len(df), dtype=bool) - for target_filter in spec.filters: - mask &= _evaluate_filter(df[target_filter.feature], target_filter) - - if spec.aggregation is TargetAggregation.COUNT: - return mask.astype(float) - - if spec.measure is None: - raise ValueError(f"Sum target {spec.name} is missing a measure") - - values = df[spec.measure].fillna(0).to_numpy(dtype=float, copy=False) - return mask.astype(float) * values - - -def _evaluate_filter(series: pd.Series, target_filter: TargetFilter) -> np.ndarray: - operator = target_filter.operator - value = target_filter.value - - if operator is FilterOperator.EQ: - return (series == value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.NE: - return (series != value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.GT: - return (series > value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.GTE: - return (series >= value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.LT: - return (series < 
value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.LTE: - return (series <= value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.IN: - return series.isin(value).to_numpy(dtype=bool, copy=False) - if operator is FilterOperator.NOT_IN: - return (~series.isin(value)).to_numpy(dtype=bool, copy=False) - raise ValueError(f"Unsupported filter operator: {operator}") diff --git a/src/microplex_us/capital_gains_lots.py b/src/microplex_us/capital_gains_lots.py deleted file mode 100644 index 209b1b42..00000000 --- a/src/microplex_us/capital_gains_lots.py +++ /dev/null @@ -1,360 +0,0 @@ -"""Synthetic capital-gains lot generation and relational persistence.""" - -from __future__ import annotations - -import json -import sqlite3 -from dataclasses import asdict, dataclass -from hashlib import blake2b -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd - -CAPITAL_GAINS_LOT_COLUMNS: tuple[str, ...] = ( - "lot_id", - "person_id", - "tax_unit_id", - "household_id", - "tax_year", - "lot_index", - "sale_time", - "holding_period", - "purchase_time", - "sale_proceeds", - "basis", - "gain_or_loss", - "asset_type", - "is_gain_lot", -) - - -@dataclass(frozen=True) -class SyntheticCapitalGainsLotConfig: - """Controls the first-pass synthetic-lot imputation.""" - - random_seed: int = 42 - max_lots_per_person: int = 4 - high_gain_threshold: float = 100_000.0 - medium_gain_threshold: float = 10_000.0 - annual_nominal_return: float = 0.07 - gain_basis_ratio_floor: float = 0.05 - gain_basis_ratio_ceiling: float = 0.95 - loss_basis_ratio_floor: float = 1.05 - max_holding_period_years: int = 35 - - -def generate_synthetic_capital_gains_lots( - persons: pd.DataFrame, - *, - period: int, - config: SyntheticCapitalGainsLotConfig | None = None, - gain_column: str = "long_term_capital_gains_before_response", -) -> pd.DataFrame: - """Generate deterministic synthetic long-term capital-gains lots. 
- - The generator is anchored to the existing person-level PolicyEngine input: - lots aggregate exactly back to each person's capital-gains amount. It is a - relational artifact scaffold, not a SOCA-calibrated production imputation. - """ - - resolved = config or SyntheticCapitalGainsLotConfig() - if gain_column not in persons.columns: - raise ValueError(f"persons is missing required column {gain_column!r}") - if resolved.max_lots_per_person < 1: - raise ValueError("max_lots_per_person must be at least 1") - - rows: list[dict[str, Any]] = [] - for position, (_, person) in enumerate(persons.iterrows()): - raw_gain = pd.to_numeric(person[gain_column], errors="coerce") - gain = 0.0 if pd.isna(raw_gain) else float(raw_gain) - if np.isclose(gain, 0.0): - continue - - person_id = _optional_int(person.get("person_id")) - tax_unit_id = _optional_int(person.get("tax_unit_id")) - household_id = _optional_int(person.get("household_id")) - sign = "gain" if gain > 0 else "loss" - stable_key = person_id if person_id is not None else f"row-{position}" - rng = np.random.default_rng( - _stable_seed( - resolved.random_seed, - period, - stable_key, - tax_unit_id, - sign, - "synthetic-capital-gains-lots-v1", - ) - ) - n_lots = _lot_count(abs(gain), resolved) - shares = _deterministic_lot_shares(n_lots, rng) - signed_lot_gains = shares * gain - holding_periods = _draw_holding_periods(n_lots, rng, resolved) - - for lot_index, (lot_gain, holding_period) in enumerate( - zip(signed_lot_gains, holding_periods, strict=True) - ): - sale_proceeds, basis = _basis_and_proceeds( - lot_gain, - int(holding_period), - resolved, - ) - sale_time = float(period) + 0.5 - rows.append( - { - "lot_id": 0, - "person_id": person_id, - "tax_unit_id": tax_unit_id, - "household_id": household_id, - "tax_year": int(period), - "lot_index": int(lot_index), - "sale_time": sale_time, - "holding_period": float(holding_period), - "purchase_time": sale_time - float(holding_period), - "sale_proceeds": 
float(sale_proceeds), - "basis": float(basis), - "gain_or_loss": float(lot_gain), - "asset_type": "unknown", - "is_gain_lot": bool(lot_gain > 0), - } - ) - - lots = pd.DataFrame(rows, columns=CAPITAL_GAINS_LOT_COLUMNS) - if lots.empty: - return lots - lots = lots.sort_values( - ["person_id", "tax_year", "lot_index"], kind="stable" - ).reset_index(drop=True) - lots["lot_id"] = np.arange(1, len(lots) + 1, dtype=np.int64) - return lots.astype( - { - "lot_id": "int64", - "tax_year": "int64", - "lot_index": "int64", - "sale_time": "float64", - "holding_period": "float64", - "purchase_time": "float64", - "sale_proceeds": "float64", - "basis": "float64", - "gain_or_loss": "float64", - "asset_type": "string", - "is_gain_lot": "bool", - } - ) - - -def validate_capital_gains_lot_anchors( - persons: pd.DataFrame, - lots: pd.DataFrame, - *, - gain_column: str = "long_term_capital_gains_before_response", - tolerance: float = 1e-5, - relative_tolerance: float = 1e-9, -) -> None: - """Raise if lot totals do not reconcile to person-level capital gains.""" - - if gain_column not in persons.columns: - raise ValueError(f"persons is missing required column {gain_column!r}") - if "person_id" not in persons.columns: - raise ValueError("persons is missing required column 'person_id'") - missing_lot_columns = {"person_id", "gain_or_loss"} - set(lots.columns) - if missing_lot_columns: - raise ValueError(f"lots is missing columns: {sorted(missing_lot_columns)}") - - anchors = ( - persons[["person_id", gain_column]] - .assign( - person_id=lambda df: pd.to_numeric(df["person_id"], errors="coerce"), - _anchor=lambda df: pd.to_numeric(df[gain_column], errors="coerce").fillna( - 0.0 - ), - ) - .groupby("person_id", dropna=False)["_anchor"] - .sum() - ) - lot_totals = ( - lots.assign( - person_id=pd.to_numeric(lots["person_id"], errors="coerce"), - gain_or_loss=pd.to_numeric(lots["gain_or_loss"], errors="coerce").fillna( - 0.0 - ), - ) - .groupby("person_id", dropna=False)["gain_or_loss"] - 
.sum() - ) - combined = pd.concat([anchors, lot_totals], axis=1).fillna(0.0) - combined.columns = ["anchor", "lot_total"] - deltas = (combined["anchor"] - combined["lot_total"]).abs() - bad = deltas[ - ~np.isclose( - combined["anchor"], - combined["lot_total"], - atol=tolerance, - rtol=relative_tolerance, - ) - ] - if not bad.empty: - worst_person = bad.idxmax() - raise ValueError( - "Synthetic capital-gains lots do not reconcile to person anchors; " - f"worst person_id={worst_person!r}, delta={float(bad.max())}" - ) - - -def write_capital_gains_lots_sqlite( - lots: pd.DataFrame, - path: str | Path, - *, - metadata: dict[str, Any] | None = None, - if_exists: str = "replace", -) -> Path: - """Persist synthetic lots to a compact SQLite artifact.""" - - output_path = Path(path) - output_path.parent.mkdir(parents=True, exist_ok=True) - missing = set(CAPITAL_GAINS_LOT_COLUMNS) - set(lots.columns) - if missing: - raise ValueError(f"lots is missing columns: {sorted(missing)}") - with sqlite3.connect(output_path) as conn: - lots.loc[:, CAPITAL_GAINS_LOT_COLUMNS].to_sql( - "capital_gains_lots", - conn, - index=False, - if_exists=if_exists, - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_capital_gains_lots_person_period - ON capital_gains_lots (person_id, tax_year) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_capital_gains_lots_tax_unit_period - ON capital_gains_lots (tax_unit_id, tax_year) - """ - ) - conn.execute("DROP TABLE IF EXISTS capital_gains_lot_metadata") - conn.execute( - """ - CREATE TABLE capital_gains_lot_metadata ( - key TEXT PRIMARY KEY, - value TEXT NOT NULL - ) - """ - ) - for key, value in (metadata or {}).items(): - conn.execute( - """ - INSERT INTO capital_gains_lot_metadata (key, value) - VALUES (?, ?) 
- """, - (str(key), json.dumps(value, sort_keys=True)), - ) - return output_path - - -def read_capital_gains_lots_sqlite(path: str | Path) -> pd.DataFrame: - """Read a synthetic capital-gains lot SQLite artifact.""" - - with sqlite3.connect(Path(path)) as conn: - return pd.read_sql_query( - """ - SELECT * - FROM capital_gains_lots - ORDER BY lot_id - """, - conn, - ) - - -def synthetic_capital_gains_lot_metadata( - config: SyntheticCapitalGainsLotConfig, - *, - period: int, - source_gain_column: str = "long_term_capital_gains_before_response", -) -> dict[str, Any]: - """Build metadata for the current synthetic-lot artifact contract.""" - - return { - "format_version": 1, - "tax_year": int(period), - "source_gain_column": source_gain_column, - "config": asdict(config), - "method": "deterministic_anchor_preserving_synthetic_lots_phase_1", - "capital_gains_lots_issue": ( - "https://github.com/PolicyEngine/policyengine-us-data/issues/1127" - ), - "limitations": ( - "Phase 1 prototype: no SOCA calibration, no asset type assignment, " - "and no mixed gross gain/loss reconstruction." 
- ), - } - - -def _lot_count(amount: float, config: SyntheticCapitalGainsLotConfig) -> int: - if amount >= config.high_gain_threshold: - return min(config.max_lots_per_person, 4) - if amount >= config.medium_gain_threshold: - return min(config.max_lots_per_person, 2) - return 1 - - -def _deterministic_lot_shares(n_lots: int, rng: np.random.Generator) -> np.ndarray: - if n_lots == 1: - return np.array([1.0], dtype=float) - shares = rng.dirichlet(np.full(n_lots, 1.5)) - shares[-1] = 1.0 - float(shares[:-1].sum()) - return shares - - -def _draw_holding_periods( - n_lots: int, - rng: np.random.Generator, - config: SyntheticCapitalGainsLotConfig, -) -> np.ndarray: - buckets = np.array([2, 3, 5, 8, 12, 20, 30], dtype=int) - weights = np.array([0.12, 0.16, 0.2, 0.18, 0.16, 0.12, 0.06], dtype=float) - holding_periods = rng.choice(buckets, size=n_lots, replace=True, p=weights) - return np.clip(holding_periods, 2, config.max_holding_period_years) - - -def _basis_and_proceeds( - gain_or_loss: float, - holding_period_years: int, - config: SyntheticCapitalGainsLotConfig, -) -> tuple[float, float]: - if gain_or_loss > 0: - raw_basis_ratio = 1.0 / ( - (1.0 + config.annual_nominal_return) ** holding_period_years - ) - basis_ratio = float( - np.clip( - raw_basis_ratio, - config.gain_basis_ratio_floor, - config.gain_basis_ratio_ceiling, - ) - ) - basis = gain_or_loss * basis_ratio / (1.0 - basis_ratio) - return basis + gain_or_loss, basis - - loss = abs(gain_or_loss) - basis_ratio = max( - 1.0 + config.annual_nominal_return * holding_period_years / 4.0, - config.loss_basis_ratio_floor, - ) - basis = loss * basis_ratio / (basis_ratio - 1.0) - return basis - loss, basis - - -def _optional_int(value: Any) -> int | None: - if value is None or pd.isna(value): - return None - return int(value) - - -def _stable_seed(*parts: Any) -> int: - payload = "|".join(str(part) for part in parts).encode() - digest = blake2b(payload, digest_size=8).digest() - return int.from_bytes(digest, 
byteorder="little", signed=False) diff --git a/src/microplex_us/cps_synthetic.py b/src/microplex_us/cps_synthetic.py deleted file mode 100644 index f61ec2ea..00000000 --- a/src/microplex_us/cps_synthetic.py +++ /dev/null @@ -1,260 +0,0 @@ -"""CPS-specific summary-stat synthetic data helpers.""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Self - -import numpy as np -import pandas as pd -from scipy import stats as scipy_stats -from scipy.interpolate import interp1d - - -@dataclass -class CPSSummaryStats: - """Summary statistics needed to generate CPS-shaped synthetic records.""" - - variables: list[str] - means: dict[str, float] - stds: dict[str, float] - quantiles: dict[str, np.ndarray] - zero_fractions: dict[str, float] - discrete_vars: list[str] - discrete_distributions: dict[str, dict[int, float]] - correlation_matrix: np.ndarray - continuous_vars: list[str] = field(default_factory=list) - quantile_values: dict[str, np.ndarray] = field(default_factory=dict) - min_values: dict[str, float] = field(default_factory=dict) - max_values: dict[str, float] = field(default_factory=dict) - - @classmethod - def from_dataframe( - cls, - data: pd.DataFrame, - weight_col: str | None = None, - discrete_threshold: int = 10, - ) -> Self: - variables = [column for column in data.columns if column != weight_col] - discrete_vars: list[str] = [] - continuous_vars: list[str] = [] - for variable in variables: - if ( - data[variable].nunique() <= discrete_threshold - and data[variable].dtype in [np.int64, np.int32, int, "int64", "int32"] - ): - discrete_vars.append(variable) - else: - continuous_vars.append(variable) - - if weight_col and weight_col in data.columns: - weights = data[weight_col].to_numpy(dtype=float) - else: - weights = np.ones(len(data), dtype=float) - weights = weights / weights.sum() - - means: dict[str, float] = {} - stds: dict[str, float] = {} - quantiles: dict[str, np.ndarray] = {} - quantile_values: 
dict[str, np.ndarray] = {} - zero_fractions: dict[str, float] = {} - min_values: dict[str, float] = {} - max_values: dict[str, float] = {} - - for variable in continuous_vars: - values = data[variable].to_numpy(dtype=float) - means[variable] = float(np.sum(weights * values)) - variance = np.sum(weights * (values - means[variable]) ** 2) - stds[variable] = float(np.sqrt(variance)) - zero_fractions[variable] = float(np.mean(values == 0)) - - positive_values = values[values > 0] - if len(positive_values) > 0: - min_values[variable] = float(np.min(positive_values)) - max_values[variable] = float(np.max(positive_values)) - q_probs = np.array([0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]) - q_values = np.quantile(positive_values, q_probs) - else: - min_values[variable] = 0.0 - max_values[variable] = 1.0 - q_probs = np.array([0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]) - q_values = np.zeros_like(q_probs) - quantiles[variable] = q_probs - quantile_values[variable] = q_values - - discrete_distributions: dict[str, dict[int, float]] = {} - for variable in discrete_vars: - values = data[variable].to_numpy() - distribution: dict[int, float] = {} - for category in np.unique(values): - distribution[int(category)] = float(np.sum(weights[values == category])) - discrete_distributions[variable] = distribution - - numeric_data = data[variables].apply(pd.to_numeric, errors="coerce") - correlation_matrix = numeric_data.corr(method="spearman").fillna(0).to_numpy() - - return cls( - variables=variables, - means=means, - stds=stds, - quantiles=quantiles, - zero_fractions=zero_fractions, - discrete_vars=discrete_vars, - discrete_distributions=discrete_distributions, - correlation_matrix=correlation_matrix, - continuous_vars=continuous_vars, - quantile_values=quantile_values, - min_values=min_values, - max_values=max_values, - ) - - -class CPSSyntheticGenerator: - """Gaussian-copula generator over CPS summary statistics.""" - - def 
__init__(self, stats: CPSSummaryStats): - self.stats = stats - correlation = stats.correlation_matrix.copy() - np.fill_diagonal(correlation, 1.0) - min_eigenvalue = np.min(np.linalg.eigvalsh(correlation)) - if min_eigenvalue < 1e-6: - correlation = correlation + (1e-6 - min_eigenvalue) * np.eye(correlation.shape[0]) - self.cholesky = np.linalg.cholesky(correlation) - self._build_marginal_transforms() - - def _build_marginal_transforms(self) -> None: - self.marginal_transforms: dict[str, interp1d] = {} - for variable in self.stats.continuous_vars: - probabilities = np.concatenate( - [[0.0], self.stats.quantiles[variable], [1.0]] - ) - values = np.concatenate( - [ - [self.stats.min_values[variable]], - self.stats.quantile_values[variable], - [self.stats.max_values[variable]], - ] - ) - unique_mask = np.concatenate([[True], np.diff(probabilities) > 1e-10]) - self.marginal_transforms[variable] = interp1d( - probabilities[unique_mask], - values[unique_mask], - kind="linear", - bounds_error=False, - fill_value=(values[unique_mask][0], values[unique_mask][-1]), - ) - - def generate(self, n: int, seed: int | None = None) -> pd.DataFrame: - if seed is not None: - np.random.seed(seed) - z = np.random.standard_normal((n, len(self.stats.variables))) - z_correlated = z @ self.cholesky.T - uniforms = scipy_stats.norm.cdf(z_correlated) - - result: dict[str, np.ndarray] = {} - for index, variable in enumerate(self.stats.variables): - if variable in self.stats.discrete_vars: - result[variable] = self._sample_discrete(variable, uniforms[:, index]) - else: - result[variable] = self._transform_continuous(variable, uniforms[:, index]) - return pd.DataFrame(result) - - def _sample_discrete(self, variable: str, uniforms: np.ndarray) -> np.ndarray: - distribution = self.stats.discrete_distributions[variable] - categories = sorted(distribution) - probabilities = np.array([distribution[category] for category in categories], dtype=float) - probabilities = probabilities / probabilities.sum() 
- cdf = np.cumsum(probabilities) - - result = np.zeros(len(uniforms), dtype=int) - for index, category in enumerate(categories): - lower = cdf[index - 1] if index > 0 else 0.0 - mask = (uniforms > lower) & (uniforms <= cdf[index]) - result[mask] = category - result[uniforms > cdf[-1]] = categories[-1] - return result - - def _transform_continuous(self, variable: str, uniforms: np.ndarray) -> np.ndarray: - zero_fraction = self.stats.zero_fractions.get(variable, 0.0) - if zero_fraction <= 0: - return self.marginal_transforms[variable](uniforms) - - result = np.zeros(len(uniforms), dtype=float) - positive_mask = uniforms >= zero_fraction - if positive_mask.any(): - positive_uniforms = (uniforms[positive_mask] - zero_fraction) / (1 - zero_fraction) - positive_uniforms = np.clip(positive_uniforms, 0, 1) - result[positive_mask] = self.marginal_transforms[variable](positive_uniforms) - return result - - -def validate_synthetic( - reference: pd.DataFrame, - synthetic: pd.DataFrame, - variables: list[str] | None = None, -) -> dict[str, dict[str, float] | float]: - """Compare synthetic data to a CPS-like reference table.""" - if variables is None: - variables = [column for column in reference.columns if column in synthetic.columns] - - metrics: dict[str, dict[str, float] | float] = { - "ks_statistics": {}, - "mean_errors": {}, - "std_errors": {}, - "correlation_errors": {}, - } - - for variable in variables: - reference_values = reference[variable].dropna().to_numpy() - synthetic_values = synthetic[variable].dropna().to_numpy() - if len(reference_values) == 0 or len(synthetic_values) == 0: - continue - - ks_statistic, _ = scipy_stats.ks_2samp(reference_values, synthetic_values) - metrics["ks_statistics"][variable] = float(ks_statistic) - - reference_mean = float(np.mean(reference_values)) - synthetic_mean = float(np.mean(synthetic_values)) - metrics["mean_errors"][variable] = ( - abs(synthetic_mean - reference_mean) / abs(reference_mean) - if reference_mean != 0 - else 
abs(synthetic_mean) - ) - - reference_std = float(np.std(reference_values)) - synthetic_std = float(np.std(synthetic_values)) - metrics["std_errors"][variable] = ( - abs(synthetic_std - reference_std) / reference_std - if reference_std != 0 - else abs(synthetic_std) - ) - - numeric_vars = [variable for variable in variables if variable in reference.columns and variable in synthetic.columns] - if len(numeric_vars) >= 2: - reference_corr = reference[numeric_vars].corr(method="spearman").fillna(0) - synthetic_corr = synthetic[numeric_vars].corr(method="spearman").fillna(0) - for index, left in enumerate(numeric_vars): - for right in numeric_vars[index + 1:]: - pair = f"{left}_vs_{right}" - metrics["correlation_errors"][pair] = abs( - float(synthetic_corr.loc[left, right]) - float(reference_corr.loc[left, right]) - ) - - metrics["mean_ks"] = ( - float(np.mean(list(metrics["ks_statistics"].values()))) - if metrics["ks_statistics"] - else 0.0 - ) - metrics["mean_corr_error"] = ( - float(np.mean(list(metrics["correlation_errors"].values()))) - if metrics["correlation_errors"] - else 0.0 - ) - return metrics - - -__all__ = [ - "CPSSummaryStats", - "CPSSyntheticGenerator", - "validate_synthetic", -] diff --git a/src/microplex_us/data.py b/src/microplex_us/data.py deleted file mode 100644 index 5bdb3254..00000000 --- a/src/microplex_us/data.py +++ /dev/null @@ -1,269 +0,0 @@ -"""US-specific CPS ASEC data helpers.""" - -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pandas as pd - -PACKAGE_ROOT = Path(__file__).resolve().parents[2] -DEFAULT_DATA_DIR_CANDIDATES = ( - PACKAGE_ROOT / "data", - PACKAGE_ROOT.parent / "microplex" / "data", -) -DEFAULT_DATA_DIR = next( - (candidate for candidate in DEFAULT_DATA_DIR_CANDIDATES if candidate.exists()), - DEFAULT_DATA_DIR_CANDIDATES[0], -) - - -def load_cps_asec( - data_dir: str | Path | None = None, - households_only: bool = False, - persons_only: bool = False, -) -> pd.DataFrame | 
tuple[pd.DataFrame, pd.DataFrame]: - """Load preprocessed CPS ASEC household and person parquet files.""" - if data_dir is None: - data_dir = DEFAULT_DATA_DIR - else: - data_dir = Path(data_dir) - - household_path = data_dir / "cps_asec_households.parquet" - person_path = data_dir / "cps_asec_persons.parquet" - - if not household_path.exists() or not person_path.exists(): - raise FileNotFoundError( - f"CPS ASEC data files not found in {data_dir}.\n" - "Run the downloader in `microplex-us` or provide preprocessed parquet files." - ) - - if households_only: - return pd.read_parquet(household_path) - if persons_only: - return pd.read_parquet(person_path) - - households = pd.read_parquet(household_path) - persons = pd.read_parquet(person_path) - return households, persons - - -def load_cps_for_synthesis( - data_dir: str | Path | None = None, - sample_fraction: float | None = None, - random_state: int = 42, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Load CPS ASEC and normalize it for the hierarchical synthesizer.""" - households, persons = load_cps_asec(data_dir) - households = _prepare_household_data(households) - persons = _prepare_person_data(persons) - - if sample_fraction is not None and 0 < sample_fraction < 1: - rng = np.random.default_rng(random_state) - sampled_household_ids = rng.choice( - households["household_id"].unique(), - size=int(len(households) * sample_fraction), - replace=False, - ) - households = households[households["household_id"].isin(sampled_household_ids)] - persons = persons[persons["household_id"].isin(sampled_household_ids)] - - return households, persons - - -def _prepare_household_data(df: pd.DataFrame) -> pd.DataFrame: - """Ensure CPS household data has the columns expected by synthesis code.""" - result = df.copy() - required_cols = { - "household_id": lambda: np.arange(len(result)), - "n_persons": lambda: np.ones(len(result)), - "n_adults": lambda: np.ones(len(result)), - "n_children": lambda: np.zeros(len(result)), - "state_fips": 
lambda: np.zeros(len(result)), - "tenure": lambda: np.ones(len(result)), - "hh_weight": lambda: np.ones(len(result)), - } - - for column, default_factory in required_cols.items(): - if column not in result.columns: - result[column] = default_factory() - - for column in ["n_persons", "n_adults", "n_children", "state_fips", "tenure"]: - result[column] = ( - pd.to_numeric(result[column], errors="coerce").fillna(0).astype(int) - ) - - result["hh_weight"] = ( - pd.to_numeric(result["hh_weight"], errors="coerce").fillna(1).astype(float) - ) - result["n_persons"] = result["n_persons"].clip(lower=1) - result["n_adults"] = result["n_adults"].clip(lower=1) - return result - - -def _prepare_person_data(df: pd.DataFrame) -> pd.DataFrame: - """Ensure CPS person data has the columns expected by synthesis code.""" - result = df.copy() - required_cols = { - "person_id": lambda: np.arange(len(result)), - "household_id": lambda: np.zeros(len(result), dtype=int), - "age": lambda: np.full(len(result), 30), - "sex": lambda: np.ones(len(result)), - "income": lambda: np.zeros(len(result)), - "employment_status": lambda: np.zeros(len(result)), - "education": lambda: np.ones(len(result)), - "relationship_to_head": lambda: np.ones(len(result)), - } - - for column, default_factory in required_cols.items(): - if column not in result.columns: - result[column] = default_factory() - - for column in [ - "age", - "sex", - "employment_status", - "education", - "relationship_to_head", - ]: - result[column] = ( - pd.to_numeric(result[column], errors="coerce").fillna(0).astype(int) - ) - result["income"] = ( - pd.to_numeric(result["income"], errors="coerce").fillna(0).astype(float) - ) - result["age"] = result["age"].clip(0, 120) - return result - - -def create_sample_data( - n_households: int = 1000, - seed: int = 42, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Create CPS-shaped sample data for examples and tests.""" - rng = np.random.default_rng(seed) - - n_persons = rng.choice( - [1, 2, 3, 4, 5, 
6, 7], - n_households, - p=[0.28, 0.34, 0.16, 0.12, 0.06, 0.03, 0.01], - ) - households = pd.DataFrame( - { - "household_id": np.arange(n_households), - "n_persons": n_persons, - "state_fips": rng.choice( - [ - 6, 48, 12, 36, 42, 17, 39, 13, 37, 26, 4, 34, 51, 53, 25, - 47, 29, 18, 55, 21, 24, 41, 8, 22, 5, 28, 20, 31, 35, 23, - ], - n_households, - ), - "tenure": rng.choice([1, 2, 3], n_households, p=[0.65, 0.34, 0.01]), - "hh_weight": rng.lognormal(8, 0.5, n_households), - } - ) - households["n_children"] = np.minimum( - rng.binomial(households["n_persons"], 0.25), - households["n_persons"] - 1, - ) - households["n_adults"] = households["n_persons"] - households["n_children"] - - people: list[dict[str, float | int]] = [] - person_id = 0 - for _, household in households.iterrows(): - household_id = household["household_id"] - n_adults = int(household["n_adults"]) - n_children = int(household["n_children"]) - - for adult_index in range(n_adults): - age = int(rng.integers(18, 85)) - education = int(rng.choice([1, 2, 3, 4], p=[0.10, 0.28, 0.30, 0.32])) - if rng.random() < 0.15: - income = 0.0 - else: - base_income = float(rng.lognormal(10.5, 1.0)) - age_factor = 1 + 0.02 * min(age - 18, 30) - 0.01 * max(age - 55, 0) - education_factor = 1 + 0.3 * education - income = max(0.0, base_income * age_factor * education_factor) - people.append( - { - "person_id": person_id, - "household_id": household_id, - "age": age, - "sex": int(rng.choice([1, 2])), - "income": income, - "employment_status": int(rng.choice([0, 1, 2], p=[0.35, 0.60, 0.05])), - "education": education, - "relationship_to_head": 1 if adult_index == 0 else (2 if adult_index == 1 else 3), - } - ) - person_id += 1 - - for _child_index in range(n_children): - people.append( - { - "person_id": person_id, - "household_id": household_id, - "age": int(rng.integers(0, 18)), - "sex": int(rng.choice([1, 2])), - "income": 0.0, - "employment_status": 0, - "education": 1, - "relationship_to_head": 4, - } - ) - person_id += 
1 - - persons = pd.DataFrame(people) - return households, persons - - -def get_data_info(data_dir: str | Path | None = None) -> dict: - """Report availability and shape of local CPS ASEC parquet files.""" - if data_dir is None: - data_dir = DEFAULT_DATA_DIR - else: - data_dir = Path(data_dir) - - info = { - "data_dir": str(data_dir), - "households": {"exists": False}, - "persons": {"exists": False}, - } - - household_path = data_dir / "cps_asec_households.parquet" - person_path = data_dir / "cps_asec_persons.parquet" - - if household_path.exists(): - households = pd.read_parquet(household_path) - info["households"] = { - "exists": True, - "path": str(household_path), - "size_mb": household_path.stat().st_size / 1e6, - "n_records": len(households), - "columns": list(households.columns), - } - - if person_path.exists(): - persons = pd.read_parquet(person_path) - info["persons"] = { - "exists": True, - "path": str(person_path), - "size_mb": person_path.stat().st_size / 1e6, - "n_records": len(persons), - "columns": list(persons.columns), - } - - return info - - -__all__ = [ - "DEFAULT_DATA_DIR", - "load_cps_asec", - "load_cps_for_synthesis", - "_prepare_household_data", - "_prepare_person_data", - "create_sample_data", - "get_data_info", -] diff --git a/src/microplex_us/data_sources/__init__.py b/src/microplex_us/data_sources/__init__.py deleted file mode 100644 index 6644fa8f..00000000 --- a/src/microplex_us/data_sources/__init__.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Data-source convenience exports for microplex-us. - -The package root resolves providers lazily so importing ``microplex_us.data_sources`` -does not require optional survey, benchmark, or core integration dependencies. 
-""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - - -def _exports(module: str, names: tuple[str, ...]) -> dict[str, tuple[str, str]]: - return {name: (module, name) for name in names} - - -_EXPORTS: dict[str, tuple[str, str]] = { - **_exports( - "microplex_us.data_sources.cps", - ( - "CPSDataset", - "CPSASECSourceProvider", - "CPSASECParquetSourceProvider", - "download_cps_asec", - "get_available_years", - "PERSON_VARIABLES", - "HOUSEHOLD_VARIABLES", - ), - ), - "load_cps_asec_polars": ( - "microplex_us.data_sources.cps", - "load_cps_asec", - ), - **_exports( - "microplex_us.data_sources.cps_mappings", - ( - "CoverageLevel", - "CoverageGap", - "VariableMapping", - "map_age", - "map_earned_income", - "map_filing_status", - "map_is_blind", - "map_is_dependent", - "map_ctc_qualifying_children", - "map_agi_proxy", - "map_household_size", - "get_mapping_metadata", - "get_all_mappings", - "coverage_summary", - ), - ), - **_exports( - "microplex_us.data_sources.cps_transform", - ( - "TransformedDataset", - "transform_cps_to_policyengine", - ), - ), - **_exports( - "microplex_us.data_sources.donor_surveys", - ( - "ACSSourceProvider", - "DonorSurveyProviderSpec", - "DonorSurveySourceProvider", - "SIPPSourceProvider", - "SIPPTipsSourceProvider", - "SIPPAssetsSourceProvider", - "SCFSourceProvider", - "resolve_sipp_donor_survey_spec", - ), - ), - **_exports( - "microplex_us.data_sources.family_imputation_benchmark", - ( - "DecomposableFamilyBenchmarkSpec", - "FamilyImputationMethodBenchmark", - "FamilyImputationBenchmarkResult", - "benchmark_decomposable_family_imputers", - "reconcile_component_predictions_to_total", - ), - ), - **_exports( - "microplex_us.data_sources.forbes", - ( - "FixedSpineResidualizationResult", - "FixedSpineTargetContribution", - "ForbesFixedSpine", - "ForbesFixedSpineConfig", - "append_forbes_fixed_spine_tables", - "build_forbes_fixed_spine", - "fixed_spine_contribution_diagnostics_json", - 
"forbes_fixed_spine_variable_bindings", - "read_forbes_fixed_spine_records", - "residualize_targets_for_fixed_spine", - ), - ), - **_exports( - "microplex_us.data_sources.puf", - ( - "load_puf", - "PUFSourceProvider", - "download_puf", - "map_puf_variables", - "uprate_puf", - "expand_to_persons", - "PUF_VARIABLE_MAP", - "UPRATING_FACTORS", - "PUF_EXCLUSIVE_VARS", - "SHARED_VARS", - ), - ), - **_exports( - "microplex_us.data_sources.psid", - ( - "PSIDDataset", - "PSIDSourceProvider", - "load_psid_panel", - "extract_transition_rates", - "get_age_specific_rates", - "calibrate_marriage_rates", - "calibrate_divorce_rates", - "create_psid_fusion_source", - "PSID_TO_MICROPLEX_VARS", - ), - ), -} - -__all__ = list(_EXPORTS) - - -def __getattr__(name: str) -> Any: - """Resolve data-source exports on first access.""" - export = _EXPORTS.get(name) - if export is None: - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - module_name, attribute_name = export - value = getattr(import_module(module_name), attribute_name) - globals()[name] = value - return value diff --git a/src/microplex_us/data_sources/cps.py b/src/microplex_us/data_sources/cps.py deleted file mode 100644 index 86404fb9..00000000 --- a/src/microplex_us/data_sources/cps.py +++ /dev/null @@ -1,2865 +0,0 @@ -""" -CPS ASEC (Annual Social and Economic Supplement) data loading. - -The CPS ASEC is the primary source for income and poverty statistics in the US. -Released annually in March, it contains detailed income, employment, and -demographic information for ~100K households. 
- -Data source: https://www.census.gov/data/datasets/time-series/demo/cps/cps-asec.html -""" - -import zipfile -from collections.abc import Callable -from dataclasses import dataclass -from pathlib import Path - -import numpy as np -import pandas as pd -import polars as pl -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceArchetype, - SourceDescriptor, - SourceQuery, - TimeStructure, - apply_source_query, -) - -from microplex_us.data_sources.cps_age import randomize_cps_topcoded_age_80_84 -from microplex_us.data_sources.sampling import ( - sample_frame_with_state_floor, - sample_frame_without_replacement, -) -from microplex_us.source_registry import resolve_source_variable_capabilities - -# Default cache directory -DEFAULT_CACHE_DIR = Path.home() / ".cache" / "microplex" -CPS_ASEC_PROCESSED_CACHE_VERSION = "20260604_signed_self_employment_losses" - -CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP = { - "reported_has_direct_purchase_health_coverage_at_interview": "NOW_DIR", - "reported_has_marketplace_health_coverage_at_interview": "NOW_MRK", - "reported_has_subsidized_marketplace_health_coverage_at_interview": "NOW_MRKS", - "reported_has_unsubsidized_marketplace_health_coverage_at_interview": "NOW_MRKUN", - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview": ( - "NOW_NONM" - ), - "reported_has_employer_sponsored_health_coverage_at_interview": "NOW_GRP", - "reported_has_medicare_health_coverage_at_interview": "NOW_MCARE", - "reported_has_medicaid_health_coverage_at_interview": "NOW_CAID", - "reported_has_means_tested_health_coverage_at_interview": "NOW_MCAID", - "reported_has_chip_health_coverage_at_interview": "NOW_PCHIP", - "reported_has_other_means_tested_health_coverage_at_interview": "NOW_OTHMT", - "reported_has_tricare_health_coverage_at_interview": "NOW_MIL", - "reported_has_champva_health_coverage_at_interview": "NOW_CHAMPVA", - 
"reported_has_va_health_coverage_at_interview": "NOW_VACARE", - "reported_has_indian_health_service_coverage_at_interview": "NOW_IHSFLG", -} - -CURRENT_HEALTH_COVERAGE_RULE_INPUT_ALIAS_MAP = { - "has_marketplace_health_coverage_at_interview": ( - "reported_has_marketplace_health_coverage_at_interview" - ), - "has_non_marketplace_direct_purchase_health_coverage_at_interview": ( - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview" - ), - "has_medicaid_health_coverage_at_interview": ( - "reported_has_medicaid_health_coverage_at_interview" - ), - "has_other_means_tested_health_coverage_at_interview": ( - "reported_has_other_means_tested_health_coverage_at_interview" - ), - "has_tricare_health_coverage_at_interview": ( - "reported_has_tricare_health_coverage_at_interview" - ), - "has_champva_health_coverage_at_interview": ( - "reported_has_champva_health_coverage_at_interview" - ), - "has_va_health_coverage_at_interview": ( - "reported_has_va_health_coverage_at_interview" - ), - "has_indian_health_service_coverage_at_interview": ( - "reported_has_indian_health_service_coverage_at_interview" - ), -} - -CENSUS_OCCUPATION_CODE_TO_TTOC = { - 725: 502, - 2350: 507, - 2633: 502, - 2752: 206, - 2755: 207, - 2770: 208, - 2910: 503, - 3602: 501, - 3630: 602, - 4000: 105, - 4010: 106, - 4030: 106, - 4040: 101, - 4055: 107, - 4110: 102, - 4120: 103, - 4130: 104, - 4140: 108, - 4150: 109, - 4160: 106, - 4230: 304, - 4251: 402, - 4350: 506, - 4420: 210, - 4500: 603, - 4510: 603, - 4521: 605, - 4522: 601, - 4600: 508, - 4621: 607, - 4655: 501, - 5130: 203, - 5300: 303, - 6355: 403, - 6442: 404, - 7120: 401, - 7200: 409, - 7315: 405, - 7320: 406, - 7340: 401, - 7540: 408, - 7610: 401, - 7800: 110, - 8510: 401, - 9122: 806, - 9141: 803, - 9142: 802, - 9350: 801, - 9610: 805, - 9620: 809, -} - -# CPS ASEC data URLs by year -CPS_URLS = { - 2025: "https://www2.census.gov/programs-surveys/cps/datasets/2025/march/asecpub25csv.zip", - 2024: 
"https://www2.census.gov/programs-surveys/cps/datasets/2024/march/asecpub24csv.zip", - 2023: "https://www2.census.gov/programs-surveys/cps/datasets/2023/march/asecpub23csv.zip", - 2022: "https://www2.census.gov/programs-surveys/cps/datasets/2022/march/asecpub22csv.zip", - 2021: "https://www2.census.gov/programs-surveys/cps/datasets/2021/march/asecpub21csv.zip", -} - -# Key variable mappings (Census variable name -> our name) -PERSON_VARIABLES = { - # Demographics - "A_AGE": "age", - "A_SEX": "sex", - "PRDTRACE": "race", - "PEHSPNON": "hispanic", - "PRDTHSP": "_cps_hispanic_code", - "A_HGA": "education", - "PEDISDRS": "_disability_dressing", - "PEDISEAR": "_disability_hearing", - "PEDISEYE": "_disability_vision", - "PEDISOUT": "_disability_errands", - "PEDISPHY": "_disability_physical", - "PEDISREM": "_disability_cognitive", - "DIS_VAL1": "_disability_income_1", - "DIS_SC1": "_disability_income_code_1", - "DIS_VAL2": "_disability_income_2", - "DIS_SC2": "_disability_income_code_2", - "RESNSS1": "_social_security_reason_1", - "RESNSS2": "_social_security_reason_2", - # Employment - "A_CLSWKR": "class_of_worker", - "A_WKSTAT": "work_status", - "A_HRS1": "hours_worked", - "A_HSCOL": "_high_school_or_college_status", - "A_HRLYWK": "_is_paid_hourly_code", - "A_HRSPAY": "_hourly_pay_cents", - "A_UNMEM": "_union_member_code", - "POCCU2": "detailed_occupation_recode", - "PEIOOCC": "_detailed_census_occupation_code", - # Income (annual) - "WSAL_VAL": "wage_income", - "SEMP_VAL": "self_employment_income", - # Bundled retirement-contribution total (Census asks a single - # "how much did you contribute to retirement accounts?" question). - # Split into account-type-specific desired leaves in _process_persons, - # mirroring eCPS cps.py:1500-1552. Staging column dropped after split. 
- "RETCB_VAL": "_retirement_contributions", - "INT_VAL": "interest_income", - "DIV_VAL": "dividend_income", - "RNT_VAL": "rental_income", - "ANN_VAL": "_annuity_income", - "PNSN_VAL": "_pension_income", - "SS_VAL": "social_security", - "SSI_VAL": "ssi", - "UC_VAL": "unemployment_compensation", - "LKWEEKS": "weeks_unemployed", - "VET_VAL": "veterans_benefits", - "WC_VAL": "workers_compensation", - "DST_SC1": "_retirement_distribution_code_1", - "DST_SC2": "_retirement_distribution_code_2", - "DST_SC1_YNG": "_retirement_distribution_code_1_yng", - "DST_SC2_YNG": "_retirement_distribution_code_2_yng", - "DST_VAL1": "_retirement_distribution_value_1", - "DST_VAL2": "_retirement_distribution_value_2", - "DST_VAL1_YNG": "_retirement_distribution_value_1_yng", - "DST_VAL2_YNG": "_retirement_distribution_value_2_yng", - # CPS-derived direct income copies (mirror eCPS cps.py:1493-1495). - "SRVS_VAL": "survivor_benefits", - "ED_VAL": "educational_assistance", - "FIN_VAL": "financial_assistance", - "PTOTVAL": "total_person_income", - "OI_OFF": "_other_income_code", - "OI_VAL": "_other_income_value", - # Benefits - "PAW_VAL": "public_assistance", - "CSP_VAL": "child_support_received", - "CHSP_VAL": "child_support_expense", - "MCARE": "has_medicare", - "MCAID": "has_medicaid", - "NOW_GRP": "has_esi", - "NOW_MRK": "has_marketplace_health_coverage", - **{ - census_name: f"_{leaf}" - for leaf, census_name in CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP.items() - }, - "NOW_PRIV": "_reported_has_private_health_coverage_at_interview", - "NOW_PUB": "_reported_has_public_health_coverage_at_interview", - "NOW_COV": "_reported_current_health_coverage_code", - # Employer-sponsored insurance policyholder + premium inputs (eCPS - # cps.py:197-275). NOW_OWNGRP flags own-name current group (ESI) coverage; - # NOW_HIPAID is who pays the premium; NOW_GRPFTYP is family vs self-only - # plan. These seed the ESI policyholder recode and the premium imputation. 
- "NOW_OWNGRP": "_now_owngrp", - "NOW_HIPAID": "_now_hipaid", - "NOW_GRPFTYP": "_now_grpftyp", - "PHIP_VAL": "health_insurance_premiums_without_medicare_part_b", - "POTC_VAL": "over_the_counter_health_expenses", - "PMED_VAL": "other_medical_expenses", - "PEMCPREM": "medicare_part_b_premiums", - "WICYN": "_receives_wic", - "SPM_CAPHOUSESUB": "spm_unit_capped_housing_subsidy_reported", - "SPM_ENGVAL": "spm_unit_energy_subsidy", - "SPM_CAPWKCCXPNS": "spm_unit_capped_work_childcare_expenses", - "SPM_CHILDCAREXPNS": "spm_unit_pre_subsidy_childcare_expenses", - # Person relationship-to-householder code (eCPS cps.py:190-195, :1219). - # Codes 43/44/46/47 mark an unmarried partner of the household head. - "PERRP": "_person_relationship_to_householder", - # Identifiers - "PH_SEQ": "household_id", - "GESTFIPS": "state_fips", - "PF_SEQ": "family_id", - "TAX_ID": "tax_unit_id", - "SPM_ID": "spm_unit_id", - "A_LINENO": "person_number", - "A_SPOUSE": "spouse_person_number", - "A_FAMREL": "family_relationship", - "A_MARITL": "marital_status", - # Weights - "A_FNLWGT": "weight", - "MARSUPWT": "march_supplement_weight", -} - -HOUSEHOLD_VARIABLES = { - "H_SEQ": "household_id", - "GESTFIPS": "state_fips", - "GTCO": "county_fips", - "GTCBSA": "cbsa", - "HRHTYPE": "household_type", - "H_NUMPER": "household_size", - "HHINC": "household_income_bracket", - "HTOTVAL": "household_total_income", - "HSUP_WGT": "household_weight", -} - -PERSON_OBSERVATION_EXCLUDED_COLUMNS = ( - "person_id", - "household_id", - "weight", - "march_supplement_weight", - "year", -) - -HOUSEHOLD_OBSERVATION_EXCLUDED_COLUMNS = ( - "household_id", - "household_weight", - "year", -) - -CPS_INCOME_ALIAS_COMPONENT_GROUPS = ( - ("wage_income",), - ("self_employment_income",), - ("interest_income",), - ("dividend_income",), - ("rental_income",), - ("social_security",), - ("pension_income", "taxable_pension_income"), - ("unemployment_compensation",), - ("alimony_income",), -) -CPS_INCOME_ALIAS_COMPONENTS = tuple( - column 
for group in CPS_INCOME_ALIAS_COMPONENT_GROUPS for column in group -) - -PERSON_NONNEGATIVE_VALUE_COLUMNS = ( - "wage_income", - "interest_income", - "dividend_income", - "rental_income", - "social_security", - "ssi", - "unemployment_compensation", - "public_assistance", - "total_person_income", - "alimony_income", - "child_support_received", - "child_support_expense", - "disability_benefits", - "health_insurance_premiums_without_medicare_part_b", - "over_the_counter_health_expenses", - "other_medical_expenses", - "medicare_part_b_premiums", - "social_security_disability", - "social_security_retirement", - "social_security_survivors", - "social_security_dependents", - "spm_unit_energy_subsidy", - "spm_unit_capped_housing_subsidy_reported", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_pre_subsidy_childcare_expenses", - "hourly_wage", - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - "taxable_private_pension_income", - "tax_exempt_private_pension_income", - "taxable_401k_distributions", - "tax_exempt_401k_distributions", - "taxable_403b_distributions", - "tax_exempt_403b_distributions", - "regular_ira_distributions", - "roth_ira_distributions", - "tax_exempt_ira_distributions", - "taxable_sep_distributions", - "tax_exempt_sep_distributions", - "other_type_retirement_account_distributions", - "keogh_distributions", - "veterans_benefits", - "workers_compensation", - "weeks_unemployed", -) - -PERSON_ZERO_DEFAULT_VALUE_COLUMNS = ( - "alimony_income", - "child_support_received", - "child_support_expense", - "disability_benefits", - "health_insurance_premiums_without_medicare_part_b", - "over_the_counter_health_expenses", - "other_medical_expenses", - "medicare_part_b_premiums", - "social_security_disability", - "social_security_retirement", - "social_security_survivors", - "social_security_dependents", - "spm_unit_energy_subsidy", - 
"spm_unit_capped_housing_subsidy_reported", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_pre_subsidy_childcare_expenses", - "hourly_wage", - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - "taxable_private_pension_income", - "tax_exempt_private_pension_income", - "taxable_401k_distributions", - "tax_exempt_401k_distributions", - "taxable_403b_distributions", - "tax_exempt_403b_distributions", - "regular_ira_distributions", - "roth_ira_distributions", - "tax_exempt_ira_distributions", - "taxable_sep_distributions", - "tax_exempt_sep_distributions", - "other_type_retirement_account_distributions", - "keogh_distributions", - "veterans_benefits", - "workers_compensation", - "weeks_unemployed", -) - -PERSON_CACHE_REQUIRED_COLUMNS = ( - "state_fips", - "county_fips", - "cps_race", - "is_hispanic", - "is_disabled", - "has_esi", - "has_marketplace_health_coverage", - "alimony_income", - "child_support_received", - "child_support_expense", - "disability_benefits", - "health_insurance_premiums_without_medicare_part_b", - "other_medical_expenses", - "over_the_counter_health_expenses", - "medicare_part_b_premiums", - "social_security_disability", - "social_security_retirement", - "social_security_survivors", - "social_security_dependents", - "receives_wic", - "spm_unit_pre_subsidy_childcare_expenses", - "tax_exempt_private_pension_income", - "regular_ira_distributions", - "roth_ira_distributions", - "tax_exempt_ira_distributions", - "taxable_401k_distributions", - "taxable_403b_distributions", - "taxable_sep_distributions", - "other_type_retirement_account_distributions", - "keogh_distributions", - "veterans_benefits", - "workers_compensation", - "weeks_unemployed", -) - -PERSON_CPS_DISABILITY_COLUMNS = ( - "_disability_dressing", - "_disability_hearing", - "_disability_vision", - "_disability_errands", - "_disability_physical", - 
"_disability_cognitive", -) - -# eCPS difficulty_* eligibility leaves recoded from the ASEC PEDIS* fields -# (PEDIS{X} == 1 -> True, the same recode eCPS uses for is_blind from PEDISEYE). -# These are eCPS final-H5 contract columns, not pe-us variables, so they export -# via the legacy-contract entity map. Mirrors policyengine-us-data -# datasets/cps/cps.py (unmerged branch claude/document-census-tax-id-replacement) -# which maps each difficulty leaf to its PEDIS source field. -PERSON_CPS_DIFFICULTY_LEAVES = { - "_disability_dressing": "difficulty_dressing_or_bathing", - "_disability_hearing": "difficulty_hearing", - "_disability_vision": "difficulty_seeing", - "_disability_errands": "difficulty_doing_errands", - "_disability_physical": "difficulty_walking_or_climbing_stairs", - "_disability_cognitive": "difficulty_remembering_or_making_decisions", -} - -WORKERS_COMP_DISABILITY_CODE = 1 -ALIMONY_OTHER_INCOME_CODE = 20 -STRIKE_BENEFITS_OTHER_INCOME_CODE = 12 -SOCIAL_SECURITY_RETIREMENT_REASON_CODE = 1 -SOCIAL_SECURITY_DISABILITY_REASON_CODE = 2 -SOCIAL_SECURITY_SURVIVOR_REASON_CODES = (3, 5) -SOCIAL_SECURITY_DEPENDENT_REASON_CODES = (4, 6, 7) -MINIMUM_RETIREMENT_AGE = 62 - -# Retirement-contribution allocation fractions used to split the single -# bundled CPS RETCB_VAL total into the account-type-specific desired -# contribution leaves the eCPS contract requires. These mirror the eCPS -# split (PolicyEngine/policyengine-us-data -# policyengine_us_data/datasets/cps/cps.py:1500-1552) and trace exactly to -# policyengine_us_data/datasets/cps/imputation_parameters.yaml: -# se_pension_share_of_retirement_contributions: 0.046 (yaml line 30) -# dc_share_of_retirement_contributions: 0.908 (yaml line 38) -# roth_share_of_dc_contributions: 0.15 (yaml line 48) -# traditional_share_of_ira_contributions: 0.392 (yaml line 55) -# "Desired" means pre-statutory-limit; PolicyEngine-US applies the limits. 
-SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS = 0.046 -DC_SHARE_OF_RETIREMENT_CONTRIBUTIONS = 0.908 -ROTH_SHARE_OF_DC_CONTRIBUTIONS = 0.15 -TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS = 0.392 -TAXABLE_PENSION_FRACTION = 0.590 -TAXABLE_401K_DISTRIBUTION_FRACTION = 1.0 -TAXABLE_403B_DISTRIBUTION_FRACTION = 1.0 -TAXABLE_SEP_DISTRIBUTION_FRACTION = 1.0 -RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR = { - 2021: {"401k": 19_500, "401k_catch_up": 6_500, "ira": 6_000, "ira_catch_up": 1_000}, - 2022: {"401k": 20_500, "401k_catch_up": 6_500, "ira": 6_000, "ira_catch_up": 1_000}, - 2023: {"401k": 22_500, "401k_catch_up": 7_500, "ira": 6_500, "ira_catch_up": 1_000}, - 2024: {"401k": 23_000, "401k_catch_up": 7_500, "ira": 7_000, "ira_catch_up": 1_000}, - 2025: {"401k": 23_500, "401k_catch_up": 7_500, "ira": 7_000, "ira_catch_up": 1_000}, -} -RETIREMENT_CATCH_UP_AGE = 50 - -# Census CPS ASEC 2024 technical documentation, PERRP (relationship to -# household reference person). Codes 43/44/46/47 mark an unmarried partner of -# the household head. Mirrors policyengine-us-data cps.py:190-195, :1219. -# https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf -PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = (43, 44, 46, 47) - -# Employer-sponsored insurance recode/imputation codes and plan-type priors, -# mirrored verbatim from policyengine-us-data cps.py:204-274. -ESI_HAS_CURRENT_OWN_COVERAGE = 1 # NOW_OWNGRP: holds ESI in own name. -ESI_EMPLOYER_PAYS_ALL = 1 # NOW_HIPAID -ESI_EMPLOYER_PAYS_SOME = 2 # NOW_HIPAID -ESI_FAMILY_PLAN = 1 # NOW_GRPFTYP -ESI_SELF_ONLY_PLAN = 2 # NOW_GRPFTYP -# AHRQ MEPS-IC Table IV.A.1 (private sector, 2024) plan-type averages. eCPS -# hardcodes these same constants to seed CPS policyholder premium records; -# national calibration later aligns the aggregate to the BEA full-economy -# employer premium total. These are constants in eCPS, not external data. 
-ESI_PLAN_PRIORS_2024 = { - "family": { - "total_premium": 21_207.52589669509, - "employee_contribution": 6_490.205059544782, - }, - "self_only": { - "total_premium": 8_389.275834815255, - "employee_contribution": 1_909.5781466113417, - }, -} -PE_CPS_UNDOCUMENTED_TARGET = 13e6 -PE_CPS_UNDOCUMENTED_WORKERS_TARGET = 8.3e6 -PE_CPS_UNDOCUMENTED_STUDENTS_TARGET = 0.21 * 1.9e6 - - -def derive_treasury_tipped_occupation_code( - census_occupation_codes: pd.Series | np.ndarray, -) -> np.ndarray: - """Map CPS detailed occupation codes to Treasury tipped occupation codes.""" - values = pd.Series(census_occupation_codes, copy=False) - values = pd.to_numeric(values, errors="coerce").fillna(-1).astype(int) - return ( - values.map(CENSUS_OCCUPATION_CODE_TO_TTOC).fillna(0).astype(np.int16).to_numpy() - ) - - -def processed_cps_asec_cache_path(*, year: int, cache_dir: Path) -> Path: - """Return the versioned processed-cache path for one CPS ASEC year.""" - return cache_dir / ( - f"cps_asec_{year}_processed_v{CPS_ASEC_PROCESSED_CACHE_VERSION}.parquet" - ) - - -def legacy_processed_cps_asec_cache_path(*, year: int, cache_dir: Path) -> Path: - """Return the legacy unversioned processed-cache path for one CPS ASEC year.""" - return cache_dir / f"cps_asec_{year}_processed.parquet" - - -@dataclass -class CPSDataset: - """Container for CPS ASEC data.""" - - persons: pl.DataFrame - households: pl.DataFrame - year: int - source: str - - @property - def n_persons(self) -> int: - return len(self.persons) - - @property - def n_households(self) -> int: - return len(self.households) - - def summary(self) -> dict: - """Return summary statistics.""" - return { - "year": self.year, - "n_persons": self.n_persons, - "n_households": self.n_households, - "states": self.households["state_fips"].n_unique(), - "total_weight": float(self.persons["weight"].sum()), - } - - -def _descriptor_from_tables( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - name: str, -) -> SourceDescriptor: - 
household_variables = tuple( - column - for column in households.columns - if column not in HOUSEHOLD_OBSERVATION_EXCLUDED_COLUMNS - ) - person_variables = tuple( - column - for column in persons.columns - if column not in PERSON_OBSERVATION_EXCLUDED_COLUMNS - ) - return SourceDescriptor( - name=name, - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=SourceArchetype.HOUSEHOLD_INCOME, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=household_variables, - weight_column="household_weight" - if "household_weight" in households.columns - else None, - period_column="year" if "year" in households.columns else None, - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_variables, - weight_column="weight" if "weight" in persons.columns else None, - period_column="year" if "year" in persons.columns else None, - ), - ), - variable_capabilities=resolve_source_variable_capabilities( - name, - (*household_variables, *person_variables), - ), - ) - - -def _ensure_person_ids(persons: pd.DataFrame) -> pd.DataFrame: - result = persons.copy() - if "person_id" in result.columns: - return result - if "person_number" in result.columns and "household_id" in result.columns: - result["person_id"] = ( - result["household_id"].astype(str) - + ":" - + result["person_number"].astype(str) - ) - return result - if "household_id" in result.columns: - result["person_id"] = ( - result["household_id"].astype(str) - + ":" - + result.groupby("household_id").cumcount().add(1).astype(str) - ) - return result - result["person_id"] = np.arange(len(result)).astype(str) - return result - - -def _add_cps_income_aliases(persons: pd.DataFrame) -> pd.DataFrame: - """Derive canonical income from CPS components for PE-style donor matching.""" - if "income" in persons.columns: - return persons - component_groups = [ - tuple(column for column in 
group if column in persons.columns) - for group in CPS_INCOME_ALIAS_COMPONENT_GROUPS - ] - component_groups = [group for group in component_groups if group] - if not component_groups: - if "total_person_income" not in persons.columns: - return persons - result = persons.copy() - result["income"] = ( - pd.to_numeric(result["total_person_income"], errors="coerce") - .fillna(0.0) - .astype(float) - ) - return result - - result = persons.copy() - income = pd.Series(0.0, index=result.index, dtype=float) - for group in component_groups: - column = group[0] - income = income + ( - pd.to_numeric(result[column], errors="coerce").fillna(0.0).astype(float) - ) - result["income"] = income.astype(float) - return result - - -def _repair_relationship_to_head( - persons: pd.DataFrame, - relationship: pd.Series, -) -> pd.Series: - """Repair household relationship patterns so each household has one clear head.""" - normalized = relationship.astype(int).copy() - if "household_id" not in persons.columns: - return normalized - - ages = pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0.0) - grouped = persons.groupby("household_id", sort=False).groups - for member_index in grouped.values(): - member_index = list(member_index) - household_relationship = normalized.loc[member_index].copy() - household_ages = ages.loc[member_index] - - head_index = household_relationship[household_relationship.eq(0)].index.tolist() - if not head_index: - spouse_candidates = [ - index - for index in household_relationship[ - household_relationship.eq(1) - ].index.tolist() - if household_ages.loc[index] >= 18 - ] - adult_candidates = [ - index - for index in household_relationship.index.tolist() - if household_ages.loc[index] >= 18 - ] - candidate_pool = ( - spouse_candidates - or adult_candidates - or household_relationship.index.tolist() - ) - head_choice = max( - candidate_pool, key=lambda index: household_ages.loc[index] - ) - normalized.loc[head_choice] = 0 - head_index = [head_choice] - 
elif len(head_index) > 1: - keep_head = max(head_index, key=lambda index: household_ages.loc[index]) - for index in head_index: - if index == keep_head: - continue - normalized.loc[index] = 3 if household_ages.loc[index] >= 19 else 2 - - spouse_index = normalized.loc[member_index][ - normalized.loc[member_index].eq(1) - ].index.tolist() - if len(spouse_index) > 1: - keep_spouse = max(spouse_index, key=lambda index: household_ages.loc[index]) - for index in spouse_index: - if index == keep_spouse: - continue - normalized.loc[index] = 3 if household_ages.loc[index] >= 19 else 2 - - return normalized - - -def _normalize_relationship_to_head(persons: pd.DataFrame) -> pd.Series: - """Normalize available CPS relationship coding to head/spouse/dependent/other.""" - family_normalized: pd.Series | None = None - if "family_relationship" in persons.columns: - family_relationship = ( - pd.to_numeric(persons["family_relationship"], errors="coerce") - .fillna(-1) - .astype(int) - ) - unique_values = set(family_relationship.unique().tolist()) - if unique_values.issubset({0, 1, 2, 3, 4}): - family_normalized = pd.Series(3, index=persons.index, dtype=int) - household_groups = ( - persons.groupby("household_id", sort=False).groups.values() - if "household_id" in persons.columns - else [persons.index] - ) - for member_index in household_groups: - member_index = list(member_index) - household_codes = set(family_relationship.loc[member_index].tolist()) - if 0 in household_codes: - mapped = family_relationship.loc[member_index].map( - {0: 0, 1: 1, 2: 2, 3: 3, 4: 3} - ) - else: - mapped = family_relationship.loc[member_index].map( - {1: 0, 2: 1, 3: 2, 4: 3} - ) - family_normalized.loc[member_index] = mapped.fillna(3).astype(int) - - if "relationship_to_head" not in persons.columns: - if family_normalized is not None: - return _repair_relationship_to_head(persons, family_normalized) - order = persons.groupby("household_id").cumcount() - normalized = pd.Series(3, index=persons.index, 
dtype=int) - normalized.loc[order == 0] = 0 - normalized.loc[ - (order == 1) - & (pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0) >= 18) - ] = 1 - normalized.loc[ - pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0) < 18 - ] = 2 - return _repair_relationship_to_head(persons, normalized) - - relationship = ( - pd.to_numeric(persons["relationship_to_head"], errors="coerce") - .fillna(-1) - .astype(int) - ) - unique_values = set(relationship.unique().tolist()) - if unique_values.issubset({0, 1, 2, 3}): - if family_normalized is not None: - relationship_detail = set(relationship.unique().tolist()) & {1, 2} - family_detail = set(family_normalized.unique().tolist()) & {1, 2} - if len(family_detail) > len(relationship_detail): - return _repair_relationship_to_head(persons, family_normalized) - return _repair_relationship_to_head(persons, relationship) - - if unique_values.issubset({1, 2, 3, 4}): - normalized = relationship.map({1: 0, 2: 1, 3: 3, 4: 2}).fillna(3).astype(int) - return _repair_relationship_to_head(persons, normalized) - - order = persons.groupby("household_id").cumcount() - normalized = pd.Series(3, index=persons.index, dtype=int) - normalized.loc[order == 0] = 0 - normalized.loc[ - (order == 1) - & (pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0) >= 18) - ] = 1 - normalized.loc[ - pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0) < 18 - ] = 2 - return _repair_relationship_to_head(persons, normalized) - - -def _add_cps_tax_unit_structure_columns(persons: pd.DataFrame) -> pd.DataFrame: - """Derive PE-style tax-unit role columns from CPS tax-unit identifiers and pointers.""" - if "tax_unit_id" not in persons.columns: - return persons - - result = persons.copy() - relationship = _normalize_relationship_to_head(result) - result["tax_unit_is_joint"] = 0.0 - result["tax_unit_count_dependents"] = 0.0 - result["is_tax_unit_head"] = 0.0 - result["is_tax_unit_spouse"] = 0.0 - result["is_tax_unit_dependent"] = 
0.0 - - ages = pd.to_numeric(result.get("age", 0), errors="coerce").fillna(0.0) - spouse_person_number = ( - pd.to_numeric(result.get("spouse_person_number", 0), errors="coerce") - .fillna(0) - .astype(int) - ) - person_number = ( - pd.to_numeric(result.get("person_number", 0), errors="coerce") - .fillna(0) - .astype(int) - ) - - valid_tax_unit_ids = result["tax_unit_id"].notna() & result["tax_unit_id"].astype( - str - ).str.strip().ne("") - grouped = result.loc[valid_tax_unit_ids].groupby( - ["household_id", "tax_unit_id"], sort=False - ) - for _, unit_persons in grouped: - member_index = unit_persons.index - unit_relationship = relationship.loc[member_index] - dependent_index = unit_relationship[unit_relationship.eq(2)].index.tolist() - - spouse_index: list[int] = [] - by_number = { - int(number): idx - for idx, number in person_number.loc[member_index].items() - if int(number) > 0 - } - for idx in member_index: - spouse_number = int(spouse_person_number.loc[idx]) - current_number = int(person_number.loc[idx]) - if spouse_number <= 0 or current_number <= 0: - continue - spouse_idx = by_number.get(spouse_number) - if spouse_idx is None: - continue - if int(spouse_person_number.loc[spouse_idx]) != current_number: - continue - spouse_index.extend([int(idx), int(spouse_idx)]) - if not spouse_index: - spouse_index = ( - unit_relationship[unit_relationship.eq(1)].index.astype(int).tolist() - ) - spouse_index = [ - idx for idx in dict.fromkeys(spouse_index) if idx not in dependent_index - ] - - head_index: int | None = None - head_candidates = [ - int(idx) - for idx in unit_relationship[unit_relationship.eq(0)].index.tolist() - if int(idx) not in spouse_index - ] - if head_candidates: - head_index = head_candidates[0] - else: - nondependent_candidates = [ - int(idx) - for idx in member_index.tolist() - if int(idx) not in spouse_index and int(idx) not in dependent_index - ] - if nondependent_candidates: - head_index = max( - nondependent_candidates, - key=lambda idx: 
(float(ages.loc[idx]), -int(idx)), - ) - elif spouse_index: - head_index = spouse_index[0] - spouse_index = [idx for idx in spouse_index if idx != head_index] - else: - head_index = int(member_index[0]) - - spouse_index = [idx for idx in spouse_index if idx != head_index] - if len(spouse_index) > 1: - spouse_index = [ - max( - spouse_index, - key=lambda idx: (float(ages.loc[idx]), -int(idx)), - ) - ] - - result.loc[member_index, "tax_unit_is_joint"] = float(bool(spouse_index)) - result.loc[member_index, "tax_unit_count_dependents"] = float( - len(dependent_index) - ) - result.loc[dependent_index, "is_tax_unit_dependent"] = 1.0 - if head_index is not None: - result.loc[head_index, "is_tax_unit_head"] = 1.0 - result.loc[spouse_index, "is_tax_unit_spouse"] = 1.0 - - return result - - -def _build_observation_frame( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - source_name: str, -) -> ObservationFrame: - normalized_households = households.copy() - normalized_persons = _add_cps_tax_unit_structure_columns( - _add_cps_income_aliases(_ensure_person_ids(persons)) - ) - descriptor = _descriptor_from_tables( - households=normalized_households, - persons=normalized_persons, - name=source_name, - ) - frame = ObservationFrame( - source=descriptor, - tables={ - EntityType.HOUSEHOLD: normalized_households, - EntityType.PERSON: normalized_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - -def _sample_households_and_persons( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Sample households and keep all linked person records.""" - household_sort_columns = [ - column for 
column in ("household_id", "year") if column in households.columns - ] - person_sort_columns = [ - column - for column in ("household_id", "person_id", "person_number", "year") - if column in persons.columns - ] - if household_sort_columns: - households = households.sort_values( - household_sort_columns, - kind="mergesort", - ).reset_index(drop=True) - else: - households = households.reset_index(drop=True) - if person_sort_columns: - persons = persons.sort_values( - person_sort_columns, - kind="mergesort", - ).reset_index(drop=True) - else: - persons = persons.reset_index(drop=True) - if sample_n is None or sample_n >= len(households): - return households, persons - sampled_households = _sample_cps_households( - households=households, - persons=persons, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - ) - sampled_keys = set(sampled_households["household_id"]) - sampled_persons = persons[persons["household_id"].isin(sampled_keys)].copy() - if household_sort_columns: - sampled_households = sampled_households.sort_values( - household_sort_columns, - kind="mergesort", - ) - if person_sort_columns: - sampled_persons = sampled_persons.sort_values( - person_sort_columns, - kind="mergesort", - ) - return sampled_households.reset_index(drop=True), sampled_persons.reset_index( - drop=True - ) - - -def _sample_cps_households( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, -) -> pd.DataFrame: - """Sample CPS households with optional state or state-age coverage floors.""" - - resolved_state_age_floor = int(state_age_floor or 0) - if ( - resolved_state_age_floor <= 0 - or "state_fips" not in households.columns - or "age" not in persons.columns - or "household_id" not in households.columns - or "household_id" not in persons.columns - ): - return sample_frame_with_state_floor( - households, - 
sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - ) - - coverage = persons[["household_id", "age"]].merge( - households[["household_id", "state_fips"]], - on="household_id", - how="inner", - ) - coverage["age_band"] = coverage["age"].map(_cps_age_band_key) - coverage["state_fips"] = pd.to_numeric( - coverage["state_fips"], errors="coerce" - ).astype("Int64") - coverage = coverage.dropna(subset=["state_fips", "age_band"]).copy() - if coverage.empty: - return sample_frame_with_state_floor( - households, - sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - ) - - rng = np.random.default_rng(random_seed) - selected_ids: set[int] = set() - for _, group in coverage.groupby(["state_fips", "age_band"], sort=True): - group_household_ids = pd.Index(group["household_id"].unique()) - already_selected = [hid for hid in group_household_ids if hid in selected_ids] - missing = resolved_state_age_floor - len(already_selected) - if missing <= 0: - continue - available_ids = [hid for hid in group_household_ids if hid not in selected_ids] - if not available_ids: - continue - candidate_households = households[ - households["household_id"].isin(available_ids) - ].copy() - sampled = sample_frame_without_replacement( - candidate_households, - sample_n=min(missing, len(candidate_households)), - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col="household_weight", - ) - selected_ids.update(int(hid) for hid in sampled["household_id"].tolist()) - - if sample_n is not None and len(selected_ids) > sample_n: - raise ValueError( - "state_age_floor requires more sampled households than sample_n allows: " - f"selected={len(selected_ids)}, sample_n={sample_n}" - ) - - if not selected_ids: - return sample_frame_with_state_floor( - households, - sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - ) - - selected = 
households[households["household_id"].isin(selected_ids)].copy() - remaining_n = int(sample_n) - len(selected) - if remaining_n <= 0: - return selected - - remainder = households[~households["household_id"].isin(selected_ids)].copy() - remainder_sample = sample_frame_without_replacement( - remainder, - sample_n=remaining_n, - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col="household_weight", - ) - return pd.concat([selected, remainder_sample], axis=0, ignore_index=False) - - -def _cps_age_band_key(age: float | int | None) -> str | None: - value = pd.to_numeric(pd.Series([age]), errors="coerce").iloc[0] - if pd.isna(value): - return None - age_int = int(value) - if age_int < 0: - return None - if age_int >= 85: - return "85_plus" - lower = (age_int // 5) * 5 - upper = lower + 5 - return f"{lower}_{upper}" - - -@dataclass -class CPSASECSourceProvider: - """Source-provider wrapper around the CPS ASEC Census loader.""" - - year: int = 2023 - cache_dir: Path | None = None - download: bool = True - loader: Callable[..., CPSDataset] | None = None - _descriptor_cache: SourceDescriptor | None = None - - @property - def descriptor(self) -> SourceDescriptor: - if self._descriptor_cache is not None: - return self._descriptor_cache - return SourceDescriptor( - name="cps_asec", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=SourceArchetype.HOUSEHOLD_INCOME, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=("age",), - weight_column="weight", - ), - ), - ) - - def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame: - query = query or SourceQuery() - provider_filters = query.provider_filters - loader = self.loader or load_cps_asec - dataset = loader( - 
year=int(provider_filters.get("year", self.year)), - cache_dir=provider_filters.get("cache_dir", self.cache_dir), - download=bool(provider_filters.get("download", self.download)), - ) - households = dataset.households.to_pandas() - persons = dataset.persons.to_pandas() - households, persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=provider_filters.get("sample_n"), - random_seed=int(provider_filters.get("random_seed", 0)), - state_floor=provider_filters.get("state_floor"), - state_age_floor=provider_filters.get("state_age_floor"), - ) - frame = _build_observation_frame( - households=households, - persons=persons, - source_name=f"cps_asec_{dataset.year}", - ) - self._descriptor_cache = frame.source - return apply_source_query(frame, query) - - -@dataclass -class CPSASECParquetSourceProvider: - """Source-provider wrapper around split CPS household/person parquet files.""" - - data_dir: str | Path - year: int | None = None - households_filename: str = "cps_asec_households.parquet" - persons_filename: str = "cps_asec_persons.parquet" - _descriptor_cache: SourceDescriptor | None = None - - @property - def descriptor(self) -> SourceDescriptor: - if self._descriptor_cache is not None: - return self._descriptor_cache - return SourceDescriptor( - name="cps_asec_parquet", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=SourceArchetype.HOUSEHOLD_INCOME, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=("age",), - ), - ), - ) - - def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame: - data_dir = Path(self.data_dir) - households_path = data_dir / self.households_filename - persons_path = data_dir / self.persons_filename - if not households_path.exists() or not 
persons_path.exists(): - raise FileNotFoundError( - f"CPS ASEC data files not found in {data_dir}.\n" - "Expected household/person parquet files in the source directory." - ) - - households = pd.read_parquet(households_path) - persons = pd.read_parquet(persons_path) - query = query or SourceQuery() - provider_filters = query.provider_filters - if self.year is not None: - households = households.copy() - persons = persons.copy() - if "year" not in households.columns: - households["year"] = self.year - if "year" not in persons.columns: - persons["year"] = self.year - households, persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=provider_filters.get("sample_n"), - random_seed=int(provider_filters.get("random_seed", 0)), - state_floor=provider_filters.get("state_floor"), - state_age_floor=provider_filters.get("state_age_floor"), - ) - frame = _build_observation_frame( - households=households, - persons=persons, - source_name="cps_asec_parquet", - ) - self._descriptor_cache = frame.source - return apply_source_query(frame, query) - - -def download_cps_asec( - year: int, - cache_dir: Path | None = None, - force: bool = False, -) -> Path: - """ - Download CPS ASEC data for a given year. - - Args: - year: Year of CPS ASEC (e.g., 2023) - cache_dir: Directory to cache downloads - force: Re-download even if cached - - Returns: - Path to downloaded/cached zip file - """ - - import httpx - - if cache_dir is None: - cache_dir = DEFAULT_CACHE_DIR - - cache_dir.mkdir(parents=True, exist_ok=True) - - if year not in CPS_URLS: - available = ", ".join(str(y) for y in sorted(CPS_URLS.keys())) - raise ValueError(f"CPS ASEC for {year} not available. 
Available: {available}") - - url = CPS_URLS[year] - filename = f"cps_asec_{year}.zip" - cache_path = cache_dir / filename - - if cache_path.exists() and not force: - print(f"Using cached CPS ASEC {year} from {cache_path}") - return cache_path - - print(f"Downloading CPS ASEC {year} from {url}...") - - with httpx.Client(follow_redirects=True, timeout=300) as client: - response = client.get(url) - response.raise_for_status() - - with open(cache_path, "wb") as f: - f.write(response.content) - - print(f"Downloaded {len(response.content) / 1_000_000:.1f} MB to {cache_path}") - return cache_path - - -def _read_cps_asec_raw_files( - zip_path: Path, -) -> tuple[pl.DataFrame, pl.DataFrame | None]: - # Schema overrides for columns with large IDs that overflow int64. - schema_overrides = { - "PERIDNUM": pl.Utf8, - "H_IDNUM": pl.Utf8, - "OCCURNUM": pl.Utf8, - "QSTNUM": pl.Utf8, - } - - with zipfile.ZipFile(zip_path, "r") as zf: - person_file = None - household_file = None - - for name in zf.namelist(): - lower = name.lower() - if "pppub" in lower and lower.endswith(".csv"): - person_file = name - elif "hhpub" in lower and lower.endswith(".csv"): - household_file = name - - if person_file is None: - raise ValueError(f"Could not find person file in {zip_path}") - - with zf.open(person_file) as f: - persons_raw = pl.read_csv( - f, - infer_schema_length=10000, - schema_overrides=schema_overrides, - ) - - if household_file is None: - households_raw = None - else: - with zf.open(household_file) as f: - households_raw = pl.read_csv( - f, - infer_schema_length=10000, - schema_overrides=schema_overrides, - ) - - return persons_raw, households_raw - - -def _attach_previous_year_income( - *, - persons: pl.DataFrame, - current_persons_raw: pl.DataFrame, -) -> pl.DataFrame: - # The EITC/CTC prior-year-earnings election (the COVID-era "lookback") - # expired after 2021, so employment_income_last_year / - # self_employment_income_last_year / previous_year_income_available feed no - # live 
PolicyEngine-US formula. Rather than load and panel-join the prior - # ASEC (an extra survey-year dependency that only covered the ~50% rotation - # overlap), fall back to current-year earnings as a placeholder. These - # columns can be dropped entirely once the export contract no longer - # requires them. - required = {"WSAL_VAL", "SEMP_VAL"} - if not required.issubset(set(current_persons_raw.columns)) or len(persons) != len( - current_persons_raw - ): - return persons.with_columns( - [ - pl.lit(-1.0).alias("employment_income_last_year"), - pl.lit(-1.0).alias("self_employment_income_last_year"), - pl.lit(False).alias("previous_year_income_available"), - ] - ) - - current = current_persons_raw.select(["WSAL_VAL", "SEMP_VAL"]).to_pandas() - employment = ( - pd.to_numeric(current["WSAL_VAL"], errors="coerce").fillna(0.0).to_numpy(float) - ) - self_employment = ( - pd.to_numeric(current["SEMP_VAL"], errors="coerce").fillna(0.0).to_numpy(float) - ) - return persons.with_columns( - [ - pl.Series("employment_income_last_year", employment), - pl.Series("self_employment_income_last_year", self_employment), - pl.Series( - "previous_year_income_available", - (employment != 0.0) | (self_employment != 0.0), - ), - ] - ) - - -def load_cps_asec( - year: int = 2023, - cache_dir: Path | None = None, - download: bool = True, -) -> CPSDataset: - """ - Load CPS ASEC data for a given year. - - Args: - year: Year of CPS ASEC (e.g., 2023) - cache_dir: Directory for cached data - download: Whether to download if not cached - - Returns: - CPSDataset with persons and households DataFrames - """ - if cache_dir is None: - cache_dir = DEFAULT_CACHE_DIR - - # Prefer a versioned processed cache so derivation-logic changes do not - # silently reuse stale pre-sim columns. 
- processed_path = processed_cps_asec_cache_path(year=year, cache_dir=cache_dir) - legacy_processed_path = legacy_processed_cps_asec_cache_path( - year=year, - cache_dir=cache_dir, - ) - if processed_path.exists(): - print(f"Loading processed CPS ASEC {year} from {processed_path}") - persons = pl.read_parquet(processed_path) - if _processed_persons_have_household_geography(persons): - households = _derive_households(persons) - return CPSDataset( - persons=persons, - households=households, - year=year, - source=str(processed_path), - ) - print( - f"Cached processed CPS ASEC {year} is missing state_fips; rebuilding from raw source" - ) - elif legacy_processed_path.exists(): - print( - "Ignoring legacy CPS ASEC processed cache " - f"{legacy_processed_path} because cache version " - f"{CPS_ASEC_PROCESSED_CACHE_VERSION} is required; rebuilding from raw source" - ) - - # Download if needed - zip_path = cache_dir / f"cps_asec_{year}.zip" - if not zip_path.exists(): - if not download: - raise FileNotFoundError( - f"CPS ASEC {year} not found at {zip_path}. " - "Set download=True to fetch from Census." 
- ) - zip_path = download_cps_asec(year, cache_dir) - - # Extract and parse - print(f"Parsing CPS ASEC {year}...") - - persons_raw, households_raw = _read_cps_asec_raw_files(zip_path) - - # Process person data - persons = _process_persons(persons_raw, year) - persons = _attach_previous_year_income( - persons=persons, - current_persons_raw=persons_raw, - ) - - # Process or derive household data - if households_raw is not None: - households = _process_households(households_raw, year) - else: - households = _derive_households(persons) - - persons = _attach_cps_ssn_card_type( - persons=persons, - households=households, - persons_raw=persons_raw, - ) - persons = _attach_household_geography_to_persons( - persons=persons, - households=households, - ) - - # Cache processed data - persons.write_parquet(processed_path) - print(f"Cached processed data to {processed_path}") - - return CPSDataset( - persons=persons, - households=households, - year=year, - source=str(zip_path), - ) - - -def _process_persons(df: pl.DataFrame, year: int) -> pl.DataFrame: - """Process raw person file into clean format.""" - selected = [ - pl.col(census_name).alias(our_name) - for census_name, our_name in PERSON_VARIABLES.items() - if census_name in df.columns - ] - if not selected: - raise ValueError("No recognized variables found in person file") - result = df.select(selected) - result = randomize_cps_topcoded_age_80_84(result) - - # Scale weights: CPS ASEC weights have 2 implied decimal places - # See CPS documentation: A_FNLWGT is expressed in units of 1/100 - # Divide by 100 to get actual population representation - if "weight" in result.columns: - result = result.with_columns((pl.col("weight") / 100).alias("weight")) - if "march_supplement_weight" in result.columns: - result = result.with_columns( - (pl.col("march_supplement_weight") / 100).alias("march_supplement_weight") - ) - - # Add derived columns - if "age" in result.columns: - result = result.with_columns( - [ - (pl.col("age") >= 
18).alias("is_adult"), - (pl.col("age") < 18).alias("is_child"), - (pl.col("age") >= 65).alias("is_senior"), - ] - ) - - if "race" in result.columns and "cps_race" not in result.columns: - result = result.with_columns(pl.col("race").alias("cps_race")) - if "_cps_hispanic_code" in result.columns and "is_hispanic" not in result.columns: - result = result.with_columns( - (pl.col("_cps_hispanic_code") != 0).alias("is_hispanic") - ).drop("_cps_hispanic_code") - - health_staging_columns = [ - f"_{leaf}" for leaf in CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP - ] - available_health_staging = [ - column for column in health_staging_columns if column in result.columns - ] - if available_health_staging: - result = result.with_columns( - [ - (pl.col(f"_{leaf}") == 1).alias(leaf) - for leaf in CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP - if f"_{leaf}" in result.columns and leaf not in result.columns - ] - ) - result = result.with_columns( - [ - pl.col(reported_leaf).alias(leaf) - for leaf, reported_leaf in CURRENT_HEALTH_COVERAGE_RULE_INPUT_ALIAS_MAP.items() - if reported_leaf in result.columns and leaf not in result.columns - ] - ) - if ( - "_reported_has_private_health_coverage_at_interview" in result.columns - and "reported_has_private_health_coverage_at_interview" - not in result.columns - ): - result = result.with_columns( - ( - pl.col("_reported_has_private_health_coverage_at_interview") == 1 - ).alias("reported_has_private_health_coverage_at_interview") - ) - if ( - "_reported_has_public_health_coverage_at_interview" in result.columns - and "reported_has_public_health_coverage_at_interview" not in result.columns - ): - result = result.with_columns( - ( - pl.col("_reported_has_public_health_coverage_at_interview") == 1 - ).alias("reported_has_public_health_coverage_at_interview") - ) - if "_reported_current_health_coverage_code" in result.columns: - if "reported_is_insured_at_interview" not in result.columns: - result = result.with_columns( - 
(pl.col("_reported_current_health_coverage_code") == 1).alias( - "reported_is_insured_at_interview" - ) - ) - if "reported_is_uninsured_at_interview" not in result.columns: - result = result.with_columns( - (pl.col("_reported_current_health_coverage_code") != 1).alias( - "reported_is_uninsured_at_interview" - ) - ) - coverage_family_columns = [ - "reported_has_employer_sponsored_health_coverage_at_interview", - "reported_has_marketplace_health_coverage_at_interview", - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview", - "reported_has_medicare_health_coverage_at_interview", - "reported_has_means_tested_health_coverage_at_interview", - "reported_has_tricare_health_coverage_at_interview", - "reported_has_champva_health_coverage_at_interview", - "reported_has_va_health_coverage_at_interview", - "reported_has_indian_health_service_coverage_at_interview", - ] - available_coverage_family_columns = [ - column for column in coverage_family_columns if column in result.columns - ] - if ( - available_coverage_family_columns - and "reported_has_multiple_health_coverage_at_interview" - not in result.columns - ): - result = result.with_columns( - ( - pl.sum_horizontal( - *[ - pl.col(column).cast(pl.Int8) - for column in available_coverage_family_columns - ] - ) - > 1 - ).alias("reported_has_multiple_health_coverage_at_interview") - ) - if ( - "reported_has_marketplace_health_coverage_at_interview" in result.columns - and "has_marketplace_health_coverage" not in result.columns - ): - result = result.with_columns( - pl.col("reported_has_marketplace_health_coverage_at_interview").alias( - "has_marketplace_health_coverage" - ) - ) - if ( - "reported_has_employer_sponsored_health_coverage_at_interview" - in result.columns - and "has_esi" not in result.columns - ): - result = result.with_columns( - pl.col( - "reported_has_employer_sponsored_health_coverage_at_interview" - ).alias("has_esi") - ) - result = result.drop( - [ - column - for column in ( - 
*available_health_staging, - "_reported_has_private_health_coverage_at_interview", - "_reported_has_public_health_coverage_at_interview", - "_reported_current_health_coverage_code", - ) - if column in result.columns - ] - ) - - if ( - "_high_school_or_college_status" in result.columns - and "is_full_time_college_student" not in result.columns - ): - result = result.with_columns( - (pl.col("_high_school_or_college_status") == 2).alias( - "is_full_time_college_student" - ) - ).drop("_high_school_or_college_status") - elif "_high_school_or_college_status" in result.columns: - result = result.drop("_high_school_or_college_status") - - if "_is_paid_hourly_code" in result.columns: - if "is_paid_hourly" not in result.columns: - result = result.with_columns( - (pl.col("_is_paid_hourly_code") == 1).alias("is_paid_hourly") - ) - if ( - "_hourly_pay_cents" in result.columns - and "hourly_wage" not in result.columns - ): - result = result.with_columns( - pl.when( - (pl.col("_is_paid_hourly_code") == 1) - & (pl.col("_hourly_pay_cents") > 0) - ) - .then(pl.col("_hourly_pay_cents") / 100) - .otherwise(0.0) - .alias("hourly_wage") - ) - result = result.drop( - [ - column - for column in ("_is_paid_hourly_code", "_hourly_pay_cents") - if column in result.columns - ] - ) - elif "_hourly_pay_cents" in result.columns: - result = result.drop("_hourly_pay_cents") - - if ( - "_union_member_code" in result.columns - and "is_union_member_or_covered" not in result.columns - ): - result = result.with_columns( - (pl.col("_union_member_code") == 1).alias("is_union_member_or_covered") - ).drop("_union_member_code") - elif "_union_member_code" in result.columns: - result = result.drop("_union_member_code") - - if "detailed_occupation_recode" in result.columns: - occupation = pl.col("detailed_occupation_recode") - occupation_exprs: list[pl.Expr] = [] - if "has_never_worked" not in result.columns: - occupation_exprs.append((occupation == 53).alias("has_never_worked")) - if "is_military" not in 
result.columns: - occupation_exprs.append((occupation == 52).alias("is_military")) - if "is_computer_scientist" not in result.columns: - occupation_exprs.append((occupation == 8).alias("is_computer_scientist")) - if "is_farmer_fisher" not in result.columns: - occupation_exprs.append((occupation == 41).alias("is_farmer_fisher")) - if "is_executive_administrative_professional" not in result.columns: - occupation_exprs.append( - occupation.is_in( - [ - 1, - 2, - 3, - 5, - 6, - 7, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 18, - 19, - 25, - 26, - 27, - 28, - 29, - 34, - 36, - 38, - 39, - 40, - 42, - 50, - ] - ).alias("is_executive_administrative_professional") - ) - if occupation_exprs: - result = result.with_columns(occupation_exprs) - - if "_detailed_census_occupation_code" in result.columns: - if "treasury_tipped_occupation_code" not in result.columns: - tipped_codes = derive_treasury_tipped_occupation_code( - result["_detailed_census_occupation_code"].to_pandas() - ) - result = result.with_columns( - pl.Series("treasury_tipped_occupation_code", tipped_codes) - ) - if ( - "treasury_tipped_occupation_code" in result.columns - and "is_tipped_occupation" not in result.columns - ): - result = result.with_columns( - (pl.col("treasury_tipped_occupation_code") > 0).alias( - "is_tipped_occupation" - ) - ) - result = result.drop("_detailed_census_occupation_code") - - if { - "_other_income_code", - "_other_income_value", - }.issubset(set(result.columns)) and "alimony_income" not in result.columns: - other_income_exprs = [ - pl.when(pl.col("_other_income_code") == ALIMONY_OTHER_INCOME_CODE) - .then(pl.col("_other_income_value")) - .otherwise(0) - .alias("alimony_income") - ] - if "strike_benefits" not in result.columns: - other_income_exprs.append( - pl.when( - pl.col("_other_income_code") == STRIKE_BENEFITS_OTHER_INCOME_CODE - ) - .then(pl.col("_other_income_value")) - .otherwise(0) - .alias("strike_benefits") - ) - result = 
result.with_columns(other_income_exprs).drop( - ["_other_income_code", "_other_income_value"] - ) - else: - drop_columns = [ - column - for column in ("_other_income_code", "_other_income_value") - if column in result.columns - ] - if drop_columns: - result = result.drop(drop_columns) - if { - "_social_security_reason_1", - "_social_security_reason_2", - "social_security", - "age", - }.issubset(set(result.columns)) and ( - "social_security_disability" not in result.columns - or "social_security_retirement" not in result.columns - or "social_security_survivors" not in result.columns - or "social_security_dependents" not in result.columns - ): - reason_1 = pl.col("_social_security_reason_1") - reason_2 = pl.col("_social_security_reason_2") - has_retirement_reason = (reason_1 == SOCIAL_SECURITY_RETIREMENT_REASON_CODE) | ( - reason_2 == SOCIAL_SECURITY_RETIREMENT_REASON_CODE - ) - has_disability_reason = (reason_1 == SOCIAL_SECURITY_DISABILITY_REASON_CODE) | ( - reason_2 == SOCIAL_SECURITY_DISABILITY_REASON_CODE - ) - has_survivor_reason = reason_1.is_in( - SOCIAL_SECURITY_SURVIVOR_REASON_CODES - ) | reason_2.is_in(SOCIAL_SECURITY_SURVIVOR_REASON_CODES) - has_dependent_reason = reason_1.is_in( - SOCIAL_SECURITY_DEPENDENT_REASON_CODES - ) | reason_2.is_in(SOCIAL_SECURITY_DEPENDENT_REASON_CODES) - unclassified_social_security = ( - (pl.col("social_security") > 0) - & ~has_retirement_reason - & ~has_disability_reason - & ~has_survivor_reason - & ~has_dependent_reason - ) - derived_columns: list[pl.Expr] = [] - if "social_security_disability" not in result.columns: - derived_columns.append( - ( - pl.when(has_disability_reason & ~has_retirement_reason) - .then(pl.col("social_security")) - .otherwise(0.0) - + pl.when( - unclassified_social_security - & (pl.col("age") < MINIMUM_RETIREMENT_AGE) - ) - .then(pl.col("social_security")) - .otherwise(0.0) - ).alias("social_security_disability") - ) - if "social_security_retirement" not in result.columns: - derived_columns.append( - 
( - pl.when(has_retirement_reason & ~has_disability_reason) - .then(pl.col("social_security")) - .otherwise(0.0) - + pl.when( - unclassified_social_security - & (pl.col("age") >= MINIMUM_RETIREMENT_AGE) - ) - .then(pl.col("social_security")) - .otherwise(0.0) - ).alias("social_security_retirement") - ) - if "social_security_survivors" not in result.columns: - derived_columns.append( - ( - pl.when( - has_survivor_reason - & ~has_retirement_reason - & ~has_disability_reason - ) - .then(pl.col("social_security")) - .otherwise(0.0) - ).alias("social_security_survivors") - ) - if "social_security_dependents" not in result.columns: - derived_columns.append( - ( - pl.when( - has_dependent_reason - & ~has_retirement_reason - & ~has_disability_reason - & ~has_survivor_reason - ) - .then(pl.col("social_security")) - .otherwise(0.0) - ).alias("social_security_dependents") - ) - result = result.with_columns(derived_columns).drop( - ["_social_security_reason_1", "_social_security_reason_2"] - ) - else: - drop_columns = [ - column - for column in ( - "_social_security_reason_1", - "_social_security_reason_2", - ) - if column in result.columns - ] - if drop_columns: - result = result.drop(drop_columns) - - private_pension_staging = [ - column - for column in ("_pension_income", "_annuity_income") - if column in result.columns - ] - if private_pension_staging: - private_pension_total = pl.sum_horizontal( - *[pl.col(column) for column in private_pension_staging] - ) - pension_exprs: list[pl.Expr] = [] - if "pension_income" not in result.columns: - pension_exprs.append(private_pension_total.alias("pension_income")) - if "taxable_private_pension_income" not in result.columns: - pension_exprs.append( - (private_pension_total * TAXABLE_PENSION_FRACTION).alias( - "taxable_private_pension_income" - ) - ) - if "tax_exempt_private_pension_income" not in result.columns: - pension_exprs.append( - (private_pension_total * (1 - TAXABLE_PENSION_FRACTION)).alias( - 
"tax_exempt_private_pension_income" - ) - ) - if "taxable_pension_income" not in result.columns: - pension_exprs.append( - (private_pension_total * TAXABLE_PENSION_FRACTION).alias( - "taxable_pension_income" - ) - ) - result = result.with_columns(pension_exprs).drop(private_pension_staging) - - retirement_distribution_pairs = [ - ("_retirement_distribution_code_1", "_retirement_distribution_value_1"), - ("_retirement_distribution_code_2", "_retirement_distribution_value_2"), - ( - "_retirement_distribution_code_1_yng", - "_retirement_distribution_value_1_yng", - ), - ( - "_retirement_distribution_code_2_yng", - "_retirement_distribution_value_2_yng", - ), - ] - available_retirement_distribution_pairs = [ - (code_column, value_column) - for code_column, value_column in retirement_distribution_pairs - if code_column in result.columns and value_column in result.columns - ] - if available_retirement_distribution_pairs: - distribution_by_code = {} - for code in range(1, 8): - distribution_by_code[code] = pl.sum_horizontal( - *[ - pl.when(pl.col(code_column) == code) - .then(pl.col(value_column)) - .otherwise(0.0) - for code_column, value_column in available_retirement_distribution_pairs - ] - ) - retirement_distribution_exprs: list[pl.Expr] = [] - if "taxable_401k_distributions" not in result.columns: - retirement_distribution_exprs.append( - (distribution_by_code[1] * TAXABLE_401K_DISTRIBUTION_FRACTION).alias( - "taxable_401k_distributions" - ) - ) - if "tax_exempt_401k_distributions" not in result.columns: - retirement_distribution_exprs.append( - ( - distribution_by_code[1] * (1 - TAXABLE_401K_DISTRIBUTION_FRACTION) - ).alias("tax_exempt_401k_distributions") - ) - if "taxable_403b_distributions" not in result.columns: - retirement_distribution_exprs.append( - (distribution_by_code[2] * TAXABLE_403B_DISTRIBUTION_FRACTION).alias( - "taxable_403b_distributions" - ) - ) - if "tax_exempt_403b_distributions" not in result.columns: - retirement_distribution_exprs.append( - 
( - distribution_by_code[2] * (1 - TAXABLE_403B_DISTRIBUTION_FRACTION) - ).alias("tax_exempt_403b_distributions") - ) - if "roth_ira_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[3].alias("roth_ira_distributions") - ) - if "regular_ira_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[4].alias("regular_ira_distributions") - ) - if "taxable_ira_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[4].alias("taxable_ira_distributions") - ) - if "tax_exempt_ira_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[3].alias("tax_exempt_ira_distributions") - ) - if "keogh_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[5].alias("keogh_distributions") - ) - if "taxable_sep_distributions" not in result.columns: - retirement_distribution_exprs.append( - (distribution_by_code[6] * TAXABLE_SEP_DISTRIBUTION_FRACTION).alias( - "taxable_sep_distributions" - ) - ) - if "tax_exempt_sep_distributions" not in result.columns: - retirement_distribution_exprs.append( - ( - distribution_by_code[6] * (1 - TAXABLE_SEP_DISTRIBUTION_FRACTION) - ).alias("tax_exempt_sep_distributions") - ) - if "other_type_retirement_account_distributions" not in result.columns: - retirement_distribution_exprs.append( - distribution_by_code[7].alias( - "other_type_retirement_account_distributions" - ) - ) - result = result.with_columns(retirement_distribution_exprs).drop( - [ - column - for pair in available_retirement_distribution_pairs - for column in pair - ] - ) - - # Split the bundled CPS retirement-contribution total (RETCB_VAL, staged - # as _retirement_contributions) into the five account-type-specific - # desired contribution leaves the eCPS contract requires. 
This mirrors - # PolicyEngine/policyengine-us-data - # policyengine_us_data/datasets/cps/cps.py:1500-1552 exactly: a - # proportional split using IRS SOI / BEA-FRED / Vanguard-PSCA shares - # (see imputation_parameters.yaml). The leaves are "desired" - # (pre-statutory-limit) inputs; PolicyEngine-US applies the limits. - _RETIREMENT_CONTRIBUTION_DESIRED_LEAVES = ( - "self_employed_pension_contributions_desired", - "traditional_401k_contributions_desired", - "roth_401k_contributions_desired", - "traditional_ira_contributions_desired", - "roth_ira_contributions_desired", - ) - _RETIREMENT_CONTRIBUTION_CAPPED_LEAVES = ( - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - ) - if { - "_retirement_contributions", - "wage_income", - "self_employment_income", - }.issubset(set(result.columns)) and any( - leaf not in result.columns - for leaf in ( - _RETIREMENT_CONTRIBUTION_DESIRED_LEAVES - + _RETIREMENT_CONTRIBUTION_CAPPED_LEAVES - ) - ): - retirement_contributions = pl.col("_retirement_contributions") - has_wages = pl.col("wage_income") > 0 - has_se = pl.col("self_employment_income") > 0 - has_earned_income = has_wages | has_se - - # 1) Self-employed pension: a share of the total, gated on SE income. - # No statutory limit applied here (PolicyEngine-US applies it). - se_pension = ( - pl.when(has_se) - .then( - retirement_contributions * SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS - ) - .otherwise(0.0) - ) - remaining = pl.max_horizontal( - retirement_contributions - se_pension, - pl.lit(0.0), - ) - - # 2) Split the remainder into a DC (401k) pool and an IRA pool. - # DC requires an employer, so it is gated on wages; the IRA pool - # takes whatever is left for anyone with earned income. 
- dc_pool = ( - pl.when(has_wages) - .then(remaining * DC_SHARE_OF_RETIREMENT_CONTRIBUTIONS) - .otherwise(0.0) - ) - ira_pool = pl.when(has_earned_income).then(remaining - dc_pool).otherwise(0.0) - traditional_401k_desired = dc_pool * (1 - ROTH_SHARE_OF_DC_CONTRIBUTIONS) - roth_401k_desired = dc_pool * ROTH_SHARE_OF_DC_CONTRIBUTIONS - traditional_ira_desired = ira_pool * TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS - roth_ira_desired = ira_pool * (1 - TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS) - - derived_retirement_columns: list[pl.Expr] = [] - if "self_employed_pension_contributions_desired" not in result.columns: - derived_retirement_columns.append( - se_pension.alias("self_employed_pension_contributions_desired") - ) - # DC pool: traditional/Roth 401(k) split. - if "traditional_401k_contributions_desired" not in result.columns: - derived_retirement_columns.append( - traditional_401k_desired.alias("traditional_401k_contributions_desired") - ) - if "roth_401k_contributions_desired" not in result.columns: - derived_retirement_columns.append( - roth_401k_desired.alias("roth_401k_contributions_desired") - ) - # IRA pool: traditional/Roth IRA split. 
- if "traditional_ira_contributions_desired" not in result.columns: - derived_retirement_columns.append( - traditional_ira_desired.alias("traditional_ira_contributions_desired") - ) - if "roth_ira_contributions_desired" not in result.columns: - derived_retirement_columns.append( - roth_ira_desired.alias("roth_ira_contributions_desired") - ) - limit_year = max( - min(year, max(RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR)), - min(RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR), - ) - limits = RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR[limit_year] - catch_up_eligible = pl.col("age") >= RETIREMENT_CATCH_UP_AGE - limit_401k = pl.lit(float(limits["401k"])) + ( - catch_up_eligible * float(limits["401k_catch_up"]) - ) - limit_ira = pl.lit(float(limits["ira"])) + ( - catch_up_eligible * float(limits["ira_catch_up"]) - ) - capped_se_pension = se_pension - capped_traditional_401k = ( - pl.when(has_wages) - .then(pl.min_horizontal(traditional_401k_desired, limit_401k)) - .otherwise(0.0) - ) - capped_remaining_401k_limit = pl.max_horizontal( - limit_401k - capped_traditional_401k, - pl.lit(0.0), - ) - capped_roth_401k = ( - pl.when(has_wages) - .then( - pl.min_horizontal( - roth_401k_desired, - capped_remaining_401k_limit, - ) - ) - .otherwise(0.0) - ) - capped_traditional_ira = ( - pl.when(has_earned_income) - .then(pl.min_horizontal(traditional_ira_desired, limit_ira)) - .otherwise(0.0) - ) - capped_remaining_ira_limit = pl.max_horizontal( - limit_ira - capped_traditional_ira, - pl.lit(0.0), - ) - capped_roth_ira = ( - pl.when(has_earned_income) - .then( - pl.min_horizontal( - roth_ira_desired, - capped_remaining_ira_limit, - ) - ) - .otherwise(0.0) - ) - capped_retirement_columns = { - "self_employed_pension_contributions": capped_se_pension, - "traditional_401k_contributions": capped_traditional_401k, - "roth_401k_contributions": capped_roth_401k, - "traditional_ira_contributions": capped_traditional_ira, - "roth_ira_contributions": capped_roth_ira, - } - for column, expression in 
capped_retirement_columns.items(): - if column not in result.columns: - derived_retirement_columns.append(expression.alias(column)) - result = result.with_columns(derived_retirement_columns).drop( - "_retirement_contributions" - ) - elif "_retirement_contributions" in result.columns: - result = result.drop("_retirement_contributions") - disability_columns = [ - column for column in PERSON_CPS_DISABILITY_COLUMNS if column in result.columns - ] - if disability_columns: - # eCPS difficulty_* leaves: PEDIS{X} == 1 -> True. Built from the staging - # columns before they are dropped below (the same staging feeds - # is_disabled). These are exported as eCPS dataset columns. - difficulty_exprs = [ - (pl.col(staging) == 1).alias(leaf) - for staging, leaf in PERSON_CPS_DIFFICULTY_LEAVES.items() - if staging in result.columns and leaf not in result.columns - ] - if difficulty_exprs: - result = result.with_columns(difficulty_exprs) - if disability_columns and "is_disabled" not in result.columns: - result = result.with_columns( - pl.any_horizontal( - *[(pl.col(column) == 1) for column in disability_columns] - ).alias("is_disabled") - ).drop(disability_columns) - elif disability_columns: - result = result.drop(disability_columns) - if { - "_disability_income_1", - "_disability_income_code_1", - "_disability_income_2", - "_disability_income_code_2", - }.issubset(set(result.columns)) and "disability_benefits" not in result.columns: - result = result.with_columns( - ( - pl.when( - pl.col("_disability_income_code_1") != WORKERS_COMP_DISABILITY_CODE - ) - .then(pl.col("_disability_income_1")) - .otherwise(0) - + pl.when( - pl.col("_disability_income_code_2") != WORKERS_COMP_DISABILITY_CODE - ) - .then(pl.col("_disability_income_2")) - .otherwise(0) - ).alias("disability_benefits") - ).drop( - [ - "_disability_income_1", - "_disability_income_code_1", - "_disability_income_2", - "_disability_income_code_2", - ] - ) - else: - drop_columns = [ - column - for column in ( - 
"_disability_income_1", - "_disability_income_code_1", - "_disability_income_2", - "_disability_income_code_2", - ) - if column in result.columns - ] - if drop_columns: - result = result.drop(drop_columns) - if "_receives_wic" in result.columns and "receives_wic" not in result.columns: - result = result.with_columns( - (pl.col("_receives_wic") == 1).alias("receives_wic") - ).drop("_receives_wic") - elif "_receives_wic" in result.columns: - result = result.drop("_receives_wic") - if ( - "spm_unit_capped_housing_subsidy_reported" in result.columns - and "receives_housing_assistance" not in result.columns - ): - result = result.with_columns( - (pl.col("spm_unit_capped_housing_subsidy_reported") > 0).alias( - "receives_housing_assistance" - ) - ) - if ( - "receives_housing_assistance" in result.columns - and "takes_up_housing_assistance_if_eligible" not in result.columns - ): - result = result.with_columns( - pl.col("receives_housing_assistance").alias( - "takes_up_housing_assistance_if_eligible" - ) - ) - # Unmarried partner of the household head (G8). Mirrors eCPS cps.py:1219 - # `perrp.isin(PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES)`. - if ( - "_person_relationship_to_householder" in result.columns - and "is_unmarried_partner_of_household_head" not in result.columns - ): - result = result.with_columns( - pl.col("_person_relationship_to_householder") - .is_in(PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES) - .alias("is_unmarried_partner_of_household_head") - ).drop("_person_relationship_to_householder") - elif "_person_relationship_to_householder" in result.columns: - result = result.drop("_person_relationship_to_householder") - # Employer-sponsored insurance policyholder + premium (G6). Mirrors eCPS - # cps.py:1576-1581: the policyholder flag is `NOW_OWNGRP == 1`, and the - # premium is `impute_employer_sponsored_insurance_premiums(person)` - # (eCPS cps.py:229-273), reproduced here on the renamed CPS columns. 
- _esi_source_columns = {"_now_owngrp", "_now_hipaid", "_now_grpftyp"} - if _esi_source_columns.issubset(set(result.columns)): - own_esi = pl.col("_now_owngrp") == ESI_HAS_CURRENT_OWN_COVERAGE - premium_status = pl.col("_now_hipaid") - plan_type = pl.col("_now_grpftyp") - if "reported_owns_employer_sponsored_health_insurance_at_interview" not in ( - result.columns - ): - result = result.with_columns( - own_esi.alias( - "reported_owns_employer_sponsored_health_insurance_at_interview" - ) - ) - if "employer_sponsored_insurance_premiums" not in result.columns: - # Employee-paid premium (PHIP_VAL), clipped at zero like eCPS. - employee_paid = ( - pl.when(pl.col("health_insurance_premiums_without_medicare_part_b") > 0) - .then(pl.col("health_insurance_premiums_without_medicare_part_b")) - .otherwise(0.0) - if "health_insurance_premiums_without_medicare_part_b" in result.columns - else pl.lit(0.0) - ) - total_premium = ( - pl.when(plan_type == ESI_SELF_ONLY_PLAN) - .then(ESI_PLAN_PRIORS_2024["self_only"]["total_premium"]) - .otherwise(ESI_PLAN_PRIORS_2024["family"]["total_premium"]) - ) - average_employee_contribution = ( - pl.when(plan_type == ESI_SELF_ONLY_PLAN) - .then(ESI_PLAN_PRIORS_2024["self_only"]["employee_contribution"]) - .otherwise(ESI_PLAN_PRIORS_2024["family"]["employee_contribution"]) - ) - employee_share = ( - pl.when(employee_paid > 0) - .then(employee_paid) - .otherwise(average_employee_contribution) - ) - employer_paid_when_some = (total_premium - employee_share).clip( - lower_bound=0.0 - ) - employer_paid = ( - pl.when(premium_status == ESI_EMPLOYER_PAYS_ALL) - .then(total_premium) - .when(premium_status == ESI_EMPLOYER_PAYS_SOME) - .then(employer_paid_when_some) - .otherwise(0.0) - ) - valid_owner_with_plan = own_esi & plan_type.is_in( - [ESI_FAMILY_PLAN, ESI_SELF_ONLY_PLAN] - ) - result = result.with_columns( - pl.when(valid_owner_with_plan) - .then(employer_paid) - .otherwise(0.0) - .alias("employer_sponsored_insurance_premiums") - ) - result = 
result.drop([c for c in _esi_source_columns if c in result.columns]) - else: - result = result.drop([c for c in _esi_source_columns if c in result.columns]) - for value_column in PERSON_ZERO_DEFAULT_VALUE_COLUMNS: - if value_column not in result.columns: - result = result.with_columns(pl.lit(0.0).alias(value_column)) - for bool_column in ( - "has_medicare", - "has_medicaid", - "has_esi", - "has_marketplace_health_coverage", - "receives_wic", - ): - if bool_column in result.columns: - result = result.with_columns((pl.col(bool_column) == 1).alias(bool_column)) - if ( - "has_medicare" in result.columns - and "takes_up_medicare_if_eligible" not in result.columns - ): - result = result.with_columns( - pl.col("has_medicare").alias("takes_up_medicare_if_eligible") - ) - if "weeks_unemployed" in result.columns: - result = result.with_columns( - pl.when(pl.col("weeks_unemployed") == -1) - .then(0) - .otherwise(pl.col("weeks_unemployed")) - .alias("weeks_unemployed") - ) - for col in PERSON_NONNEGATIVE_VALUE_COLUMNS: - if col in result.columns: - result = result.with_columns( - pl.when(pl.col(col) < 0).then(0).otherwise(pl.col(col)).alias(col) - ) - if ( - "marital_status" in result.columns - and "is_surviving_spouse" not in result.columns - ): - result = result.with_columns( - (pl.col("marital_status") == 4).alias("is_surviving_spouse") - ) - if "marital_status" in result.columns and "is_separated" not in result.columns: - result = result.with_columns( - (pl.col("marital_status") == 6).alias("is_separated") - ) - if {"household_id", "person_number", "spouse_person_number"}.issubset( - result.columns - ) and "marital_unit_id" not in result.columns: - raw_marital_unit_id = pl.col("household_id").cast( - pl.Int64 - ) * 1_000_000 + pl.max_horizontal( - pl.col("person_number").cast(pl.Int64), - pl.col("spouse_person_number").fill_null(0).cast(pl.Int64), - ) - result = result.with_columns( - raw_marital_unit_id.rank("dense").cast(pl.Int64).alias("marital_unit_id") - ) - - # Add 
year - result = result.with_columns(pl.lit(year).alias("year")) - - return result - - -def _attach_cps_ssn_card_type( - *, - persons: pl.DataFrame, - households: pl.DataFrame, - persons_raw: pl.DataFrame, -) -> pl.DataFrame: - """Derive PE-style CPS SSN card types from raw CPS columns.""" - if "ssn_card_type" in persons.columns: - return persons - - fallback = persons.with_columns(pl.lit("CITIZEN").alias("ssn_card_type")) - required_person_columns = { - "PRCITSHP", - "PEINUSYR", - "PENATVTY", - "A_HSCOL", - "A_AGE", - "A_MARITL", - "A_SPOUSE", - "MCARE", - "CAID", - "PEN_SC1", - "PEN_SC2", - "RESNSS1", - "RESNSS2", - "IHSFLG", - "CHAMPVA", - "MIL", - "PEIO1COW", - "A_MJOCC", - "SS_YN", - "SPM_ID", - "SPM_CAPHOUSESUB", - "PEAFEVER", - "SSI_YN", - "WSAL_VAL", - "SEMP_VAL", - } - if not required_person_columns.issubset(set(persons_raw.columns)): - return fallback - if not {"household_id"}.issubset(set(persons.columns)): - return fallback - if not {"household_id", "household_weight"}.issubset(set(households.columns)): - return fallback - if len(persons_raw) != len(persons): - return fallback - - household_weights = households.select( - ["household_id", "household_weight"] - ).to_pandas() - household_weight_map = dict( - zip( - pd.to_numeric(household_weights["household_id"], errors="coerce"), - pd.to_numeric( - household_weights["household_weight"], errors="coerce" - ).fillna(0.0), - ) - ) - person_household_ids = pd.to_numeric( - persons["household_id"].to_pandas(), - errors="coerce", - ) - person_weights = ( - person_household_ids.map(household_weight_map).fillna(0.0).to_numpy() - ) - - raw = persons_raw.select(sorted(required_person_columns)).to_pandas() - - def numeric_series(column: str, default: float = 0.0) -> pd.Series: - return pd.to_numeric(raw[column], errors="coerce").fillna(default) - - def select_random_subset_to_target( - eligible_ids: np.ndarray, - current_weighted: float, - target_weighted: float, - *, - random_seed: int, - ) -> np.ndarray: - if 
len(eligible_ids) == 0: - return np.array([], dtype=int) - - if current_weighted > target_weighted: - excess_weighted = current_weighted - target_weighted - total_reassignable_weight = float(np.sum(person_weights[eligible_ids])) - if total_reassignable_weight <= 0: - return np.array([], dtype=int) - share_to_move = min(excess_weighted / total_reassignable_weight, 1.0) - rng = np.random.default_rng(seed=random_seed) - random_draw = rng.random(len(eligible_ids)) - return eligible_ids[random_draw < share_to_move] - - needed_weighted = target_weighted - current_weighted - total_weight = float(np.sum(person_weights[eligible_ids])) - if total_weight <= 0: - return np.array([], dtype=int) - share_to_move = min(needed_weighted / total_weight, 1.0) - rng = np.random.RandomState(random_seed) - n_to_move = int(len(eligible_ids) * share_to_move) - if n_to_move <= 0: - return np.array([], dtype=int) - return rng.choice( - eligible_ids, - size=n_to_move, - replace=False, - ) - - prcitshp = numeric_series("PRCITSHP").astype(int) - peinusyr = numeric_series("PEINUSYR").astype(int) - birth_country = numeric_series("PENATVTY").astype(int) - age = numeric_series("A_AGE").astype(int) - marital = numeric_series("A_MARITL").astype(int) - spouse_pointer = numeric_series("A_SPOUSE").astype(int) - medicare = numeric_series("MCARE").astype(int) - medicaid = numeric_series("CAID").astype(int) - pension_source_1 = numeric_series("PEN_SC1").astype(int) - pension_source_2 = numeric_series("PEN_SC2").astype(int) - social_security_reason_1 = numeric_series("RESNSS1").astype(int) - social_security_reason_2 = numeric_series("RESNSS2").astype(int) - ihs = numeric_series("IHSFLG").astype(int) - champva = numeric_series("CHAMPVA").astype(int) - military_insurance = numeric_series("MIL").astype(int) - class_of_worker = numeric_series("PEIO1COW").astype(int) - major_occupation = numeric_series("A_MJOCC").astype(int) - social_security_recipient = numeric_series("SS_YN").astype(int) - spm_unit_id = 
numeric_series("SPM_ID") - capped_housing_subsidy = numeric_series("SPM_CAPHOUSESUB") - veteran = numeric_series("PEAFEVER").astype(int) - ssi_recipient = numeric_series("SSI_YN").astype(int) - wage_income = numeric_series("WSAL_VAL") - self_employment_income = numeric_series("SEMP_VAL") - student_status = numeric_series("A_HSCOL").astype(int) - - ssn_card_type = np.zeros(len(raw), dtype=np.int64) - citizens_mask = prcitshp.isin([1, 2, 3, 4]).to_numpy() - noncitizens = prcitshp.eq(5).to_numpy() - ssn_card_type[citizens_mask] = 1 - - potentially_undocumented = ~np.isin(ssn_card_type, [1, 2]) - arrived_before_1982 = peinusyr.isin([1, 2, 3, 4, 5, 6, 7]).to_numpy() - is_naturalized = prcitshp.eq(4).to_numpy() - is_adult = age.ge(18).to_numpy() - has_five_plus_years = peinusyr.isin(list(range(8, 27))).to_numpy() - has_three_plus_years = peinusyr.isin(list(range(8, 28))).to_numpy() - is_married = marital.isin([1, 2]).to_numpy() & spouse_pointer.gt(0).to_numpy() - eligible_naturalized = ( - is_naturalized - & is_adult - & (has_five_plus_years | (has_three_plus_years & is_married)) - ) - has_medicare = medicare.eq(1).to_numpy() - has_federal_pension = ( - pension_source_1.isin([3]).to_numpy() | pension_source_2.isin([3]).to_numpy() - ) - has_ss_disability = ( - social_security_reason_1.isin([2]).to_numpy() - | social_security_reason_2.isin([2]).to_numpy() - ) - has_ihs = ihs.eq(1).to_numpy() - has_medicaid = medicaid.eq(1).to_numpy() - has_champva = champva.eq(1).to_numpy() - has_military_insurance = military_insurance.eq(1).to_numpy() - is_government_worker = class_of_worker.isin([1, 2, 3]).to_numpy() - is_military_occupation = major_occupation.eq(11).to_numpy() - is_government_employee = is_government_worker | is_military_occupation - has_social_security = social_security_recipient.eq(1).to_numpy() - spm_housing_map = ( - pd.DataFrame( - { - "SPM_ID": spm_unit_id, - "SPM_CAPHOUSESUB": capped_housing_subsidy, - } - ) - .dropna(subset=["SPM_ID"]) - .groupby("SPM_ID", 
sort=False)["SPM_CAPHOUSESUB"] - .max() - ) - has_housing_assistance = spm_unit_id.map(spm_housing_map).fillna(0).gt(0).to_numpy() - is_military_connected = veteran.eq(1).to_numpy() | is_military_occupation - has_ssi = ssi_recipient.eq(1).to_numpy() - - assumed_documented = ( - arrived_before_1982 - | eligible_naturalized - | has_medicare - | has_federal_pension - | has_ss_disability - | has_ihs - | has_medicaid - | has_champva - | has_military_insurance - | is_government_employee - | has_social_security - | has_housing_assistance - | is_military_connected - | has_ssi - ) - ssn_card_type[potentially_undocumented & assumed_documented] = 3 - - worker_mask = ( - (ssn_card_type != 3) - & noncitizens - & ((wage_income.gt(0).to_numpy()) | (self_employment_income.gt(0).to_numpy())) - ) - student_mask = (ssn_card_type != 3) & noncitizens & student_status.eq(2).to_numpy() - - worker_ids = np.flatnonzero(worker_mask) - selected_workers = select_random_subset_to_target( - worker_ids, - current_weighted=float(np.sum(person_weights[worker_ids])), - target_weighted=PE_CPS_UNDOCUMENTED_WORKERS_TARGET, - random_seed=0, - ) - student_ids = np.flatnonzero(student_mask) - selected_students = select_random_subset_to_target( - student_ids, - current_weighted=float(np.sum(person_weights[student_ids])), - target_weighted=PE_CPS_UNDOCUMENTED_STUDENTS_TARGET, - random_seed=1, - ) - ssn_card_type[selected_workers] = 2 - ssn_card_type[selected_students] = 2 - - current_undocumented = float(np.sum(person_weights[ssn_card_type == 0])) - if current_undocumented < PE_CPS_UNDOCUMENTED_TARGET: - mixed_household_candidates: list[int] = [] - household_values = person_household_ids.to_numpy() - for household_id in pd.unique(household_values): - household_mask = household_values == household_id - household_codes = ssn_card_type[household_mask] - if not (np.any(household_codes == 0) and np.any(household_codes == 3)): - continue - household_indices = np.flatnonzero(household_mask) - 
mixed_household_candidates.extend( - household_indices[household_codes == 3].tolist() - ) - if mixed_household_candidates: - selected_indices = select_random_subset_to_target( - np.asarray(mixed_household_candidates, dtype=int), - current_weighted=current_undocumented, - target_weighted=PE_CPS_UNDOCUMENTED_TARGET, - random_seed=100, - ) - ssn_card_type[selected_indices] = 0 - - code_to_str = { - 0: "NONE", - 1: "CITIZEN", - 2: "NON_CITIZEN_VALID_EAD", - 3: "OTHER_NON_CITIZEN", - } - has_valid_ssn = ssn_card_type == 1 - taxpayer_id_type = np.where( - has_valid_ssn, - "VALID_SSN", - np.where(ssn_card_type != 0, "OTHER_TIN", "NONE"), - ) - immigration_status = _derive_cps_immigration_status( - ssn_card_type=ssn_card_type, - birth_country=birth_country.to_numpy(), - peinusyr=peinusyr.to_numpy(), - age=age.to_numpy(), - year=int(persons["year"][0]) - if "year" in persons.columns and len(persons) > 0 - else 2024, - ) - return persons.with_columns( - [ - pl.Series( - "ssn_card_type", - pd.Series(ssn_card_type).map(code_to_str).tolist(), - ), - pl.Series("has_valid_ssn", has_valid_ssn), - pl.Series("taxpayer_id_type", taxpayer_id_type.tolist()), - pl.Series("immigration_status_str", immigration_status.tolist()), - ] - ) - - -def _derive_cps_immigration_status( - *, - ssn_card_type: np.ndarray, - birth_country: np.ndarray, - peinusyr: np.ndarray, - age: np.ndarray, - year: int, -) -> np.ndarray: - """Approximate eCPS immigration-status tags from CPS ASEC citizenship inputs.""" - - arrival_year_map = { - 1: 1950, - 2: 1955, - 3: 1960, - 4: 1965, - 5: 1970, - 6: 1975, - 7: 1980, - 8: 1982, - 9: 1984, - 10: 1986, - 11: 1988, - 12: 1990, - 13: 1992, - 14: 1994, - 15: 1996, - 16: 1998, - 17: 2000, - 18: 2002, - 19: 2004, - 20: 2006, - 21: 2008, - 22: 2010, - 23: 2012, - 24: 2014, - 25: 2017, - 26: 2019, - 27: 2021, - 28: 2023, - 29: 2024, - } - arrival_years = pd.Series(peinusyr).map(arrival_year_map).fillna(2024).to_numpy() - years_in_us = year - arrival_years - age_at_entry = 
np.maximum(0, age - years_in_us) - - result = np.full(len(ssn_card_type), "LEGAL_PERMANENT_RESIDENT", dtype="U32") - result[ssn_card_type == 1] = "CITIZEN" - - arrived_before_1982 = np.isin(peinusyr, [1, 2, 3, 4, 5, 6, 7]) - result[(ssn_card_type == 0) & ~arrived_before_1982] = "UNDOCUMENTED" - - cofa_birth_country_codes = {511, 512} - cuban_haitian_birth_country_codes = {327, 332} - result[ - (ssn_card_type != 0) & np.isin(birth_country, list(cofa_birth_country_codes)) - ] = "LEGAL_PERMANENT_RESIDENT" - result[ - (ssn_card_type != 0) - & np.isin(birth_country, list(cuban_haitian_birth_country_codes)) - & (arrival_years >= 1980) - ] = "CUBAN_HAITIAN_ENTRANT" - result[ - (ssn_card_type == 2) - & (arrival_years <= 2007) - & (age_at_entry < 16) - & (age >= 15) - ] = "DACA" - result[(ssn_card_type == 3) & (years_in_us <= 5)] = "REFUGEE" - result[(ssn_card_type == 2) & (result == "LEGAL_PERMANENT_RESIDENT")] = "TPS" - return result - - -def _processed_persons_have_household_geography(persons: pl.DataFrame) -> bool: - """Whether cached processed person data can derive household geography.""" - required_columns = set(PERSON_CACHE_REQUIRED_COLUMNS) - if not required_columns.issubset(set(persons.columns)): - return False - return len(persons["state_fips"].drop_nulls()) > 0 - - -def _process_households(df: pl.DataFrame, year: int) -> pl.DataFrame: - """Process raw household file into clean format.""" - selected = [ - pl.col(census_name).alias(our_name) - for census_name, our_name in HOUSEHOLD_VARIABLES.items() - if census_name in df.columns - ] - if not selected: - raise ValueError("No recognized variables found in household file") - result = df.select(selected) - - # Scale weights: CPS ASEC weights have 2 implied decimal places - if "household_weight" in result.columns: - result = result.with_columns( - (pl.col("household_weight") / 100).alias("household_weight") - ) - - result = result.with_columns(pl.lit(year).alias("year")) - - return result - - -def 
_attach_household_geography_to_persons( - *, - persons: pl.DataFrame, - households: pl.DataFrame, -) -> pl.DataFrame: - """Propagate household geography onto cached person rows when needed.""" - if "household_id" not in households.columns: - return persons - geography_columns = [ - column - for column in ("state_fips", "county_fips") - if column in households.columns - ] - if not geography_columns: - return persons - joined = persons.join( - households.select(["household_id", *geography_columns]).rename( - {column: f"_household_{column}" for column in geography_columns} - ), - on="household_id", - how="left", - ) - for column in geography_columns: - household_column = f"_household_{column}" - if column in joined.columns: - joined = joined.with_columns( - pl.coalesce(column, household_column).alias(column) - ) - else: - joined = joined.with_columns(pl.col(household_column).alias(column)) - joined = joined.drop(household_column) - return joined - - -def _derive_households(persons: pl.DataFrame) -> pl.DataFrame: - """Derive household-level data from person records.""" - if "household_id" not in persons.columns: - raise ValueError("Cannot derive households without household_id") - - aggregations = [ - pl.len().alias("household_size"), - pl.col("weight").first().alias("household_weight"), - ] - if "state_fips" in persons.columns: - aggregations.append(pl.col("state_fips").first().alias("state_fips")) - else: - aggregations.append(pl.lit(None).alias("state_fips")) - if "county_fips" in persons.columns: - aggregations.append(pl.col("county_fips").first().alias("county_fips")) - else: - aggregations.append(pl.lit(None).alias("county_fips")) - if "total_person_income" in persons.columns: - aggregations.append( - pl.col("total_person_income").sum().alias("household_total_income") - ) - else: - aggregations.append(pl.lit(0).alias("household_total_income")) - if "is_child" in persons.columns: - aggregations.append(pl.col("is_child").sum().alias("num_children")) - else: - 
aggregations.append(pl.lit(0).alias("num_children")) - if "is_adult" in persons.columns: - aggregations.append(pl.col("is_adult").sum().alias("num_adults")) - else: - aggregations.append(pl.lit(0).alias("num_adults")) - - households = persons.group_by("household_id").agg(aggregations) - - if "year" in persons.columns: - year_val = persons.select("year").unique().to_series()[0] - households = households.with_columns(pl.lit(year_val).alias("year")) - - return households - - -def get_available_years() -> list[int]: - """Return list of available CPS ASEC years.""" - return sorted(CPS_URLS.keys()) diff --git a/src/microplex_us/data_sources/cps_age.py b/src/microplex_us/data_sources/cps_age.py deleted file mode 100644 index 3492488e..00000000 --- a/src/microplex_us/data_sources/cps_age.py +++ /dev/null @@ -1,46 +0,0 @@ -"""CPS ASEC age recodes.""" - -import warnings - -import numpy as np -import polars as pl - -CPS_AGE_80_84_RANDOMIZATION_KEY = "age_randomization_80_84" - - -def _stable_string_hash(value: str) -> np.uint64: - """Return a deterministic hash compatible with policyengine-us-data.""" - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning) - result = np.uint64(0) - for byte in value.encode("utf-8"): - result = result * np.uint64(31) + np.uint64(byte) - result = result ^ (result >> np.uint64(33)) - result = result * np.uint64(0xFF51AFD7ED558CCD) - result = result ^ (result >> np.uint64(33)) - return result - - -def cps_seeded_rng(key: str) -> np.random.Generator: - """Create a deterministic CPS recode RNG without importing us-data.""" - seed = int(_stable_string_hash(key)) % (2**63) - return np.random.default_rng(seed=seed) - - -def randomize_cps_topcoded_age_80_84( - frame: pl.DataFrame, - *, - age_column: str = "age", -) -> pl.DataFrame: - """Spread CPS A_AGE==80, meaning ages 80-84, across integer ages 80-84.""" - if age_column not in frame.columns: - return frame - ages = 
frame[age_column].to_numpy().astype(np.int64, copy=True) - age_80 = ages == 80 - if not age_80.any(): - return frame - - rng = cps_seeded_rng(CPS_AGE_80_84_RANDOMIZATION_KEY) - draws = rng.integers(80, 85, len(ages), dtype=np.int64) - ages[age_80] = draws[age_80] - return frame.with_columns(pl.Series(age_column, ages)) diff --git a/src/microplex_us/data_sources/cps_mappings.py b/src/microplex_us/data_sources/cps_mappings.py deleted file mode 100644 index de755b94..00000000 --- a/src/microplex_us/data_sources/cps_mappings.py +++ /dev/null @@ -1,452 +0,0 @@ -""" -CPS ASEC -> policyengine-us variable mappings. - -Maps Census CPS columns to statute-defined variables in policyengine-us. -Each mapping documents: -- The policyengine-us variable it maps to -- The statutory reference (USC section) -- CPS columns used -- Coverage level (full, partial, derived, none) -- Gaps (what the statute requires that CPS can't provide) -""" - -from dataclasses import dataclass, field -from enum import Enum - -import polars as pl - -from microplex_us.data_sources.cps_age import randomize_cps_topcoded_age_80_84 - - -class CoverageLevel(Enum): - """How well CPS covers a policyengine-us variable.""" - - FULL = "full" # CPS provides all required data - PARTIAL = "partial" # CPS provides some, with known gaps - DERIVED = "derived" # Must be computed from other CPS variables - NONE = "none" # CPS doesn't have this data - - -@dataclass -class CoverageGap: - """A component required by statute but not available in CPS.""" - - component: str - statute_ref: str - impact: str # "high", "medium", "low" - notes: str - - -@dataclass -class VariableMapping: - """Metadata for a CPS -> policyengine-us variable mapping.""" - - policyengine_us_variable: str - statute_ref: str - cps_columns: list[str] - coverage: CoverageLevel - entity: str # "Person", "TaxUnit", "Household" - gaps: list[CoverageGap] = field(default_factory=list) - expected_gap_pct: float | None = None # Expected % undercount vs SOI - notes: 
str = "" - - -# ============================================================================= -# MAPPING REGISTRY -# ============================================================================= - -_MAPPINGS: dict[str, VariableMapping] = {} - - -def _register(mapping: VariableMapping) -> VariableMapping: - """Register a mapping in the global registry.""" - _MAPPINGS[mapping.policyengine_us_variable] = mapping - return mapping - - -# ============================================================================= -# DIRECT MAPPINGS (Full Coverage) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="age", - statute_ref="26 USC 63(f), 24(c)(1)", - cps_columns=["A_AGE"], - coverage=CoverageLevel.FULL, - entity="Person", - notes="CPS A_AGE is as of survey date (March); A_AGE==80 represents ages 80-84.", - ) -) - - -def map_age(persons: pl.DataFrame) -> pl.DataFrame: - """Map CPS A_AGE to age, spreading the 80-84 topcode.""" - return randomize_cps_topcoded_age_80_84( - persons.with_columns(pl.col("A_AGE").alias("age")) - ) - - -_register( - VariableMapping( - policyengine_us_variable="household_size", - statute_ref="7 USC 2014(c)", - cps_columns=["H_NUMPER"], - coverage=CoverageLevel.FULL, - entity="Household", - notes="Direct mapping for SNAP household size.", - ) -) - - -def map_household_size(households: pl.DataFrame) -> pl.DataFrame: - """Map CPS H_NUMPER to household_size.""" - return households.with_columns(pl.col("H_NUMPER").alias("household_size")) - - -# ============================================================================= -# EARNED INCOME (Full Coverage) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="earned_income", - statute_ref="26 USC 32(c)(2) - Earned income defined", - cps_columns=["WSAL_VAL", "SEMP_VAL"], - coverage=CoverageLevel.FULL, - entity="Person", - notes="Sum 
of wages/salaries and self-employment income. " - "32(c)(2) defines earned income for EITC purposes.", - ) -) - - -def map_earned_income(persons: pl.DataFrame) -> pl.DataFrame: - """ - Map CPS income to earned_income per 32(c)(2). - - Earned income = wages + salaries + tips + self-employment income - """ - return persons.with_columns( - (pl.col("WSAL_VAL").fill_null(0) + pl.col("SEMP_VAL").fill_null(0)).alias( - "earned_income" - ) - ) - - -# ============================================================================= -# FILING STATUS (Derived) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="filing_status", - statute_ref="26 USC 1 (tax rates by status), 2 (definitions)", - cps_columns=["A_MARITL", "A_AGE", "A_EXPRRP"], - coverage=CoverageLevel.DERIVED, - entity="TaxUnit", - gaps=[ - CoverageGap( - component="head_of_household", - statute_ref="26 USC 2(b)", - impact="medium", - notes="Requires determining if taxpayer maintains household for " - "qualifying person. CPS has household relationships but " - "determining HoH status requires assumptions.", - ), - CoverageGap( - component="qualifying_widow", - statute_ref="26 USC 2(a)", - impact="low", - notes="Requires spouse death within 2 years AND dependent child. " - "CPS doesn't track year of spouse's death.", - ), - ], - notes="Simplified mapping: married w/ spouse -> joint, others -> single. " - "Head of household determination requires additional logic.", - ) -) - -# CPS A_MARITL codes -_MARITL_MARRIED_SPOUSE_PRESENT = 1 -_MARITL_MARRIED_SPOUSE_ABSENT = 2 -_MARITL_WIDOWED = 3 -_MARITL_DIVORCED = 4 -_MARITL_SEPARATED = 5 -_MARITL_NEVER_MARRIED = 7 - - -def map_filing_status(persons: pl.DataFrame) -> pl.DataFrame: - """ - Derive filing status from CPS marital status. 
- - Simplified mapping: - - Married, spouse present -> married_joint - - All others -> single - - TODO: Add head_of_household logic based on: - - Unmarried - - Maintains household for qualifying person (child, parent) - - Pays > 50% of household costs - """ - return persons.with_columns( - pl.when(pl.col("A_MARITL") == _MARITL_MARRIED_SPOUSE_PRESENT) - .then(pl.lit("married_joint")) - .otherwise(pl.lit("single")) - .alias("filing_status") - ) - - -# ============================================================================= -# BLINDNESS (Full Coverage) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="is_blind", - statute_ref="26 USC 63(f)(2) - Additional standard deduction for blind", - cps_columns=["PEDISEYE"], - coverage=CoverageLevel.FULL, - entity="Person", - notes="PEDISEYE: 1 = serious difficulty seeing, 2 = no difficulty. " - "Tax definition of blind (63(f)(4)) is more specific " - "(corrected vision <= 20/200 or field <= 20 deg) but CPS proxy is reasonable.", - ) -) - - -def map_is_blind(persons: pl.DataFrame) -> pl.DataFrame: - """Map CPS PEDISEYE to is_blind.""" - return persons.with_columns((pl.col("PEDISEYE") == 1).alias("is_blind")) - - -# ============================================================================= -# IS DEPENDENT (Derived) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="is_dependent", - statute_ref="26 USC 152 - Dependent defined", - cps_columns=["A_EXPRRP", "A_AGE", "WSAL_VAL"], - coverage=CoverageLevel.DERIVED, - entity="Person", - gaps=[ - CoverageGap( - component="support_test", - statute_ref="26 USC 152(c)(1)(D), 152(d)(1)(C)", - impact="medium", - notes="CPS doesn't track who provides >50% support.", - ), - CoverageGap( - component="joint_return_test", - statute_ref="26 USC 152(c)(1)(E)", - impact="low", - notes="Can't determine if dependent 
filed joint return.", - ), - ], - notes="Simplified: children under 19 (or under 24 if student) are dependents. " - "Doesn't verify support test or other 152 requirements.", - ) -) - - -def map_is_dependent(persons: pl.DataFrame) -> pl.DataFrame: - """ - Derive is_dependent from CPS relationships and age. - - Simplified: Person is dependent if: - - Has relationship code indicating child/grandchild - - Age < 19 (or < 24 for students, but we can't identify students easily) - """ - # A_EXPRRP codes for children: 4 = child, 8 = grandchild - return persons.with_columns( - ((pl.col("A_EXPRRP").is_in([4, 8])) & (pl.col("A_AGE") < 19)).alias( - "is_dependent" - ) - ) - - -# ============================================================================= -# CTC QUALIFYING CHILDREN (Derived) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="ctc_qualifying_children", - statute_ref="26 USC 24(c) - Qualifying child (under 17, per 152(c))", - cps_columns=["A_AGE", "A_EXPRRP", "PH_SEQ", "A_LINENO"], - coverage=CoverageLevel.DERIVED, - entity="TaxUnit", - gaps=[ - CoverageGap( - component="citizenship_test", - statute_ref="26 USC 24(c)(2)", - impact="low", - notes="Child must be US citizen/national/resident. " - "CPS has citizenship but we don't filter on it.", - ), - CoverageGap( - component="ssn_requirement", - statute_ref="26 USC 24(h)(7)", - impact="low", - notes="Child must have SSN. Not in CPS.", - ), - ], - notes="Count of children under 17 with qualifying relationship. " - "Age limit is 17 per 24(c)(1): 'has not attained age 17'.", - ) -) - - -def map_ctc_qualifying_children(persons: pl.DataFrame) -> pl.DataFrame: - """ - Count CTC qualifying children per tax unit. - - A child qualifies if: - - Under age 17 (24(c)(1)) - - Is a qualifying child per 152(c) (relationship, age, residency, support) - - Simplified: count children (A_EXPRRP = 4) under 17 in same household. 
- """ - # First, identify qualifying children - persons_with_flag = persons.with_columns( - ( - (pl.col("A_EXPRRP") == 4) # Child of householder - & (pl.col("A_AGE") < 17) # Under 17 - ).alias("_is_ctc_child") - ) - - # Count per household (using PH_SEQ as household ID) - child_counts = persons_with_flag.group_by("PH_SEQ").agg( - pl.col("_is_ctc_child").sum().alias("ctc_qualifying_children") - ) - - # Join back and assign to reference person (A_LINENO = 1 typically) - result = persons_with_flag.join(child_counts, on="PH_SEQ", how="left") - - # Only the tax unit head gets the count; others get 0 - # (Simplified: reference person is tax unit head) - result = result.with_columns( - pl.when(pl.col("A_LINENO") == 1) - .then(pl.col("ctc_qualifying_children")) - .otherwise(0) - .alias("ctc_qualifying_children") - ) - - return result.drop("_is_ctc_child") - - -# ============================================================================= -# AGI PROXY (Partial Coverage) -# ============================================================================= - -_register( - VariableMapping( - policyengine_us_variable="adjusted_gross_income", - statute_ref="26 USC 62(a) - Adjusted gross income defined", - cps_columns=["WSAL_VAL", "SEMP_VAL", "INT_VAL", "DIV_VAL", "PNSN_VAL"], - coverage=CoverageLevel.PARTIAL, - entity="TaxUnit", - expected_gap_pct=0.15, # Expect ~15% undercount vs SOI - gaps=[ - CoverageGap( - component="capital_gains", - statute_ref="26 USC 61(a)(3), 1222", - impact="high", - notes="CPS does not collect capital gains. SOI 2021: ~$1.2T. " - "Major source of income for top brackets.", - ), - CoverageGap( - component="ira_deduction", - statute_ref="26 USC 62(a)(7)", - impact="medium", - notes="Above-the-line IRA deduction not in CPS. 
Our proxy will " - "overstate AGI by this amount.", - ), - CoverageGap( - component="student_loan_interest", - statute_ref="26 USC 62(a)(17)", - impact="low", - notes="Student loan interest deduction not in CPS.", - ), - CoverageGap( - component="self_employment_tax_deduction", - statute_ref="26 USC 62(a)(1)", - impact="medium", - notes="Deductible portion of SE tax not calculable from CPS.", - ), - CoverageGap( - component="interest_dividends_underreporting", - statute_ref="26 USC 61(a)(4), (7)", - impact="medium", - notes="CPS interest/dividends underreported by ~40% vs SOI.", - ), - ], - notes="AGI proxy using available CPS income. Known to undercount vs SOI " - "due to missing capital gains and underreported investment income. " - "Also overstates slightly due to missing above-line deductions.", - ) -) - - -def map_agi_proxy(persons: pl.DataFrame) -> pl.DataFrame: - """ - Construct AGI proxy from available CPS income variables. - - This is NOT true AGI (62). Missing components include: - - Capital gains (1222) - NOT in CPS - - Above-line deductions (62(a)(1)-(21)) - NOT in CPS - - Included (with notes): - - Wages/salaries (61(a)(1)) - WSAL_VAL - - Self-employment (61(a)(2)) - SEMP_VAL - - Interest (61(a)(4)) - INT_VAL (underreported ~40%) - - Dividends (61(a)(7)) - DIV_VAL (underreported ~40%) - - Pensions (61(a)(11)) - PNSN_VAL - """ - income_cols = ["WSAL_VAL", "SEMP_VAL", "INT_VAL", "DIV_VAL", "PNSN_VAL"] - - # Fill nulls with 0 for each column - expr = pl.lit(0) - for col in income_cols: - if col in persons.columns: - expr = expr + pl.col(col).fill_null(0) - - return persons.with_columns(expr.alias("agi_proxy")) - - -# ============================================================================= -# REGISTRY FUNCTIONS -# ============================================================================= - - -def get_mapping_metadata(variable_name: str) -> VariableMapping: - """Get metadata for a variable mapping.""" - # Handle aliases - if variable_name == 
"agi_proxy": - variable_name = "adjusted_gross_income" - - if variable_name not in _MAPPINGS: - raise KeyError(f"No mapping found for '{variable_name}'") - - return _MAPPINGS[variable_name] - - -def get_all_mappings() -> list[VariableMapping]: - """Get all registered variable mappings.""" - return list(_MAPPINGS.values()) - - -def coverage_summary() -> dict[str, list[str]]: - """ - Summarize coverage by level. - - Returns dict mapping coverage level to list of variable names. - """ - result = {level.value: [] for level in CoverageLevel} - - for mapping in _MAPPINGS.values(): - result[mapping.coverage.value].append(mapping.policyengine_us_variable) - - return result diff --git a/src/microplex_us/data_sources/cps_transform.py b/src/microplex_us/data_sources/cps_transform.py deleted file mode 100644 index 901e6e0f..00000000 --- a/src/microplex_us/data_sources/cps_transform.py +++ /dev/null @@ -1,326 +0,0 @@ -""" -Transform CPS data to policyengine-us variables. - -Applies all CPS -> policyengine-us mappings and constructs tax units. 
-""" - -from dataclasses import dataclass, field - -import polars as pl - -from microplex_us.data_sources.cps import CPSDataset -from microplex_us.data_sources.cps_age import randomize_cps_topcoded_age_80_84 -from microplex_us.data_sources.cps_mappings import ( - CoverageLevel, - get_all_mappings, -) - - -@dataclass -class TransformedDataset: - """CPS data transformed to policyengine-us variables.""" - - persons: pl.DataFrame - tax_units: pl.DataFrame - households: pl.DataFrame - year: int - source: str = "CPS ASEC" - - coverage_report: dict = field(default_factory=dict) - - def summary(self) -> dict: - """Summary statistics.""" - return { - "n_persons": len(self.persons), - "n_tax_units": len(self.tax_units), - "n_households": len(self.households), - "year": self.year, - "total_weight": float(self.tax_units["weight"].sum()), - } - - -def transform_cps_to_policyengine(cps: CPSDataset) -> TransformedDataset: - """ - Transform CPS data to policyengine-us variables. - - Steps: - 1. Apply person-level mappings (age, earned_income, is_blind, etc.) - 2. Construct tax units from households - 3. Apply tax-unit-level mappings (filing_status, agi_proxy, etc.) - 4. 
Generate coverage report - - Args: - cps: Raw CPS dataset - - Returns: - TransformedDataset with persons, tax_units, and coverage metadata - """ - # Step 1: Person-level transforms - persons = _transform_persons(cps.persons) - - # Step 2: Construct tax units - tax_units = _construct_tax_units(persons, cps.households) - - # Step 3: Generate coverage report - coverage_report = _generate_coverage_report() - - return TransformedDataset( - persons=persons, - tax_units=tax_units, - households=cps.households, - year=cps.year, - source=cps.source, - coverage_report=coverage_report, - ) - - -def _transform_persons(persons: pl.DataFrame) -> pl.DataFrame: - """Apply person-level variable mappings.""" - result = persons.clone() - - # Age - if "A_AGE" in result.columns: - result = randomize_cps_topcoded_age_80_84( - result.with_columns(pl.col("A_AGE").alias("age")) - ) - - # Earned income (wages + self-employment) - wage_col = "WSAL_VAL" if "WSAL_VAL" in result.columns else "wage_income" - semp_col = "SEMP_VAL" if "SEMP_VAL" in result.columns else "self_employment_income" - - if wage_col in result.columns or semp_col in result.columns: - wage_expr = ( - pl.col(wage_col).fill_null(0) if wage_col in result.columns else pl.lit(0) - ) - semp_expr = ( - pl.col(semp_col).fill_null(0) if semp_col in result.columns else pl.lit(0) - ) - - # Earned income for EITC is non-negative - result = result.with_columns( - (wage_expr + semp_expr).clip(lower_bound=0).alias("earned_income") - ) - - # Is blind - if "PEDISEYE" in result.columns: - result = result.with_columns((pl.col("PEDISEYE") == 1).alias("is_blind")) - else: - result = result.with_columns(pl.lit(False).alias("is_blind")) - - # Is dependent (child under 19 with child relationship) - if "A_EXPRRP" in result.columns and "age" in result.columns: - result = result.with_columns( - ( - (pl.col("A_EXPRRP").is_in([4, 8])) # Child or grandchild - & (pl.col("age") < 19) - ).alias("is_dependent") - ) - else: - result = 
result.with_columns(pl.lit(False).alias("is_dependent")) - - return result - - -def _construct_tax_units( - persons: pl.DataFrame, households: pl.DataFrame -) -> pl.DataFrame: - """ - Construct tax units from persons. - - Simplified tax unit construction: - - Each household becomes one tax unit - - Reference person is the filer - - Filing status based on marital status - - Aggregate income to tax unit level - - In reality, a household can have multiple tax units (e.g., adult children - who file separately). This is a simplification for v1. - """ - # Get household ID column - hh_id_col = "PH_SEQ" if "PH_SEQ" in persons.columns else "household_id" - - # Identify reference persons (filers) - # A_LINENO = 1 is typically the reference person - lineno_col = "A_LINENO" if "A_LINENO" in persons.columns else None - - # Aggregate earned income to household level - earned_income_agg = persons.group_by(hh_id_col).agg( - pl.col("earned_income").sum().alias("earned_income") - ) - - # Count CTC qualifying children per household - if "age" in persons.columns: - # A child qualifies if under 17 and has child relationship - if "A_EXPRRP" in persons.columns: - persons_with_ctc = persons.with_columns( - ( - (pl.col("A_EXPRRP") == 4) # Child of householder - & (pl.col("age") < 17) - ).alias("_is_ctc_child") - ) - else: - persons_with_ctc = persons.with_columns( - (pl.col("age") < 17).alias("_is_ctc_child") - ) - - ctc_counts = persons_with_ctc.group_by(hh_id_col).agg( - pl.col("_is_ctc_child").sum().alias("ctc_qualifying_children") - ) - else: - ctc_counts = persons.group_by(hh_id_col).agg( - pl.lit(0).alias("ctc_qualifying_children") - ) - - # Get reference person attributes for each household - if lineno_col: - ref_persons = persons.filter(pl.col(lineno_col) == 1) - else: - # Take first person per household as reference - ref_persons = persons.group_by(hh_id_col).first() - - # Determine filing status from marital status - # Check for both raw CPS column (A_MARITL) and processed column 
(marital_status) - if "A_MARITL" in ref_persons.columns: - ref_persons = ref_persons.with_columns( - pl.when(pl.col("A_MARITL") == 1) # Married, spouse present - .then(pl.lit("married_joint")) - .otherwise(pl.lit("single")) - .alias("filing_status") - ) - elif "marital_status" in ref_persons.columns: - # Processed CPS uses marital_status with value 1 = married spouse present - ref_persons = ref_persons.with_columns( - pl.when(pl.col("marital_status") == 1) # Married, spouse present - .then(pl.lit("married_joint")) - .otherwise(pl.lit("single")) - .alias("filing_status") - ) - else: - ref_persons = ref_persons.with_columns(pl.lit("single").alias("filing_status")) - - # Get weight - weight_col = "A_FNLWGT" if "A_FNLWGT" in ref_persons.columns else "weight" - if weight_col not in ref_persons.columns: - weight_col = "weight" - - # Select columns for tax unit - tu_cols = [hh_id_col, "filing_status"] - if "age" in ref_persons.columns: - tu_cols.append("age") - if weight_col in ref_persons.columns: - tu_cols.append(weight_col) - - tax_units = ref_persons.select([c for c in tu_cols if c in ref_persons.columns]) - - # Rename weight column - if weight_col in tax_units.columns and weight_col != "weight": - tax_units = tax_units.rename({weight_col: "weight"}) - - # Join aggregated values - tax_units = tax_units.join(earned_income_agg, on=hh_id_col, how="left") - tax_units = tax_units.join(ctc_counts, on=hh_id_col, how="left") - - # Compute AGI proxy at tax unit level - # For now, just use earned income as base (will add investment income) - tax_units = tax_units.with_columns( - pl.col("earned_income").fill_null(0).alias("agi_proxy") - ) - - # Add investment income if available in persons - int_col = "INT_VAL" if "INT_VAL" in persons.columns else "interest_income" - div_col = "DIV_VAL" if "DIV_VAL" in persons.columns else "dividend_income" - - if int_col in persons.columns or div_col in persons.columns: - invest_agg_exprs = [] - if int_col in persons.columns: - 
invest_agg_exprs.append(pl.col(int_col).fill_null(0).sum().alias("_int")) - if div_col in persons.columns: - invest_agg_exprs.append(pl.col(div_col).fill_null(0).sum().alias("_div")) - - if invest_agg_exprs: - invest_income = persons.group_by(hh_id_col).agg(invest_agg_exprs) - - tax_units = tax_units.join(invest_income, on=hh_id_col, how="left") - - # Add to AGI proxy - int_expr = ( - pl.col("_int").fill_null(0) - if "_int" in tax_units.columns - else pl.lit(0) - ) - div_expr = ( - pl.col("_div").fill_null(0) - if "_div" in tax_units.columns - else pl.lit(0) - ) - - tax_units = tax_units.with_columns( - (pl.col("agi_proxy") + int_expr + div_expr).alias("agi_proxy") - ) - - # Drop temp columns - tax_units = tax_units.drop( - [c for c in ["_int", "_div"] if c in tax_units.columns] - ) - - # Fill nulls - tax_units = tax_units.with_columns( - [ - pl.col("earned_income").fill_null(0), - pl.col("ctc_qualifying_children").fill_null(0), - pl.col("agi_proxy").fill_null(0), - ] - ) - - # Rename household ID to tax_unit_id - tax_units = tax_units.rename({hh_id_col: "tax_unit_id"}) - - return tax_units - - -def _generate_coverage_report() -> dict: - """Generate report on variable coverage.""" - mappings = get_all_mappings() - - full = [] - partial = [] - derived = [] - none = [] - gaps = [] - - for m in mappings: - if m.coverage == CoverageLevel.FULL: - full.append(m.policyengine_us_variable) - elif m.coverage == CoverageLevel.PARTIAL: - partial.append(m.policyengine_us_variable) - elif m.coverage == CoverageLevel.DERIVED: - derived.append(m.policyengine_us_variable) - else: - none.append(m.policyengine_us_variable) - - # Collect gaps - for gap in m.gaps: - gaps.append( - { - "variable": m.policyengine_us_variable, - "component": gap.component, - "statute_ref": gap.statute_ref, - "impact": gap.impact, - "notes": gap.notes, - } - ) - - return { - "full": full, - "partial": partial, - "derived": derived, - "none": none, - "gaps": gaps, - "summary": { - "n_full": len(full), - 
"n_partial": len(partial), - "n_derived": len(derived), - "n_none": len(none), - "n_gaps": len(gaps), - }, - } diff --git a/src/microplex_us/data_sources/donor_surveys.py b/src/microplex_us/data_sources/donor_surveys.py deleted file mode 100644 index 486051bb..00000000 --- a/src/microplex_us/data_sources/donor_surveys.py +++ /dev/null @@ -1,1337 +0,0 @@ -"""Spec-driven donor survey providers aligned with PE-US-data source-impute.""" - -from __future__ import annotations - -import json -import pickle -import subprocess -import tempfile -from collections.abc import Callable -from dataclasses import asdict, dataclass -from pathlib import Path -from textwrap import dedent - -import h5py -import numpy as np -import pandas as pd -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceArchetype, - SourceDescriptor, - SourceQuery, - TimeStructure, - apply_source_query, -) - -from microplex_us.data_sources.sampling import ( - sample_frame_with_state_floor, - sample_frame_without_replacement, -) -from microplex_us.pe_source_impute_specs import ( - PEPolicyengineDatasetLoaderSpec, - PESourceImputeBlockSpec, - apply_pe_source_impute_loader_postprocess, - get_pe_source_impute_block_spec, - resolve_sipp_source_impute_block_spec, -) -from microplex_us.pipelines.pe_native_scores import ( - build_policyengine_us_data_subprocess_env, - resolve_policyengine_us_data_python, - resolve_policyengine_us_data_repo_root, -) -from microplex_us.source_registry import resolve_source_variable_capabilities - -try: - from huggingface_hub import hf_hub_download - - HF_AVAILABLE = True -except ImportError: - HF_AVAILABLE = False - -PERSON_OBSERVATION_EXCLUDED_COLUMNS = ( - "person_id", - "household_id", - "weight", - "year", -) -HOUSEHOLD_OBSERVATION_EXCLUDED_COLUMNS = ( - "household_id", - "household_weight", - "year", -) - -DONOR_UPRATING_EXCLUDED_COLUMNS = { - "person_id", - "household_id", - 
"weight", - "year", - "age", - "sex", - "is_female", - "is_male", - "is_married", - "is_household_head", - "cps_race", - "state_fips", - "tenure", - "tenure_type", - "own_children_in_household", - "count_under_18", - "count_under_6", - "household_size", -} - -DONOR_UPRATING_FACTOR_ALIASES = { - "employment_income": "employment_income_before_lsr", - "income": "employment_income_before_lsr", - "interest_dividend_income": "taxable_interest_income", - "social_security_pension_income": "social_security_retirement", - "scf_certificates_of_deposit": "bank_account_assets", - "scf_savings_bonds": "bond_assets", - "scf_retirement_assets": "net_worth", - "scf_cash_value_life_insurance": "net_worth", - "scf_other_managed_assets": "stock_assets", - "scf_other_financial_assets": "net_worth", - "scf_primary_residence_value": "home_equity", - "scf_other_residential_real_estate": "household_other_real_estate_value", - "scf_nonresidential_real_estate_equity": "household_other_real_estate_equity", - "scf_business_equity": "household_business_assets_equity", - "scf_other_nonfinancial_assets": "net_worth", - "scf_mortgage_debt": "first_home_mortgage_balance", - "scf_other_residential_debt": "household_other_real_estate_debt", - "scf_other_lines_of_credit": "household_vehicles_debt", - "scf_credit_card_debt": "household_vehicles_debt", - "scf_vehicle_installment_debt": "household_vehicles_debt", - "scf_student_loan_debt": "household_vehicles_debt", - "scf_other_installment_debt": "household_vehicles_debt", - "scf_other_debt": "household_vehicles_debt", -} - -TARGET_YEAR_UPRATED_SURVEYS = {"sipp", "scf"} - - -@dataclass(frozen=True) -class DonorSurveyTables: - """Canonical household/person tables for one donor survey block.""" - - households: pd.DataFrame - persons: pd.DataFrame - - -DonorSurveyTablesLoader = Callable[..., DonorSurveyTables] - - -def _descriptor_from_tables( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - name: str, - shareability: Shareability, - archetype: 
SourceArchetype | None, -) -> SourceDescriptor: - household_variables = tuple( - column - for column in households.columns - if column not in HOUSEHOLD_OBSERVATION_EXCLUDED_COLUMNS - ) - person_variables = tuple( - column - for column in persons.columns - if column not in PERSON_OBSERVATION_EXCLUDED_COLUMNS - ) - return SourceDescriptor( - name=name, - shareability=shareability, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=archetype, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=household_variables, - weight_column="household_weight" - if "household_weight" in households.columns - else None, - period_column="year" if "year" in households.columns else None, - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_variables, - weight_column="weight" if "weight" in persons.columns else None, - period_column="year" if "year" in persons.columns else None, - ), - ), - variable_capabilities=resolve_source_variable_capabilities( - name, - (*household_variables, *person_variables), - ), - ) - - -def _build_static_descriptor( - *, - spec: PESourceImputeBlockSpec, - shareability: Shareability, -) -> SourceDescriptor: - return SourceDescriptor( - name=spec.descriptor_name, - shareability=shareability, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=spec.archetype, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=spec.household_variables, - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=spec.person_variables, - weight_column="weight", - period_column="year", - ), - ), - ) - - -def _ensure_person_ids(persons: pd.DataFrame) -> pd.DataFrame: - result = persons.copy() - if "person_id" not in result.columns: - if "household_id" in result.columns: - 
result["person_id"] = ( - result["household_id"].astype(str) - + ":" - + result.groupby("household_id").cumcount().add(1).astype(str) - ) - return result - result["person_id"] = np.arange(len(result)).astype(str) - return result - - if not result["person_id"].duplicated().any(): - return result - - if "household_id" in result.columns: - composite = ( - result["household_id"].astype(str) + ":" + result["person_id"].astype(str) - ) - if not composite.duplicated().any(): - result["person_id"] = composite - return result - result["person_id"] = ( - result["household_id"].astype(str) - + ":" - + result.groupby("household_id").cumcount().add(1).astype(str) - ) - return result - - result["person_id"] = np.arange(len(result)).astype(str) - return result - - -def _sample_households_and_persons( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame]: - households = households.reset_index(drop=True) - persons = persons.reset_index(drop=True) - if sample_n is None or sample_n >= len(households): - return households, persons - sampled_households = _sample_donor_households( - households=households, - persons=persons, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - ) - keep = set(sampled_households["household_id"]) - sampled_persons = persons[persons["household_id"].isin(keep)].copy() - return ( - sampled_households.sort_values(["household_id"]).reset_index(drop=True), - sampled_persons.sort_values(["household_id", "person_id"]).reset_index( - drop=True - ), - ) - - -def _sample_donor_households( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, -) -> pd.DataFrame: - resolved_state_age_floor = int(state_age_floor or 0) - if ( - 
resolved_state_age_floor <= 0 - or "state_fips" not in households.columns - or "age" not in persons.columns - or "household_id" not in households.columns - or "household_id" not in persons.columns - ): - return sample_frame_with_state_floor( - households, - sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - positive_only_when_weighted=True, - ) - - coverage = persons[["household_id", "age"]].merge( - households[["household_id", "state_fips"]], - on="household_id", - how="inner", - ) - coverage["age_band"] = coverage["age"].map(_donor_age_band_key) - coverage["state_fips"] = pd.to_numeric( - coverage["state_fips"], errors="coerce" - ).astype("Int64") - coverage = coverage.dropna(subset=["state_fips", "age_band"]).copy() - if coverage.empty: - return sample_frame_with_state_floor( - households, - sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - positive_only_when_weighted=True, - ) - - rng = np.random.default_rng(random_seed) - selected_ids: set[str | int] = set() - for _, group in coverage.groupby(["state_fips", "age_band"], sort=True): - group_household_ids = pd.Index(group["household_id"].unique()) - already_selected = [hid for hid in group_household_ids if hid in selected_ids] - missing = resolved_state_age_floor - len(already_selected) - if missing <= 0: - continue - available_ids = [hid for hid in group_household_ids if hid not in selected_ids] - if not available_ids: - continue - candidate_households = households[ - households["household_id"].isin(available_ids) - ].copy() - sampled = sample_frame_without_replacement( - candidate_households, - sample_n=min(missing, len(candidate_households)), - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col="household_weight", - positive_only_when_weighted=True, - ) - selected_ids.update(sampled["household_id"].tolist()) - - if sample_n is not None and len(selected_ids) > sample_n: - raise 
ValueError( - "state_age_floor requires more sampled donor households than sample_n allows: " - f"selected={len(selected_ids)}, sample_n={sample_n}" - ) - - if not selected_ids: - return sample_frame_with_state_floor( - households, - sample_n=sample_n, - random_seed=random_seed, - weight_col="household_weight", - state_floor=state_floor, - positive_only_when_weighted=True, - ) - - selected = households[households["household_id"].isin(selected_ids)].copy() - remaining_n = int(sample_n) - len(selected) - if remaining_n <= 0: - return selected - - remainder = households[~households["household_id"].isin(selected_ids)].copy() - remainder_sample = sample_frame_without_replacement( - remainder, - sample_n=remaining_n, - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col="household_weight", - positive_only_when_weighted=True, - ) - return pd.concat([selected, remainder_sample], axis=0, ignore_index=False) - - -def _donor_age_band_key(age: float | int | None) -> str | None: - value = pd.to_numeric(pd.Series([age]), errors="coerce").iloc[0] - if pd.isna(value): - return None - age_int = int(value) - if age_int < 0: - return None - if age_int >= 85: - return "85_plus" - lower = (age_int // 5) * 5 - upper = lower + 5 - return f"{lower}_{upper}" - - -def _build_observation_frame( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - source_name: str, - shareability: Shareability, - archetype: SourceArchetype | None, -) -> ObservationFrame: - normalized_households = households.copy() - normalized_persons = _ensure_person_ids(persons) - descriptor = _descriptor_from_tables( - households=normalized_households, - persons=normalized_persons, - name=source_name, - shareability=shareability, - archetype=archetype, - ) - frame = ObservationFrame( - source=descriptor, - tables={ - EntityType.HOUSEHOLD: normalized_households, - EntityType.PERSON: normalized_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - 
child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - -def _build_policyengine_dataset_loader_script( - spec: PEPolicyengineDatasetLoaderSpec, - *, - year: int, -) -> str: - payload = json.dumps( - { - "year": int(year), - "dataset_loader": asdict(spec), - } - ) - return dedent( - f""" -import importlib -import json -import pickle -import sys -import numpy as np -import pandas as pd - -payload = json.loads({payload!r}) -spec = payload["dataset_loader"] -out_path = sys.argv[1] -sample_n = None if sys.argv[2] == "None" else int(sys.argv[2]) -random_seed = int(sys.argv[3]) - -module = importlib.import_module(spec["module"]) -dataset_cls = getattr(module, spec["class_name"]) -data = dataset_cls().load_dataset() - -def _numeric(values): - return pd.to_numeric(pd.Series(np.asarray(values)), errors="coerce").fillna(0.0) - -def _boolean_float(values): - return pd.Series(np.asarray(values)).astype(bool).astype(float) - -def _text(values): - return pd.Series(np.asarray(values)).map( - lambda value: value.decode() if isinstance(value, (bytes, bytearray)) else str(value) - ) - -def _mapped_text(values, mapping): - return _text(values).map(mapping).fillna(0).astype(int) - -def _load_fallback(keys): - for key in keys: - if key in data: - return pd.Series(np.asarray(data[key])) - raise KeyError(f"Missing fallback keys {{keys}} in dataset payload") - -def _build_persons(): - if spec["builder_kind"] == "household_rows": - household_index = pd.Index(data[spec["household_index_key"]]) - person_households = pd.Index(data[spec["person_household_key"]]) - household_to_row = pd.Series( - np.arange(len(household_index), dtype=np.int64), - index=household_index, - ) - household_rows = household_to_row.loc[person_households].to_numpy() - persons = pd.DataFrame({{"household_id": person_households.to_numpy()}}) - if spec["person_id_key"] is not None: - 
persons["person_id"] = np.asarray(data[spec["person_id_key"]]) - for target, source in spec["direct_person_columns"].items(): - persons[target] = _numeric(data[source]) - for target, source in spec["boolean_person_columns"].items(): - persons[target] = _boolean_float(data[source]) - for target, source in spec["row_indexed_person_columns"].items(): - persons[target] = _numeric(np.asarray(data[source])[household_rows]) - for target, source in spec["mapped_row_person_columns"].items(): - persons[target] = _mapped_text( - np.asarray(data[source])[household_rows], - spec["mapped_value_tables"][target], - ) - elif spec["builder_kind"] == "single_person_households": - base_length = len(data[spec["length_source_key"]]) - if spec["generated_household_ids"]: - household_ids = np.arange(base_length, dtype=np.int64) + 1 - else: - household_ids = np.asarray(data[spec["household_index_key"]]) - persons = pd.DataFrame({{"household_id": household_ids}}) - if spec["person_id_from_household_id"]: - persons["person_id"] = persons["household_id"] - elif spec["person_id_key"] is not None: - persons["person_id"] = np.asarray(data[spec["person_id_key"]]) - for target, source in spec["direct_person_columns"].items(): - persons[target] = _numeric(data[source]) - for target, source in spec["boolean_person_columns"].items(): - persons[target] = _boolean_float(data[source]) - else: - raise ValueError(f"Unsupported dataset loader builder kind: {{spec['builder_kind']}}") - - for target, keys in spec["fallback_person_columns"].items(): - persons[target] = _numeric(_load_fallback(keys)) - if spec["sex_from_boolean_source"] is not None: - source = spec["sex_from_boolean_source"] - source_values = pd.Series(persons[source]).astype(bool).to_numpy() - persons["sex"] = np.where( - source_values, - spec["sex_true_value"], - spec["sex_false_value"], - ) - for target, source in spec["copy_person_columns"].items(): - persons[target] = persons[source] - for target, value in 
spec["constant_person_columns"].items(): - persons[target] = value - if spec["income_sum_columns"]: - persons["income"] = sum( - _numeric(persons[column]) for column in spec["income_sum_columns"] - ) - for column in spec["int_person_columns"]: - if column in persons.columns: - persons[column] = ( - pd.to_numeric(persons[column], errors="coerce") - .fillna(0) - .astype(int) - ) - persons["year"] = int(payload["year"]) - return persons - -persons = _build_persons() -households = ( - persons[ - ["household_id", "state_fips", "tenure", "weight", "year"] - ] - .rename(columns={{"weight": "household_weight"}}) - .drop_duplicates(subset=["household_id"]) -) - -if sample_n is not None and sample_n < len(households): - sampled = households.sample( - n=sample_n, - random_state=random_seed, - replace=False, - weights=households["household_weight"], - ).copy() - keep = set(sampled["household_id"]) - households = sampled.sort_values(["household_id"]).reset_index(drop=True) - persons = ( - persons[persons["household_id"].isin(keep)] - .sort_values(["household_id", "person_id"]) - .reset_index(drop=True) - ) -else: - households = households.sort_values(["household_id"]).reset_index(drop=True) - persons = persons.sort_values(["household_id", "person_id"]).reset_index(drop=True) - -with open(out_path, "wb") as handle: - pickle.dump({{"households": households, "persons": persons}}, handle) -""" - ) - - -def _decode_h5_values(values: np.ndarray) -> np.ndarray: - """Decode fixed-width HDF5 byte strings to ordinary Python strings.""" - if values.dtype.kind not in {"S", "O"}: - return values - return np.asarray( - [ - value.decode() if isinstance(value, (bytes, bytearray)) else value - for value in values - ] - ) - - -def _load_policyengine_us_data_h5_dataset( - *, - filename: str, - policyengine_us_data_repo: str | Path | None, - cache_dir: Path | None, -) -> dict[str, np.ndarray]: - if policyengine_us_data_repo is None: - h5_path = _download_policyengine_us_data_file( - 
filename=filename, - cache_dir=cache_dir, - ) - else: - repo_root = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - h5_path = repo_root / "policyengine_us_data" / "storage" / filename - if not h5_path.exists(): - raise FileNotFoundError(f"Missing PolicyEngine US-data H5 file: {h5_path}") - with h5py.File(h5_path, "r") as h5: - return {key: _decode_h5_values(np.asarray(h5[key])) for key in h5.keys()} - - -def _build_policyengine_dataset_tables_from_arrays( - *, - data: dict[str, np.ndarray], - dataset_loader: PEPolicyengineDatasetLoaderSpec, - year: int, -) -> DonorSurveyTables: - spec = asdict(dataset_loader) - - def _numeric(values): - return pd.to_numeric(pd.Series(np.asarray(values)), errors="coerce").fillna(0.0) - - def _boolean_float(values): - return pd.Series(np.asarray(values)).astype(bool).astype(float) - - def _text(values): - return pd.Series(_decode_h5_values(np.asarray(values))).astype(str) - - def _mapped_text(values, mapping): - return _text(values).map(mapping).fillna(0).astype(int) - - def _load_fallback(keys): - for key in keys: - if key in data: - return pd.Series(np.asarray(data[key])) - raise KeyError(f"Missing fallback keys {keys} in dataset payload") - - if spec["builder_kind"] == "household_rows": - household_index = pd.Index(data[spec["household_index_key"]]) - person_households = pd.Index(data[spec["person_household_key"]]) - household_to_row = pd.Series( - np.arange(len(household_index), dtype=np.int64), - index=household_index, - ) - household_rows = household_to_row.loc[person_households].to_numpy() - persons = pd.DataFrame({"household_id": person_households.to_numpy()}) - if spec["person_id_key"] is not None: - persons["person_id"] = np.asarray(data[spec["person_id_key"]]) - for target, source in spec["direct_person_columns"].items(): - persons[target] = _numeric(data[source]) - for target, source in spec["boolean_person_columns"].items(): - persons[target] = _boolean_float(data[source]) - for target, source in 
spec["row_indexed_person_columns"].items(): - persons[target] = _numeric(np.asarray(data[source])[household_rows]) - for target, source in spec["mapped_row_person_columns"].items(): - persons[target] = _mapped_text( - np.asarray(data[source])[household_rows], - spec["mapped_value_tables"][target], - ) - elif spec["builder_kind"] == "single_person_households": - base_length = len(data[spec["length_source_key"]]) - if spec["generated_household_ids"]: - household_ids = np.arange(base_length, dtype=np.int64) + 1 - else: - household_ids = np.asarray(data[spec["household_index_key"]]) - persons = pd.DataFrame({"household_id": household_ids}) - if spec["person_id_from_household_id"]: - persons["person_id"] = persons["household_id"] - elif spec["person_id_key"] is not None: - persons["person_id"] = np.asarray(data[spec["person_id_key"]]) - for target, source in spec["direct_person_columns"].items(): - persons[target] = _numeric(data[source]) - for target, source in spec["boolean_person_columns"].items(): - persons[target] = _boolean_float(data[source]) - else: - raise ValueError( - f"Unsupported dataset loader builder kind: {spec['builder_kind']}" - ) - - for target, keys in spec["fallback_person_columns"].items(): - persons[target] = _numeric(_load_fallback(keys)) - if spec["sex_from_boolean_source"] is not None: - source = spec["sex_from_boolean_source"] - source_values = pd.Series(persons[source]).astype(bool).to_numpy() - persons["sex"] = np.where( - source_values, - spec["sex_true_value"], - spec["sex_false_value"], - ) - for target, source in spec["copy_person_columns"].items(): - persons[target] = persons[source] - for target, value in spec["constant_person_columns"].items(): - persons[target] = value - if spec["income_sum_columns"]: - persons["income"] = sum( - _numeric(persons[column]) for column in spec["income_sum_columns"] - ) - for column in spec["int_person_columns"]: - if column in persons.columns: - persons[column] = ( - pd.to_numeric(persons[column], 
errors="coerce").fillna(0).astype(int) - ) - persons["year"] = int(year) - households = ( - persons[["household_id", "state_fips", "tenure", "weight", "year"]] - .rename(columns={"weight": "household_weight"}) - .drop_duplicates(subset=["household_id"]) - ) - return DonorSurveyTables( - households=households.sort_values(["household_id"]).reset_index(drop=True), - persons=persons.sort_values(["household_id", "person_id"]).reset_index( - drop=True - ), - ) - - -def _run_policyengine_dataset_loader( - *, - script: str, - sample_n: int | None, - random_seed: int, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - resolved_python = resolve_policyengine_us_data_python( - policyengine_us_data_python, - repo_root=resolved_repo, - ) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - with tempfile.TemporaryDirectory(prefix="microplex-us-donor-") as tempdir: - payload_path = Path(tempdir) / "tables.pkl" - subprocess.run( - [ - str(resolved_python), - "-c", - script, - str(payload_path), - "None" if sample_n is None else str(int(sample_n)), - str(int(random_seed)), - ], - check=True, - cwd=resolved_repo, - env=env, - ) - with payload_path.open("rb") as handle: - payload = pickle.load(handle) - return DonorSurveyTables( - households=payload["households"], - persons=payload["persons"], - ) - - -def _run_policyengine_dataset_loader_from_spec( - *, - spec: PESourceImputeBlockSpec, - year: int, - sample_n: int | None, - random_seed: int, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - dataset_loader = spec.dataset_loader - if dataset_loader is None: - raise ValueError( - f"PE source-impute block '{spec.key}' is missing a dataset loader spec" - ) - return _run_policyengine_dataset_loader( - 
script=_build_policyengine_dataset_loader_script(dataset_loader, year=year), - sample_n=sample_n, - random_seed=random_seed, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - - -def _default_acs_tables_loader( - *, - year: int, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, - cache_dir: Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - spec = get_pe_source_impute_block_spec("acs") - if int(year) != spec.default_year and policyengine_us_data_repo is None: - raise ValueError( - f"{spec.descriptor_name} provider supports non-default years only " - "when policyengine_us_data_repo is provided" - ) - if int(year) == spec.default_year: - tables = _run_policyengine_dataset_loader_from_spec( - spec=spec, - year=year, - sample_n=None if state_floor else sample_n, - random_seed=random_seed, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - else: - dataset_loader = spec.dataset_loader - if dataset_loader is None: - raise ValueError( - f"PE source-impute block '{spec.key}' is missing a dataset loader spec" - ) - data = _load_policyengine_us_data_h5_dataset( - filename=f"acs_{int(year)}.h5", - policyengine_us_data_repo=policyengine_us_data_repo, - cache_dir=cache_dir, - ) - tables = _build_policyengine_dataset_tables_from_arrays( - data=data, - dataset_loader=dataset_loader, - year=year, - ) - households = ( - tables.households.drop_duplicates(subset=["household_id"]) - .sort_values(["household_id"]) - .reset_index(drop=True) - ) - persons = ( - tables.persons[ - tables.persons["household_id"].isin(set(households["household_id"])) - ] - .sort_values(["household_id", "person_id"]) - .reset_index(drop=True) - ) - households, persons = _sample_households_and_persons( - 
households=households, - persons=persons, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _default_scf_tables_loader( - *, - year: int, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, - cache_dir: Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - _ = cache_dir - spec = get_pe_source_impute_block_spec("scf") - if int(year) != spec.default_year: - raise ValueError( - f"{spec.descriptor_name} provider currently supports year={spec.default_year} only" - ) - tables = _run_policyengine_dataset_loader_from_spec( - spec=spec, - year=year, - sample_n=None if state_floor else sample_n, - random_seed=random_seed, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - households, persons = _sample_households_and_persons( - households=tables.households, - persons=tables.persons, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _download_policyengine_us_data_file( - *, - filename: str, - cache_dir: Path | None, -) -> Path: - if cache_dir is None: - cache_dir = Path.home() / ".cache" / "microplex" - cache_dir.mkdir(parents=True, exist_ok=True) - destination = cache_dir / filename - if destination.exists(): - return destination - if not HF_AVAILABLE: - raise ImportError("huggingface_hub required: pip install huggingface_hub") - downloaded = hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename=filename, - repo_type="model", - local_dir=cache_dir, - ) - return Path(downloaded) - - -def _load_policyengine_uprating_factors( - *, - 
policyengine_us_data_repo: str | Path | None, - cache_dir: Path | None, -) -> pd.DataFrame: - if policyengine_us_data_repo is None: - factors_path = _download_policyengine_us_data_file( - filename="uprating_factors.csv", - cache_dir=cache_dir, - ) - else: - repo_root = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - factors_path = ( - repo_root / "policyengine_us_data" / "storage" / "uprating_factors.csv" - ) - if not factors_path.exists(): - raise FileNotFoundError( - f"Missing PolicyEngine US-data uprating factors: {factors_path}" - ) - return pd.read_csv(factors_path).set_index("Variable") - - -def _uprate_donor_tables_to_target_year( - tables: DonorSurveyTables, - *, - spec: PESourceImputeBlockSpec, - source_year: int, - target_year: int | None, - policyengine_us_data_repo: str | Path | None, - cache_dir: str | Path | None, -) -> DonorSurveyTables: - if ( - target_year is None - or int(target_year) == int(source_year) - or spec.survey_name not in TARGET_YEAR_UPRATED_SURVEYS - ): - return tables - - resolved_cache_dir = None if cache_dir is None else Path(cache_dir) - factors = _load_policyengine_uprating_factors( - policyengine_us_data_repo=policyengine_us_data_repo, - cache_dir=resolved_cache_dir, - ) - start_column = str(int(source_year)) - end_column = str(int(target_year)) - if start_column not in factors.columns or end_column not in factors.columns: - raise ValueError( - "PolicyEngine US-data uprating factors do not cover " - f"{source_year}->{target_year}" - ) - - persons = tables.persons.copy() - for column in persons.columns: - if column in DONOR_UPRATING_EXCLUDED_COLUMNS: - continue - factor_name = DONOR_UPRATING_FACTOR_ALIASES.get(column, column) - if factor_name not in factors.index: - continue - start = float(factors.loc[factor_name, start_column]) - end = float(factors.loc[factor_name, end_column]) - if start == 0: - raise ValueError(f"Zero uprating base for {factor_name} in {source_year}") - persons[column] = 
pd.to_numeric(persons[column], errors="coerce").fillna( - 0.0 - ) * (end / start) - - if "year" in persons.columns: - persons["year"] = int(target_year) - households = tables.households.copy() - if "year" in households.columns: - households["year"] = int(target_year) - return DonorSurveyTables(households=households, persons=persons) - - -def _build_joined_raw_identifier( - frame: pd.DataFrame, - *, - parts: tuple[str, ...], -) -> pd.Series: - if not parts: - raise ValueError("Raw identifier spec must include at least one part") - values = frame.loc[:, list(parts)].astype(str) - return values.iloc[:, 0] if len(parts) == 1 else values.agg(":".join, axis=1) - - -def _load_sipp_tables_from_spec( - *, - spec: PESourceImputeBlockSpec, - year: int, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, - cache_dir: Path | None, -) -> DonorSurveyTables: - raw_loader = spec.raw_loader - if raw_loader is None: - raise ValueError( - f"PE source-impute block '{spec.key}' is missing a raw loader spec" - ) - if int(year) != spec.default_year: - raise ValueError( - f"{spec.descriptor_name} provider currently supports year={spec.default_year} only" - ) - sipp_path = _download_policyengine_us_data_file( - filename=raw_loader.filename, - cache_dir=cache_dir, - ) - read_csv_kwargs: dict[str, object] = {} - if raw_loader.delimiter is not None: - read_csv_kwargs["delimiter"] = raw_loader.delimiter - if raw_loader.usecols: - read_csv_kwargs["usecols"] = raw_loader.usecols - df = pd.read_csv(sipp_path, **read_csv_kwargs) - - for variable, source_column in raw_loader.direct_columns.items(): - values = pd.to_numeric(df[source_column], errors="coerce").fillna(0.0) - if variable in set(raw_loader.int_columns): - df[variable] = values.astype(int) - else: - df[variable] = values.astype(float) - for variable, contains in raw_loader.sum_columns_contains.items(): - matched_columns = [column for column in df.columns if contains in 
column] - df[variable] = ( - df[matched_columns].fillna(0).sum(axis=1) if matched_columns else 0.0 - ) - for variable, indicator in raw_loader.indicator_columns.items(): - raw_values = pd.to_numeric(df[indicator.column], errors="coerce").fillna(0.0) - df[variable] = raw_values.eq(indicator.equals).astype(float) - for variable, value in raw_loader.constant_columns.items(): - df[variable] = value - - df["year"] = int(year) - df["household_id"] = _build_joined_raw_identifier( - df, - parts=raw_loader.household_id_parts, - ) - df["person_id"] = _build_joined_raw_identifier( - df, - parts=raw_loader.person_id_parts, - ) - for variable, source_variable in raw_loader.copy_columns.items(): - df[variable] = df[source_variable] - - df = apply_pe_source_impute_loader_postprocess(df, spec) - for variable, source_variable in raw_loader.copy_columns.items(): - if source_variable in df.columns: - df[variable] = df[source_variable] - households = ( - df[["household_id", "weight", "state_fips", "tenure", "year"]] - .rename(columns={"weight": "household_weight"}) - .drop_duplicates(subset=["household_id"]) - .reset_index(drop=True) - ) - persons = df[ - [ - "person_id", - "household_id", - *spec.person_variables, - "weight", - "year", - ] - ].copy() - households, persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _default_sipp_tips_tables_loader( - *, - year: int, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, - cache_dir: Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - _ = policyengine_us_data_repo, policyengine_us_data_python - return _load_sipp_tables_from_spec( - 
spec=get_pe_source_impute_block_spec("sipp_tips"), - year=year, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - cache_dir=cache_dir, - ) - - -def _default_sipp_assets_tables_loader( - *, - year: int, - sample_n: int | None, - random_seed: int, - state_floor: int | None = None, - state_age_floor: int | None = None, - cache_dir: Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> DonorSurveyTables: - _ = policyengine_us_data_repo, policyengine_us_data_python - return _load_sipp_tables_from_spec( - spec=get_pe_source_impute_block_spec("sipp_assets"), - year=year, - sample_n=sample_n, - random_seed=random_seed, - state_floor=state_floor, - state_age_floor=state_age_floor, - cache_dir=cache_dir, - ) - - -BLOCK_LOADERS: dict[str, DonorSurveyTablesLoader] = { - "acs": _default_acs_tables_loader, - "sipp_tips": _default_sipp_tips_tables_loader, - "sipp_assets": _default_sipp_assets_tables_loader, - "scf": _default_scf_tables_loader, -} - - -DonorSurveyProviderSpec = PESourceImputeBlockSpec - - -def _default_loader_for_spec(spec: PESourceImputeBlockSpec) -> DonorSurveyTablesLoader: - return BLOCK_LOADERS[spec.key] - - -def resolve_sipp_donor_survey_spec(block: str) -> DonorSurveyProviderSpec: - return resolve_sipp_source_impute_block_spec(block) - - -class DonorSurveySourceProvider: - """Generic source provider for one donor survey block.""" - - def __init__( - self, - *, - spec: DonorSurveyProviderSpec, - year: int | None = None, - cache_dir: str | Path | None = None, - shareability: Shareability = Shareability.PUBLIC, - loader: DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - self.spec = spec - self.year = int(spec.default_year if year is None else year) - self.target_year = 
None if target_year is None else int(target_year) - self.cache_dir = None if cache_dir is None else Path(cache_dir) - self.shareability = shareability - self.loader = loader - self.policyengine_us_data_repo = policyengine_us_data_repo - self.policyengine_us_data_python = policyengine_us_data_python - self._descriptor_cache: SourceDescriptor | None = None - - @property - def descriptor(self) -> SourceDescriptor: - if self._descriptor_cache is not None: - return self._descriptor_cache - return _build_static_descriptor( - spec=self.spec, - shareability=self.shareability, - ) - - def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame: - query = query or SourceQuery() - provider_filters = query.provider_filters - loader = self.loader or _default_loader_for_spec(self.spec) - year = int(provider_filters.get("year", self.year)) - target_year = provider_filters.get("target_year", self.target_year) - resolved_target_year = None if target_year is None else int(target_year) - cache_dir = provider_filters.get("cache_dir", self.cache_dir) - policyengine_us_data_repo = provider_filters.get( - "policyengine_us_data_repo", - self.policyengine_us_data_repo, - ) - tables = loader( - year=year, - sample_n=provider_filters.get("sample_n"), - random_seed=int(provider_filters.get("random_seed", 0)), - state_floor=provider_filters.get("state_floor"), - state_age_floor=provider_filters.get("state_age_floor"), - cache_dir=cache_dir, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=provider_filters.get( - "policyengine_us_data_python", - self.policyengine_us_data_python, - ), - ) - tables = _uprate_donor_tables_to_target_year( - tables, - spec=self.spec, - source_year=year, - target_year=resolved_target_year, - policyengine_us_data_repo=policyengine_us_data_repo, - cache_dir=cache_dir, - ) - frame = _build_observation_frame( - households=tables.households, - persons=tables.persons, - source_name=self.spec.source_name(year), - 
shareability=self.shareability, - archetype=self.spec.archetype, - ) - self._descriptor_cache = frame.source - return apply_source_query(frame, query) - - -class ACSSourceProvider(DonorSurveySourceProvider): - """PolicyEngine-aligned ACS donor provider.""" - - def __init__( - self, - *, - year: int = get_pe_source_impute_block_spec("acs").default_year, - shareability: Shareability = Shareability.PUBLIC, - loader: DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - super().__init__( - spec=get_pe_source_impute_block_spec("acs"), - year=year, - shareability=shareability, - loader=loader, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=target_year, - ) - - -class SIPPSourceProvider(DonorSurveySourceProvider): - """PolicyEngine-aligned SIPP donor provider with block-level specs.""" - - def __init__( - self, - *, - block: str, - year: int | None = None, - cache_dir: str | Path | None = None, - shareability: Shareability = Shareability.PUBLIC, - loader: DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - self.block = block - super().__init__( - spec=resolve_sipp_donor_survey_spec(block), - year=year, - cache_dir=cache_dir, - shareability=shareability, - loader=loader, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=target_year, - ) - - -class SIPPTipsSourceProvider(SIPPSourceProvider): - """Backward-compatible alias for the SIPP tips donor block.""" - - def __init__( - self, - *, - year: int | None = None, - cache_dir: str | Path | None = None, - shareability: Shareability = Shareability.PUBLIC, - loader: 
DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - super().__init__( - block="tips", - year=year, - cache_dir=cache_dir, - shareability=shareability, - loader=loader, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=target_year, - ) - - -class SIPPAssetsSourceProvider(SIPPSourceProvider): - """Backward-compatible alias for the SIPP asset donor block.""" - - def __init__( - self, - *, - year: int | None = None, - cache_dir: str | Path | None = None, - shareability: Shareability = Shareability.PUBLIC, - loader: DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - super().__init__( - block="assets", - year=year, - cache_dir=cache_dir, - shareability=shareability, - loader=loader, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=target_year, - ) - - -class SCFSourceProvider(DonorSurveySourceProvider): - """PolicyEngine-aligned SCF donor provider.""" - - def __init__( - self, - *, - year: int = get_pe_source_impute_block_spec("scf").default_year, - shareability: Shareability = Shareability.PUBLIC, - loader: DonorSurveyTablesLoader | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_year: int | None = None, - ) -> None: - super().__init__( - spec=get_pe_source_impute_block_spec("scf"), - year=year, - shareability=shareability, - loader=loader, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=target_year, - ) diff --git 
a/src/microplex_us/data_sources/family_imputation_benchmark.py b/src/microplex_us/data_sources/family_imputation_benchmark.py deleted file mode 100644 index 9aa38db3..00000000 --- a/src/microplex_us/data_sources/family_imputation_benchmark.py +++ /dev/null @@ -1,1695 +0,0 @@ -"""Holdout benchmarks for decomposable-family imputers.""" - -from __future__ import annotations - -from collections.abc import Callable -from dataclasses import asdict, dataclass, fields -from typing import Any - -import numpy as np -import pandas as pd -from microplex.calibration import Calibrator -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor - -from microplex_us.data_sources.share_imputation import ( - fit_grouped_share_model, - predict_grouped_component_shares, -) - - -@dataclass(frozen=True) -class DecomposableFamilyBenchmarkSpec: - """Describe a decomposable-family holdout benchmark.""" - - total_column: str - component_columns: tuple[str, ...] - grouped_feature_sets: tuple[tuple[str, ...], ...] - qrf_condition_vars: tuple[str, ...] - implicit_component_column: str | None = None - weight_column: str = "weight" - group_eval_columns: tuple[str, ...] = () - qrf_n_estimators: int = 100 - forest_condition_vars: tuple[str, ...] = () - forest_n_estimators: int = 200 - forest_min_samples_leaf: int = 5 - support_gate_probability_threshold: float = 0.2 - forest_share_min_component_share: float = 0.05 - qrf_support_augmentation_max_extra_components: int = 1 - reweight_feature_sets: tuple[tuple[str, ...], ...] 
= () - reweight_method: str = "ipf" - reweight_max_iter: int = 100 - reweight_tol: float = 1e-6 - reweight_initial_weight_mode: str = "observed" - - @property - def explicit_component_columns(self) -> tuple[str, ...]: - if self.implicit_component_column is None: - return self.component_columns - return tuple( - column - for column in self.component_columns - if column != self.implicit_component_column - ) - - -@dataclass(frozen=True) -class FamilyImputationMethodBenchmark: - """Aggregate benchmark metrics for one imputation method.""" - - component_total_relative_error: dict[str, float] - component_support_relative_error: dict[str, float] - component_group_sum_mare: dict[str, float] - mean_component_total_relative_error: float - mean_component_support_relative_error: float - mean_component_group_sum_mare: float | None - pre_target_component_total_relative_error: dict[str, float] | None = None - pre_target_mean_component_total_relative_error: float | None = None - post_reweight_component_total_relative_error: dict[str, float] | None = None - post_reweight_component_support_relative_error: dict[str, float] | None = None - post_reweight_component_group_sum_mare: dict[str, float] | None = None - post_reweight_mean_component_total_relative_error: float | None = None - post_reweight_mean_component_support_relative_error: float | None = None - post_reweight_mean_component_group_sum_mare: float | None = None - post_reweight_total_error_lift: dict[str, float] | None = None - post_reweight_mean_component_total_error_lift: float | None = None - oracle_pre_target_component_total_relative_error: dict[str, float] | None = None - oracle_pre_target_mean_component_total_relative_error: float | None = None - oracle_post_reweight_component_total_relative_error: dict[str, float] | None = None - oracle_post_reweight_mean_component_total_relative_error: float | None = None - oracle_post_reweight_total_error_lift: dict[str, float] | None = None - 
oracle_post_reweight_mean_component_total_error_lift: float | None = None - post_reweight_excess_over_oracle_total_error: dict[str, float] | None = None - post_reweight_mean_component_total_error_excess_over_oracle: float | None = None - reweighting_summary: dict[str, Any] | None = None - repeat_metric_summary: dict[str, dict[str, float]] | None = None - - -@dataclass(frozen=True) -class FamilyImputationBenchmarkResult: - """Comparable holdout metrics for a decomposable family.""" - - spec: DecomposableFamilyBenchmarkSpec - row_count: int - train_row_count: int - eval_row_count: int - target_row_count: int - methods: dict[str, FamilyImputationMethodBenchmark] - repeat_count: int = 1 - split_seeds: tuple[int, ...] = () - repeat_summaries: tuple[dict[str, Any], ...] = () - - def to_dict(self) -> dict[str, Any]: - payload = asdict(self) - payload["spec"] = asdict(self.spec) - return payload - - -def _numeric_series(frame: pd.DataFrame, column: str) -> pd.Series: - if column not in frame.columns: - return pd.Series(0.0, index=frame.index, dtype=float) - return pd.to_numeric(frame[column], errors="coerce").fillna(0.0).astype(float) - - -def _median_or_none(values: list[float | None]) -> float | None: - numeric = [float(value) for value in values if value is not None] - if not numeric: - return None - return float(np.median(numeric)) - - -def _max_or_none(values: list[float | None]) -> float | None: - numeric = [float(value) for value in values if value is not None] - if not numeric: - return None - return float(max(numeric)) - - -def _aggregate_numeric_dicts( - dicts: list[dict[str, float] | None], -) -> dict[str, float] | None: - keys = sorted({key for mapping in dicts if mapping is not None for key in mapping}) - if not keys: - return None - aggregated = { - key: _median_or_none( - [ - None if mapping is None else mapping.get(key) - for mapping in dicts - ] - ) - for key in keys - } - return {key: float(value) for key, value in aggregated.items() if value is not None} 
- - -def _build_positive_family_frame( - reference: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> pd.DataFrame: - required_columns = { - spec.total_column, - spec.weight_column, - *spec.component_columns, - *spec.qrf_condition_vars, - *spec.group_eval_columns, - } - for feature_set in spec.grouped_feature_sets: - required_columns.update(feature_set) - frame = reference.loc[:, list(required_columns)].copy() - frame[spec.total_column] = _numeric_series(frame, spec.total_column) - frame[spec.weight_column] = _numeric_series(frame, spec.weight_column).clip(lower=0.0) - for column in spec.component_columns: - frame[column] = _numeric_series(frame, column).clip(lower=0.0) - positive_mask = frame[spec.total_column] > 0.0 - frame = frame.loc[positive_mask].copy() - if frame.empty: - raise ValueError("Benchmark requires at least one positive family-total row") - return frame.reset_index(drop=True) - - -def _encode_condition_frames( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - condition_columns: tuple[str, ...], -) -> tuple[pd.DataFrame, pd.DataFrame]: - encoded_train = pd.DataFrame(index=train_frame.index) - encoded_test = pd.DataFrame(index=test_frame.index) - for column in condition_columns: - train_series = train_frame[column] - test_series = test_frame[column] - if pd.api.types.is_numeric_dtype(train_series): - encoded_train[column] = pd.to_numeric(train_series, errors="coerce").fillna(0.0) - encoded_test[column] = pd.to_numeric(test_series, errors="coerce").fillna(0.0) - continue - train_values = train_series.astype("string").fillna("__MISSING__") - test_values = test_series.astype("string").fillna("__MISSING__") - categories = pd.Index(train_values.unique(), dtype="object") - encoded_train[column] = pd.Categorical(train_values, categories=categories).codes.astype(float) - encoded_test[column] = pd.Categorical(test_values, categories=categories).codes.astype(float) - return encoded_train, encoded_test - - -def 
_split_train_eval_target( - frame: pd.DataFrame, - *, - train_frac: float, - target_frac: float, - random_seed: int, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - if not 0.0 < train_frac < 1.0: - raise ValueError("train_frac must be between 0 and 1") - if not 0.0 < target_frac < 1.0: - raise ValueError("target_frac must be between 0 and 1") - if train_frac + target_frac >= 1.0: - raise ValueError("train_frac + target_frac must leave room for eval rows") - if len(frame) < 3: - raise ValueError("Benchmark requires at least three rows to create train/eval/target splits") - rng = np.random.default_rng(random_seed) - shuffled = frame.iloc[rng.permutation(len(frame))].reset_index(drop=True) - n_rows = len(shuffled) - n_train = max(1, int(np.floor(n_rows * train_frac))) - n_target = max(1, int(np.floor(n_rows * target_frac))) - if n_train + n_target >= n_rows: - n_target = max(1, n_rows - n_train - 1) - if n_train + n_target >= n_rows: - n_train = max(1, n_rows - n_target - 1) - train_frame = shuffled.iloc[:n_train].reset_index(drop=True) - target_frame = shuffled.iloc[n_train : n_train + n_target].reset_index(drop=True) - eval_frame = shuffled.iloc[n_train + n_target :].reset_index(drop=True) - if eval_frame.empty: - raise ValueError("Benchmark split produced no eval rows") - return train_frame, eval_frame, target_frame - - -def _weighted_component_totals( - frame: pd.DataFrame, - *, - component_columns: tuple[str, ...], - weight_column: str, -) -> dict[str, float]: - weights = _numeric_series(frame, weight_column) - return { - column: float((_numeric_series(frame, column) * weights).sum()) - for column in component_columns - } - - -def _weighted_component_support( - frame: pd.DataFrame, - *, - component_columns: tuple[str, ...], - weight_column: str, -) -> dict[str, float]: - weights = _numeric_series(frame, weight_column) - return { - column: float(weights[_numeric_series(frame, column) > 0.0].sum()) - for column in component_columns - } - - -def 
_relative_error(candidate: float, baseline: float) -> float: - baseline_abs = abs(float(baseline)) - if baseline_abs <= 1e-9: - return 0.0 if abs(float(candidate)) <= 1e-9 else 1.0 - return float(abs(float(candidate) - float(baseline)) / baseline_abs) - - -def _component_group_sum_mare( - actual: pd.DataFrame, - predicted: pd.DataFrame, - *, - component_columns: tuple[str, ...], - weight_column: str, - group_columns: tuple[str, ...], -) -> dict[str, float]: - if not group_columns: - return {} - actual_work = actual.loc[:, [*group_columns, weight_column, *component_columns]].copy() - predicted_work = predicted.loc[:, [*group_columns, weight_column, *component_columns]].copy() - actual_work[weight_column] = _numeric_series(actual_work, weight_column) - predicted_work[weight_column] = _numeric_series(predicted_work, weight_column) - - results: dict[str, float] = {} - for column in component_columns: - actual_work[f"__{column}_weighted"] = ( - _numeric_series(actual_work, column) * actual_work[weight_column] - ) - predicted_work[f"__{column}_weighted"] = ( - _numeric_series(predicted_work, column) * predicted_work[weight_column] - ) - actual_grouped = ( - actual_work.groupby(list(group_columns), dropna=False, observed=False)[ - f"__{column}_weighted" - ] - .sum() - .reset_index(name="actual") - ) - predicted_grouped = ( - predicted_work.groupby(list(group_columns), dropna=False, observed=False)[ - f"__{column}_weighted" - ] - .sum() - .reset_index(name="predicted") - ) - merged = actual_grouped.merge( - predicted_grouped, - on=list(group_columns), - how="outer", - sort=False, - ).fillna(0.0) - if merged.empty: - results[column] = 0.0 - continue - errors = [ - _relative_error(row.predicted, row.actual) - for row in merged.itertuples(index=False) - ] - results[column] = float(np.mean(errors)) if errors else 0.0 - return results - - -def _combined_categorical_column( - frame: pd.DataFrame, - feature_set: tuple[str, ...], -) -> pd.Series: - if len(feature_set) == 1: - 
return frame[feature_set[0]].astype("string").fillna("__MISSING__") - combined = frame.loc[:, list(feature_set)].astype("string").fillna("__MISSING__") - return combined.agg("||".join, axis=1) - - -def _build_reweighting_targets( - frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - required_categories_frame: pd.DataFrame | None = None, -) -> tuple[pd.DataFrame, dict[str, dict[str, float]]]: - if not spec.reweight_feature_sets: - return frame.copy(), {} - prepared = frame.copy() - required_prepared = ( - required_categories_frame.copy() if required_categories_frame is not None else None - ) - targets: dict[str, dict[str, float]] = {} - weights = _numeric_series(prepared, spec.weight_column) - for feature_set in spec.reweight_feature_sets: - target_column = "__reweight__" + "__".join(feature_set) - prepared[target_column] = _combined_categorical_column(prepared, feature_set) - if required_prepared is not None: - required_prepared[target_column] = _combined_categorical_column( - required_prepared, - feature_set, - ) - grouped = ( - pd.DataFrame({target_column: prepared[target_column], "__weight": weights}) - .groupby(target_column, dropna=False, observed=False)["__weight"] - .sum() - ) - target_values = { - str(category): float(total) - for category, total in grouped.items() - } - if required_prepared is not None: - required_categories = ( - required_prepared[target_column] - .astype("string") - .fillna("__MISSING__") - .unique() - .tolist() - ) - for category in required_categories: - target_values.setdefault(str(category), 0.0) - targets[target_column] = target_values - return prepared, targets - - -def _apply_reweighting( - target_frame: pd.DataFrame, - eval_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> tuple[pd.DataFrame, dict[str, Any]] | None: - if not spec.reweight_feature_sets: - return None - target_prepared, targets = _build_reweighting_targets( - target_frame, - spec=spec, - required_categories_frame=eval_frame, 
- ) - eval_prepared, _ = _build_reweighting_targets(eval_frame, spec=spec) - if spec.reweight_initial_weight_mode == "uniform": - eval_prepared[spec.weight_column] = 1.0 - elif spec.reweight_initial_weight_mode != "observed": - raise ValueError( - "reweight_initial_weight_mode must be 'observed' or 'uniform'" - ) - initial_weights = _numeric_series(eval_prepared, spec.weight_column).copy() - calibrator = Calibrator( - method=spec.reweight_method, - tol=spec.reweight_tol, - max_iter=spec.reweight_max_iter, - ) - reweighted = calibrator.fit_transform( - eval_prepared, - marginal_targets=targets, - weight_col=spec.weight_column, - ) - validation = calibrator.validate(eval_prepared, weight_col=spec.weight_column) - final_weights = _numeric_series(reweighted, spec.weight_column) - denom = initial_weights.abs().clip(lower=1e-9) - relative_change = (final_weights - initial_weights).abs() / denom - keep_columns = [spec.weight_column, *spec.group_eval_columns, *spec.component_columns] - return reweighted.loc[:, keep_columns].copy(), { - "converged": bool(validation["converged"]), - "max_error": float(validation["max_error"]), - "n_iterations": int(calibrator.n_iterations_), - "target_count": int(sum(len(values) for values in targets.values())), - "target_columns": sorted(targets.keys()), - "method": spec.reweight_method, - "initial_weight_mode": spec.reweight_initial_weight_mode, - "target_row_count": int(len(target_prepared)), - "eval_row_count": int(len(eval_prepared)), - "initial_total_weight": float(initial_weights.sum()), - "final_total_weight": float(final_weights.sum()), - "mean_abs_relative_weight_change": float(relative_change.mean()), - "max_abs_relative_weight_change": float(relative_change.max()), - "share_rows_changed_gt_1pct": float((relative_change > 0.01).mean()), - } - - -def _overall_component_shares( - train_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> dict[str, float]: - totals = _weighted_component_totals( - train_frame, - 
component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - total_sum = sum(totals.values()) - if total_sum <= 1e-9: - uniform = 1.0 / len(spec.component_columns) - return {column: uniform for column in spec.component_columns} - return {column: value / total_sum for column, value in totals.items()} - - -def _component_share_targets( - frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> pd.DataFrame: - total = _numeric_series(frame, spec.total_column) - safe_total = total.where(total > 0.0, 1.0) - shares = pd.DataFrame(index=frame.index) - for column in spec.component_columns: - shares[column] = (_numeric_series(frame, column) / safe_total).clip(lower=0.0) - row_sum = shares.sum(axis=1) - overfull = row_sum > 1.0 - if overfull.any(): - shares.loc[overfull, list(spec.component_columns)] = shares.loc[ - overfull, - list(spec.component_columns), - ].div(row_sum.loc[overfull], axis=0) - shares.loc[total <= 0.0, list(spec.component_columns)] = 0.0 - return shares.loc[:, list(spec.component_columns)] - - -def _component_support_targets( - frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> pd.DataFrame: - support = pd.DataFrame(index=frame.index) - for column in spec.component_columns: - support[column] = (_numeric_series(frame, column) > 0.0).astype(int) - return support.loc[:, list(spec.component_columns)] - - -def _fit_support_probability_model( - encoded_train: pd.DataFrame, - encoded_test: pd.DataFrame, - *, - target: pd.Series, - sample_weight: pd.Series, - n_estimators: int, - min_samples_leaf: int, - random_seed: int, -) -> np.ndarray: - target = pd.to_numeric(target, errors="coerce").fillna(0.0).astype(int) - unique_values = sorted(set(target.tolist())) - if len(unique_values) <= 1: - return np.full(len(encoded_test), float(unique_values[0] if unique_values else 0.0)) - - classifier = RandomForestClassifier( - n_estimators=n_estimators, - min_samples_leaf=min_samples_leaf, - 
random_state=random_seed, - n_jobs=-1, - class_weight="balanced_subsample", - ) - classifier.fit( - encoded_train.to_numpy(dtype=float), - target.to_numpy(dtype=int), - sample_weight=sample_weight.to_numpy(dtype=float), - ) - classes = classifier.classes_.tolist() - positive_index = classes.index(1) - probabilities = classifier.predict_proba(encoded_test.to_numpy(dtype=float)) - return probabilities[:, positive_index] - - -def _predict_active_component_counts( - encoded_train: pd.DataFrame, - encoded_test: pd.DataFrame, - *, - support_targets: pd.DataFrame, - sample_weight: pd.Series, - n_estimators: int, - min_samples_leaf: int, - random_seed: int, -) -> np.ndarray: - active_counts = support_targets.sum(axis=1).clip(lower=1).astype(int) - unique_values = sorted(set(active_counts.tolist())) - if len(unique_values) <= 1: - return np.full( - len(encoded_test), - int(unique_values[0] if unique_values else 1), - dtype=int, - ) - - classifier = RandomForestClassifier( - n_estimators=n_estimators, - min_samples_leaf=min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - class_weight="balanced_subsample", - ) - classifier.fit( - encoded_train.to_numpy(dtype=float), - active_counts.to_numpy(dtype=int), - sample_weight=sample_weight.to_numpy(dtype=float), - ) - return classifier.predict(encoded_test.to_numpy(dtype=float)).astype(int) - - -def _normalize_share_predictions( - shares: pd.DataFrame, - *, - component_columns: tuple[str, ...], - fallback_shares: dict[str, float], -) -> pd.DataFrame: - result = pd.DataFrame(index=shares.index) - for column in component_columns: - result[column] = _numeric_series(shares, column).clip(lower=0.0) - row_sum = result.sum(axis=1) - positive = row_sum > 0.0 - if positive.any(): - result.loc[positive, list(component_columns)] = result.loc[ - positive, - list(component_columns), - ].div(row_sum.loc[positive], axis=0) - zero_rows = ~positive - if zero_rows.any(): - for column in component_columns: - result.loc[zero_rows, column] = 
float(fallback_shares[column]) - return result.loc[:, list(component_columns)] - - -def _sparsify_normalized_share_predictions( - shares: pd.DataFrame, - *, - component_columns: tuple[str, ...], - min_component_share: float, -) -> pd.DataFrame: - sparsified = pd.DataFrame(0.0, index=shares.index, columns=list(component_columns)) - for index in shares.index: - share_row = ( - shares.loc[index, list(component_columns)] - .astype(float) - .clip(lower=0.0) - ) - selected = share_row[share_row >= min_component_share].index - if len(selected) == 0: - selected = share_row.sort_values(ascending=False).index[:1] - selected_scores = share_row.loc[selected] - if float(selected_scores.sum()) <= 0.0: - selected_scores = pd.Series(0.0, index=selected, dtype=float) - selected_scores.iloc[0] = 1.0 - sparsified.loc[index, selected] = selected_scores.to_numpy(dtype=float) - - return _normalize_share_predictions( - sparsified, - component_columns=component_columns, - fallback_shares={ - column: 1.0 / len(component_columns) - for column in component_columns - }, - ) - - -def _mask_share_predictions_to_supported_components( - predicted_shares: pd.DataFrame, - support_probabilities: pd.DataFrame, - predicted_active_counts: np.ndarray, - *, - component_columns: tuple[str, ...], - support_gate_probability_threshold: float, -) -> pd.DataFrame: - masked = pd.DataFrame(0.0, index=predicted_shares.index, columns=list(component_columns)) - component_count = len(component_columns) - - for position, index in enumerate(predicted_shares.index): - share_row = ( - predicted_shares.loc[index, list(component_columns)] - .astype(float) - .clip(lower=0.0) - ) - probability_row = ( - support_probabilities.loc[index, list(component_columns)] - .astype(float) - .clip(lower=0.0) - ) - desired_count = int(np.clip(predicted_active_counts[position], 1, component_count)) - confident_count = int((probability_row >= support_gate_probability_threshold).sum()) - keep_count = max(1, min(component_count, 
max(desired_count, confident_count))) - selected = probability_row.sort_values(ascending=False).index[:keep_count] - selected_scores = share_row.loc[selected] - if float(selected_scores.sum()) <= 0.0: - selected_scores = probability_row.loc[selected] - if float(selected_scores.sum()) <= 0.0: - selected_scores = pd.Series(0.0, index=selected, dtype=float) - selected_scores.iloc[0] = 1.0 - masked.loc[index, selected] = selected_scores.to_numpy(dtype=float) - - return masked.loc[:, list(component_columns)] - - -def _mask_share_predictions_to_binary_support( - predicted_shares: pd.DataFrame, - support_mask: pd.DataFrame, - *, - component_columns: tuple[str, ...], -) -> pd.DataFrame: - masked = pd.DataFrame(0.0, index=predicted_shares.index, columns=list(component_columns)) - - for index in predicted_shares.index: - share_row = ( - predicted_shares.loc[index, list(component_columns)] - .astype(float) - .clip(lower=0.0) - ) - selected = [ - column - for column in component_columns - if float(support_mask.loc[index, column]) > 0.0 - ] - if not selected: - selected = [share_row.sort_values(ascending=False).index[0]] - selected_scores = share_row.loc[selected] - if float(selected_scores.sum()) <= 0.0: - selected_scores = pd.Series(0.0, index=selected, dtype=float) - selected_scores.iloc[0] = 1.0 - masked.loc[index, selected] = selected_scores.to_numpy(dtype=float) - - return masked.loc[:, list(component_columns)] - - -def _augment_sparse_shares_with_support_prior( - sparse_shares: pd.DataFrame, - base_share_scores: pd.DataFrame, - support_mask: pd.DataFrame, - *, - component_columns: tuple[str, ...], - max_extra_components: int, -) -> pd.DataFrame: - augmented = sparse_shares.loc[:, list(component_columns)].copy() - if max_extra_components <= 0: - return augmented - - for index in augmented.index: - sparse_row = ( - augmented.loc[index, list(component_columns)].astype(float).clip(lower=0.0) - ) - active_components = [ - column for column in component_columns if 
float(sparse_row[column]) > 0.0 - ] - supported_components = [ - column - for column in component_columns - if float(support_mask.loc[index, column]) > 0.0 - ] - missing_supported = [ - column for column in supported_components if column not in active_components - ] - if not missing_supported: - continue - - extra_budget = min( - max_extra_components, - max(0, len(supported_components) - len(active_components)), - ) - if extra_budget <= 0: - continue - - base_row = ( - base_share_scores.loc[index, list(component_columns)] - .astype(float) - .clip(lower=0.0) - ) - selected = ( - base_row.loc[missing_supported] - .sort_values(ascending=False) - .index[:extra_budget] - .tolist() - ) - if not selected: - continue - - selected_scores = base_row.loc[selected] - if float(selected_scores.sum()) <= 0.0: - selected_scores = pd.Series(0.0, index=selected, dtype=float) - selected_scores.iloc[0] = 1.0 - augmented.loc[index, selected] = selected_scores.to_numpy(dtype=float) - - return augmented.loc[:, list(component_columns)] - - -def reconcile_component_predictions_to_total( - predictions: pd.DataFrame, - *, - family_total: pd.Series, - component_columns: tuple[str, ...], - fallback_shares: dict[str, float] | None = None, -) -> pd.DataFrame: - """Project component predictions onto the observed family total.""" - - result = pd.DataFrame(index=predictions.index) - total = pd.to_numeric(family_total, errors="coerce").fillna(0.0).clip(lower=0.0) - fallback = fallback_shares or { - column: 1.0 / len(component_columns) - for column in component_columns - } - for column in component_columns: - result[column] = _numeric_series(predictions, column).clip(lower=0.0) - - positive_total = total > 0.0 - row_sum = result.loc[:, list(component_columns)].sum(axis=1) - positive_rows = positive_total & (row_sum > 0.0) - zero_rows = positive_total & ~positive_rows - - if positive_rows.any(): - result.loc[positive_rows, list(component_columns)] = result.loc[ - positive_rows, - 
list(component_columns), - ].div(row_sum.loc[positive_rows], axis=0).mul(total.loc[positive_rows], axis=0) - - if zero_rows.any(): - for column in component_columns: - result.loc[zero_rows, column] = total.loc[zero_rows] * float( - fallback.get(column, 0.0) - ) - - if (~positive_total).any(): - result.loc[~positive_total, list(component_columns)] = 0.0 - - return result.loc[:, list(component_columns)] - - -def _grouped_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> pd.DataFrame: - model = fit_grouped_share_model( - train_frame, - explicit_component_columns=spec.explicit_component_columns, - implicit_component_column=spec.implicit_component_column, - feature_sets=spec.grouped_feature_sets, - weight_column=spec.weight_column, - ) - feature_columns = sorted({column for group in spec.grouped_feature_sets for column in group}) - shares = predict_grouped_component_shares( - test_frame.loc[:, feature_columns].copy(), - model, - ) - result = pd.DataFrame(index=test_frame.index) - family_total = _numeric_series(test_frame, spec.total_column) - for column in spec.component_columns: - result[column] = family_total * shares[column] - return result.loc[:, list(spec.component_columns)] - - -def _default_qrf_factory(*, condition_vars: list[str], target_vars: list[str], n_estimators: int): - from microplex_us.pipelines.us import ColumnwiseQRFDonorImputer - - return ColumnwiseQRFDonorImputer( - condition_vars=condition_vars, - target_vars=target_vars, - n_estimators=n_estimators, - ) - - -def _qrf_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, - qrf_factory: Callable[..., Any] | None, -) -> pd.DataFrame: - factory = qrf_factory or _default_qrf_factory - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=spec.qrf_condition_vars, - ) - imputer = factory( - 
condition_vars=list(spec.qrf_condition_vars), - target_vars=list(spec.component_columns), - n_estimators=spec.qrf_n_estimators, - ) - fit_frame = encoded_train.copy() - for column in spec.component_columns: - fit_frame[column] = _numeric_series(train_frame, column) - fit_frame[spec.weight_column] = _numeric_series(train_frame, spec.weight_column) - imputer.fit( - fit_frame, - weight_col=spec.weight_column, - epochs=None, - batch_size=None, - learning_rate=None, - verbose=False, - ) - generated = imputer.generate( - encoded_test.copy(), - seed=random_seed, - ) - return reconcile_component_predictions_to_total( - generated, - family_total=test_frame[spec.total_column], - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - - -def _forest_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, -) -> pd.DataFrame: - condition_columns = ( - spec.forest_condition_vars if spec.forest_condition_vars else spec.qrf_condition_vars - ) - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=condition_columns, - ) - model = RandomForestRegressor( - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - ) - train_targets = _component_share_targets(train_frame, spec=spec) - train_weights = _numeric_series(train_frame, spec.weight_column) - if train_weights.sum() > 0.0: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - sample_weight=train_weights.to_numpy(dtype=float), - ) - else: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - ) - predicted_shares = pd.DataFrame( - model.predict(encoded_test.to_numpy(dtype=float)), - index=test_frame.index, - columns=list(spec.component_columns), - ) - normalized = 
_normalize_share_predictions( - predicted_shares, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - family_total = _numeric_series(test_frame, spec.total_column) - result = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - result[column] = normalized[column] * family_total - return result.loc[:, list(spec.component_columns)] - - -def _sparse_forest_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, -) -> pd.DataFrame: - condition_columns = ( - spec.forest_condition_vars if spec.forest_condition_vars else spec.qrf_condition_vars - ) - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=condition_columns, - ) - model = RandomForestRegressor( - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - ) - train_targets = _component_share_targets(train_frame, spec=spec) - train_weights = _numeric_series(train_frame, spec.weight_column) - if train_weights.sum() > 0.0: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - sample_weight=train_weights.to_numpy(dtype=float), - ) - else: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - ) - predicted_shares = pd.DataFrame( - model.predict(encoded_test.to_numpy(dtype=float)), - index=test_frame.index, - columns=list(spec.component_columns), - ) - normalized = _normalize_share_predictions( - predicted_shares, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - sparsified = _sparsify_normalized_share_predictions( - normalized, - component_columns=spec.component_columns, - min_component_share=spec.forest_share_min_component_share, - ) - family_total = 
_numeric_series(test_frame, spec.total_column) - result = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - result[column] = sparsified[column] * family_total - return result.loc[:, list(spec.component_columns)] - - -def _support_gated_forest_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, -) -> pd.DataFrame: - condition_columns = ( - spec.forest_condition_vars if spec.forest_condition_vars else spec.qrf_condition_vars - ) - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=condition_columns, - ) - train_weights = _numeric_series(train_frame, spec.weight_column) - positive_weights = ( - train_weights if float(train_weights.sum()) > 0.0 else pd.Series(1.0, index=train_frame.index) - ) - predicted_shares = _component_share_targets(train_frame, spec=spec) - share_model = RandomForestRegressor( - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - ) - share_model.fit( - encoded_train.to_numpy(dtype=float), - predicted_shares.to_numpy(dtype=float), - sample_weight=positive_weights.to_numpy(dtype=float), - ) - raw_share_predictions = pd.DataFrame( - share_model.predict(encoded_test.to_numpy(dtype=float)), - index=test_frame.index, - columns=list(spec.component_columns), - ) - support_targets = _component_support_targets(train_frame, spec=spec) - support_probabilities = pd.DataFrame(index=test_frame.index) - for offset, column in enumerate(spec.component_columns, start=1): - support_probabilities[column] = _fit_support_probability_model( - encoded_train, - encoded_test, - target=support_targets[column], - sample_weight=positive_weights, - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_seed=random_seed + offset, - ) - predicted_active_counts = 
_predict_active_component_counts( - encoded_train, - encoded_test, - support_targets=support_targets, - sample_weight=positive_weights, - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_seed=random_seed + len(spec.component_columns) + 1, - ) - masked_shares = _mask_share_predictions_to_supported_components( - raw_share_predictions, - support_probabilities, - predicted_active_counts, - component_columns=spec.component_columns, - support_gate_probability_threshold=spec.support_gate_probability_threshold, - ) - normalized = _normalize_share_predictions( - masked_shares, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - family_total = _numeric_series(test_frame, spec.total_column) - result = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - result[column] = normalized[column] * family_total - return result.loc[:, list(spec.component_columns)] - - -def _qrf_support_masked_forest_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, - qrf_predictions: pd.DataFrame, -) -> pd.DataFrame: - condition_columns = ( - spec.forest_condition_vars if spec.forest_condition_vars else spec.qrf_condition_vars - ) - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=condition_columns, - ) - train_weights = _numeric_series(train_frame, spec.weight_column) - positive_weights = ( - train_weights - if float(train_weights.sum()) > 0.0 - else pd.Series(1.0, index=train_frame.index) - ) - predicted_shares = _component_share_targets(train_frame, spec=spec) - share_model = RandomForestRegressor( - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - ) - share_model.fit( - encoded_train.to_numpy(dtype=float), - 
predicted_shares.to_numpy(dtype=float), - sample_weight=positive_weights.to_numpy(dtype=float), - ) - raw_share_predictions = pd.DataFrame( - share_model.predict(encoded_test.to_numpy(dtype=float)), - index=test_frame.index, - columns=list(spec.component_columns), - ) - qrf_support_mask = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - qrf_support_mask[column] = (_numeric_series(qrf_predictions, column) > 0.0).astype(float) - masked_shares = _mask_share_predictions_to_binary_support( - raw_share_predictions, - qrf_support_mask, - component_columns=spec.component_columns, - ) - normalized = _normalize_share_predictions( - masked_shares, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - family_total = _numeric_series(test_frame, spec.total_column) - result = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - result[column] = normalized[column] * family_total - return result.loc[:, list(spec.component_columns)] - - -def _qrf_augmented_sparse_forest_share_predict( - train_frame: pd.DataFrame, - test_frame: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - random_seed: int, - qrf_predictions: pd.DataFrame, -) -> pd.DataFrame: - condition_columns = ( - spec.forest_condition_vars if spec.forest_condition_vars else spec.qrf_condition_vars - ) - encoded_train, encoded_test = _encode_condition_frames( - train_frame, - test_frame, - condition_columns=condition_columns, - ) - model = RandomForestRegressor( - n_estimators=spec.forest_n_estimators, - min_samples_leaf=spec.forest_min_samples_leaf, - random_state=random_seed, - n_jobs=-1, - ) - train_targets = _component_share_targets(train_frame, spec=spec) - train_weights = _numeric_series(train_frame, spec.weight_column) - if train_weights.sum() > 0.0: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - 
sample_weight=train_weights.to_numpy(dtype=float), - ) - else: - model.fit( - encoded_train.to_numpy(dtype=float), - train_targets.to_numpy(dtype=float), - ) - predicted_shares = pd.DataFrame( - model.predict(encoded_test.to_numpy(dtype=float)), - index=test_frame.index, - columns=list(spec.component_columns), - ) - normalized = _normalize_share_predictions( - predicted_shares, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - sparse = _sparsify_normalized_share_predictions( - normalized, - component_columns=spec.component_columns, - min_component_share=spec.forest_share_min_component_share, - ) - qrf_support_mask = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - qrf_support_mask[column] = (_numeric_series(qrf_predictions, column) > 0.0).astype(float) - augmented = _augment_sparse_shares_with_support_prior( - sparse, - normalized, - qrf_support_mask, - component_columns=spec.component_columns, - max_extra_components=spec.qrf_support_augmentation_max_extra_components, - ) - renormalized = _normalize_share_predictions( - augmented, - component_columns=spec.component_columns, - fallback_shares=_overall_component_shares(train_frame, spec=spec), - ) - family_total = _numeric_series(test_frame, spec.total_column) - result = pd.DataFrame(index=test_frame.index) - for column in spec.component_columns: - result[column] = renormalized[column] * family_total - return result.loc[:, list(spec.component_columns)] - - -def _summarize_method( - actual_eval: pd.DataFrame, - target_eval: pd.DataFrame, - predicted_components: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, -) -> FamilyImputationMethodBenchmark: - passthrough_columns = { - spec.weight_column, - *spec.group_eval_columns, - *spec.component_columns, - } - for feature_set in spec.reweight_feature_sets: - passthrough_columns.update(feature_set) - actual_eval = actual_eval.loc[:, list(passthrough_columns)].copy() 
- target_eval = target_eval.loc[:, list(passthrough_columns)].copy() - predicted_eval = actual_eval.loc[:, [column for column in passthrough_columns if column not in spec.component_columns]].copy() - for column in spec.component_columns: - predicted_eval[column] = _numeric_series(predicted_components, column) - - actual_totals = _weighted_component_totals( - actual_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - predicted_totals = _weighted_component_totals( - predicted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - total_relative_error = { - column: _relative_error(predicted_totals[column], actual_totals[column]) - for column in spec.component_columns - } - - actual_support = _weighted_component_support( - actual_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - predicted_support = _weighted_component_support( - predicted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - support_relative_error = { - column: _relative_error(predicted_support[column], actual_support[column]) - for column in spec.component_columns - } - - group_sum_mare = _component_group_sum_mare( - actual_eval, - predicted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - group_columns=spec.group_eval_columns, - ) - - target_totals = _weighted_component_totals( - target_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - target_support = _weighted_component_support( - target_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - if spec.reweight_initial_weight_mode == "uniform": - start_weights = pd.Series(1.0, index=actual_eval.index, dtype=float) - elif spec.reweight_initial_weight_mode == "observed": - start_weights = _numeric_series(actual_eval, spec.weight_column) - else: - raise 
ValueError("reweight_initial_weight_mode must be 'observed' or 'uniform'") - - pre_target_eval = predicted_eval.copy() - pre_target_eval[spec.weight_column] = start_weights.to_numpy(dtype=float) - pre_target_totals = _weighted_component_totals( - pre_target_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - pre_target_total_error = { - column: _relative_error(pre_target_totals[column], target_totals[column]) - for column in spec.component_columns - } - pre_target_mean_total_error = float( - np.mean(list(pre_target_total_error.values())) - ) - - oracle_pre_target_eval = actual_eval.copy() - oracle_pre_target_eval[spec.weight_column] = start_weights.to_numpy(dtype=float) - oracle_pre_target_totals = _weighted_component_totals( - oracle_pre_target_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - oracle_pre_target_total_error = { - column: _relative_error(oracle_pre_target_totals[column], target_totals[column]) - for column in spec.component_columns - } - oracle_pre_target_mean_total_error = float( - np.mean(list(oracle_pre_target_total_error.values())) - ) - - post_reweight_total_error = None - post_reweight_support_error = None - post_reweight_group_sum_mare = None - post_reweight_mean_total_error = None - post_reweight_mean_support_error = None - post_reweight_mean_group_sum_mare = None - post_reweight_total_error_lift = None - post_reweight_mean_total_error_lift = None - oracle_post_reweight_total_error_lift = None - oracle_post_reweight_mean_total_error_lift = None - oracle_post_reweight_total_error = None - oracle_post_reweight_mean_total_error = None - post_reweight_excess_over_oracle_total_error = None - post_reweight_mean_total_error_excess_over_oracle = None - reweighting_summary = None - reweighted_result = _apply_reweighting( - target_eval, - predicted_eval, - spec=spec, - ) - if reweighted_result is not None: - reweighted_eval, reweighting_summary = reweighted_result - 
reweighted_totals = _weighted_component_totals( - reweighted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - post_reweight_total_error = { - column: _relative_error(reweighted_totals[column], target_totals[column]) - for column in spec.component_columns - } - reweighted_support = _weighted_component_support( - reweighted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - post_reweight_support_error = { - column: _relative_error(reweighted_support[column], target_support[column]) - for column in spec.component_columns - } - post_reweight_group_sum_mare = _component_group_sum_mare( - target_eval, - reweighted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - group_columns=spec.group_eval_columns, - ) - post_reweight_mean_total_error = float( - np.mean(list(post_reweight_total_error.values())) - ) - post_reweight_mean_support_error = float( - np.mean(list(post_reweight_support_error.values())) - ) - post_reweight_mean_group_sum_mare = ( - float(np.mean(list(post_reweight_group_sum_mare.values()))) - if post_reweight_group_sum_mare - else None - ) - oracle_reweighted_result = _apply_reweighting( - target_eval, - actual_eval, - spec=spec, - ) - if oracle_reweighted_result is not None: - oracle_reweighted_eval, _oracle_summary = oracle_reweighted_result - oracle_reweighted_totals = _weighted_component_totals( - oracle_reweighted_eval, - component_columns=spec.component_columns, - weight_column=spec.weight_column, - ) - oracle_post_reweight_total_error = { - column: _relative_error( - oracle_reweighted_totals[column], - target_totals[column], - ) - for column in spec.component_columns - } - oracle_post_reweight_mean_total_error = float( - np.mean(list(oracle_post_reweight_total_error.values())) - ) - oracle_post_reweight_total_error_lift = { - column: oracle_post_reweight_total_error[column] - - oracle_pre_target_total_error[column] - for column in 
spec.component_columns - } - oracle_post_reweight_mean_total_error_lift = float( - np.mean(list(oracle_post_reweight_total_error_lift.values())) - ) - post_reweight_excess_over_oracle_total_error = { - column: post_reweight_total_error[column] - - oracle_post_reweight_total_error[column] - for column in spec.component_columns - } - post_reweight_mean_total_error_excess_over_oracle = float( - np.mean(list(post_reweight_excess_over_oracle_total_error.values())) - ) - post_reweight_total_error_lift = { - column: post_reweight_total_error[column] - pre_target_total_error[column] - for column in spec.component_columns - } - post_reweight_mean_total_error_lift = float( - np.mean(list(post_reweight_total_error_lift.values())) - ) - - return FamilyImputationMethodBenchmark( - component_total_relative_error=total_relative_error, - component_support_relative_error=support_relative_error, - component_group_sum_mare=group_sum_mare, - mean_component_total_relative_error=float(np.mean(list(total_relative_error.values()))), - mean_component_support_relative_error=float( - np.mean(list(support_relative_error.values())) - ), - mean_component_group_sum_mare=( - float(np.mean(list(group_sum_mare.values()))) if group_sum_mare else None - ), - pre_target_component_total_relative_error=pre_target_total_error, - pre_target_mean_component_total_relative_error=pre_target_mean_total_error, - post_reweight_component_total_relative_error=post_reweight_total_error, - post_reweight_component_support_relative_error=post_reweight_support_error, - post_reweight_component_group_sum_mare=post_reweight_group_sum_mare, - post_reweight_mean_component_total_relative_error=post_reweight_mean_total_error, - post_reweight_mean_component_support_relative_error=post_reweight_mean_support_error, - post_reweight_mean_component_group_sum_mare=post_reweight_mean_group_sum_mare, - post_reweight_total_error_lift=post_reweight_total_error_lift, - 
post_reweight_mean_component_total_error_lift=post_reweight_mean_total_error_lift, - oracle_pre_target_component_total_relative_error=oracle_pre_target_total_error, - oracle_pre_target_mean_component_total_relative_error=oracle_pre_target_mean_total_error, - oracle_post_reweight_component_total_relative_error=oracle_post_reweight_total_error, - oracle_post_reweight_mean_component_total_relative_error=oracle_post_reweight_mean_total_error, - oracle_post_reweight_total_error_lift=oracle_post_reweight_total_error_lift, - oracle_post_reweight_mean_component_total_error_lift=oracle_post_reweight_mean_total_error_lift, - post_reweight_excess_over_oracle_total_error=post_reweight_excess_over_oracle_total_error, - post_reweight_mean_component_total_error_excess_over_oracle=post_reweight_mean_total_error_excess_over_oracle, - reweighting_summary=reweighting_summary, - ) - - -_REPEAT_SCALAR_FIELDS = ( - "mean_component_total_relative_error", - "mean_component_support_relative_error", - "mean_component_group_sum_mare", - "pre_target_mean_component_total_relative_error", - "post_reweight_mean_component_total_relative_error", - "post_reweight_mean_component_support_relative_error", - "post_reweight_mean_component_group_sum_mare", - "post_reweight_mean_component_total_error_lift", - "oracle_pre_target_mean_component_total_relative_error", - "oracle_post_reweight_mean_component_total_relative_error", - "oracle_post_reweight_mean_component_total_error_lift", - "post_reweight_mean_component_total_error_excess_over_oracle", -) - - -def _aggregate_reweighting_summaries( - summaries: list[dict[str, Any] | None], - *, - repeat_count: int, -) -> dict[str, Any] | None: - present = [summary for summary in summaries if summary is not None] - if not present: - return None - first = present[0] - aggregated: dict[str, Any] = { - "method": first.get("method"), - "initial_weight_mode": first.get("initial_weight_mode"), - "target_columns": first.get("target_columns"), - "target_count": 
first.get("target_count"), - "target_row_count": first.get("target_row_count"), - "eval_row_count": first.get("eval_row_count"), - "repeat_count": repeat_count, - "converged": all(bool(summary.get("converged")) for summary in present), - "converged_count": int(sum(bool(summary.get("converged")) for summary in present)), - } - numeric_fields = ( - "max_error", - "n_iterations", - "initial_total_weight", - "final_total_weight", - "mean_abs_relative_weight_change", - "max_abs_relative_weight_change", - "share_rows_changed_gt_1pct", - ) - for field_name in numeric_fields: - values = [ - float(summary[field_name]) - for summary in present - if summary.get(field_name) is not None - ] - if not values: - continue - aggregated[field_name] = float(np.median(values)) - aggregated[f"{field_name}_worst"] = float(max(values)) - return aggregated - - -def _aggregate_method_benchmarks( - repeats: list[FamilyImputationMethodBenchmark], -) -> FamilyImputationMethodBenchmark: - aggregated_values: dict[str, Any] = {} - repeat_metric_summary: dict[str, dict[str, float]] = {} - for field_info in fields(FamilyImputationMethodBenchmark): - field_name = field_info.name - if field_name in {"reweighting_summary", "repeat_metric_summary"}: - continue - values = [getattr(result, field_name) for result in repeats] - sample = next((value for value in values if value is not None), None) - if sample is None: - aggregated_values[field_name] = None - continue - if isinstance(sample, dict): - aggregated_values[field_name] = _aggregate_numeric_dicts(values) - continue - aggregated_values[field_name] = _median_or_none(values) - if field_name in _REPEAT_SCALAR_FIELDS and aggregated_values[field_name] is not None: - repeat_metric_summary[field_name] = { - "median": float(aggregated_values[field_name]), - "worst": float(_max_or_none(values)), - } - aggregated_values["reweighting_summary"] = _aggregate_reweighting_summaries( - [result.reweighting_summary for result in repeats], - repeat_count=len(repeats), 
- ) - aggregated_values["repeat_metric_summary"] = repeat_metric_summary or None - return FamilyImputationMethodBenchmark(**aggregated_values) - - -def _compact_repeat_summary( - result: FamilyImputationBenchmarkResult, - *, - repeat_index: int, - split_seed: int, -) -> dict[str, Any]: - method_fields = ( - "mean_component_total_relative_error", - "mean_component_support_relative_error", - "mean_component_group_sum_mare", - "pre_target_mean_component_total_relative_error", - "post_reweight_mean_component_total_relative_error", - "post_reweight_mean_component_support_relative_error", - "post_reweight_mean_component_group_sum_mare", - "post_reweight_mean_component_total_error_lift", - "oracle_pre_target_mean_component_total_relative_error", - "oracle_post_reweight_mean_component_total_relative_error", - "oracle_post_reweight_mean_component_total_error_lift", - "post_reweight_mean_component_total_error_excess_over_oracle", - ) - methods = { - method_name: { - field_name: getattr(method_result, field_name) - for field_name in method_fields - } - for method_name, method_result in result.methods.items() - } - return { - "repeat_index": repeat_index, - "split_seed": split_seed, - "train_row_count": result.train_row_count, - "eval_row_count": result.eval_row_count, - "target_row_count": result.target_row_count, - "methods": methods, - } - - -def _benchmark_decomposable_family_imputers_once( - reference: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - train_frac: float, - target_frac: float, - random_seed: int, - qrf_factory: Callable[..., Any] | None, -) -> FamilyImputationBenchmarkResult: - frame = _build_positive_family_frame(reference, spec=spec) - train_frame, eval_frame, target_frame = _split_train_eval_target( - frame, - train_frac=train_frac, - target_frac=target_frac, - random_seed=random_seed, - ) - - grouped_predictions = _grouped_share_predict( - train_frame, - eval_frame, - spec=spec, - ) - qrf_predictions = _qrf_predict( - train_frame, - 
eval_frame, - spec=spec, - random_seed=random_seed, - qrf_factory=qrf_factory, - ) - qrf_support_masked_predictions = _qrf_support_masked_forest_share_predict( - train_frame, - eval_frame, - spec=spec, - random_seed=random_seed, - qrf_predictions=qrf_predictions, - ) - - methods = { - "grouped_share": _summarize_method( - eval_frame, - target_frame, - grouped_predictions, - spec=spec, - ), - "forest_share": _summarize_method( - eval_frame, - target_frame, - _forest_share_predict( - train_frame, - eval_frame, - spec=spec, - random_seed=random_seed, - ), - spec=spec, - ), - "sparse_forest_share": _summarize_method( - eval_frame, - target_frame, - _sparse_forest_share_predict( - train_frame, - eval_frame, - spec=spec, - random_seed=random_seed, - ), - spec=spec, - ), - "support_gated_forest_share": _summarize_method( - eval_frame, - target_frame, - _support_gated_forest_share_predict( - train_frame, - eval_frame, - spec=spec, - random_seed=random_seed, - ), - spec=spec, - ), - "qrf_support_masked_forest_share": _summarize_method( - eval_frame, - target_frame, - qrf_support_masked_predictions, - spec=spec, - ), - "qrf_augmented_sparse_forest_share": _summarize_method( - eval_frame, - target_frame, - _qrf_augmented_sparse_forest_share_predict( - train_frame, - eval_frame, - spec=spec, - random_seed=random_seed, - qrf_predictions=qrf_predictions, - ), - spec=spec, - ), - "qrf": _summarize_method( - eval_frame, - target_frame, - qrf_predictions, - spec=spec, - ), - } - return FamilyImputationBenchmarkResult( - spec=spec, - row_count=int(len(frame)), - train_row_count=int(len(train_frame)), - eval_row_count=int(len(eval_frame)), - target_row_count=int(len(target_frame)), - methods=methods, - ) - - -def benchmark_decomposable_family_imputers( - reference: pd.DataFrame, - *, - spec: DecomposableFamilyBenchmarkSpec, - train_frac: float = 0.8, - target_frac: float = 0.1, - random_seed: int = 42, - repeat_count: int = 1, - repeat_seed_step: int = 1, - qrf_factory: Callable[..., 
Any] | None = None, -) -> FamilyImputationBenchmarkResult: - """Benchmark decomposable-family imputers on one or more holdout splits.""" - - if repeat_count < 1: - raise ValueError("repeat_count must be at least 1") - if repeat_seed_step < 1: - raise ValueError("repeat_seed_step must be at least 1") - - repeat_results: list[FamilyImputationBenchmarkResult] = [] - split_seeds = tuple( - int(random_seed + repeat_index * repeat_seed_step) - for repeat_index in range(repeat_count) - ) - for split_seed in split_seeds: - repeat_results.append( - _benchmark_decomposable_family_imputers_once( - reference, - spec=spec, - train_frac=train_frac, - target_frac=target_frac, - random_seed=split_seed, - qrf_factory=qrf_factory, - ) - ) - - first_result = repeat_results[0] - methods = { - method_name: _aggregate_method_benchmarks( - [result.methods[method_name] for result in repeat_results] - ) - for method_name in first_result.methods - } - repeat_summaries = tuple( - _compact_repeat_summary( - result, - repeat_index=repeat_index, - split_seed=split_seed, - ) - for repeat_index, (result, split_seed) in enumerate( - zip(repeat_results, split_seeds, strict=True) - ) - ) - return FamilyImputationBenchmarkResult( - spec=first_result.spec, - row_count=first_result.row_count, - train_row_count=first_result.train_row_count, - eval_row_count=first_result.eval_row_count, - target_row_count=first_result.target_row_count, - methods=methods, - repeat_count=repeat_count, - split_seeds=split_seeds, - repeat_summaries=repeat_summaries, - ) diff --git a/src/microplex_us/data_sources/forbes.py b/src/microplex_us/data_sources/forbes.py deleted file mode 100644 index f90c0d95..00000000 --- a/src/microplex_us/data_sources/forbes.py +++ /dev/null @@ -1,576 +0,0 @@ -"""Forbes fixed-spine source support for Microplex top-tail units.""" - -from __future__ import annotations - -import hashlib -import json -from collections.abc import Iterable, Mapping -from dataclasses import asdict, dataclass, replace 
-from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.core import EntityType -from microplex.targets import TargetAggregation, TargetSet, TargetSpec - -from microplex_us.policyengine.us import ( - DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSVariableBinding, - compile_supported_policyengine_us_household_linear_constraints, -) - -FORBES_HOUSEHOLD_VARIABLES: tuple[str, ...] = ( - "state_fips", - "net_worth", -) - -FORBES_PERSON_VARIABLES: tuple[str, ...] = ( - "age", - "is_female", - "employment_income_before_lsr", - "self_employment_income_before_lsr", - "taxable_interest_income", - "tax_exempt_interest_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "short_term_capital_gains", - "long_term_capital_gains_before_response", - "partnership_s_corp_income", - "partnership_se_income", - "estate_income", - "farm_income", - "rental_income", -) - -FORBES_SOURCE_METADATA_COLUMNS: tuple[str, ...] 
= ( - "forbes_unit_id", - "forbes_name", - "forbes_rank", - "forbes_snapshot_id", - "replicate_index", - "replicate_count", - "replicate_weight", - "household_id", - "person_id", - "tax_unit_id", - "spm_unit_id", - "family_id", - "marital_unit_id", -) - - -@dataclass(frozen=True) -class ForbesFixedSpineConfig: - """Controls deterministic Forbes fixed-spine construction.""" - - period: int = 2024 - snapshot_id: str = "forbes-us-top-tail" - replicates_per_unit: int = 10 - default_unit_weight: float = 1.0 - household_id_start: int = 90_000_000_000 - person_id_start: int = 90_000_000_000 - tax_unit_id_start: int = 90_000_000_000 - spm_unit_id_start: int = 90_000_000_000 - family_id_start: int = 90_000_000_000 - marital_unit_id_start: int = 90_000_000_000 - unit_id_column: str = "forbes_unit_id" - name_column: str = "name" - rank_column: str = "rank" - unit_weight_column: str = "weight" - - def __post_init__(self) -> None: - if self.replicates_per_unit < 1: - raise ValueError("replicates_per_unit must be at least 1") - if self.default_unit_weight <= 0: - raise ValueError("default_unit_weight must be positive") - - -@dataclass(frozen=True) -class ForbesFixedSpine: - """Constructed fixed spine plus source-owned diagnostic metadata.""" - - tables: PolicyEngineUSEntityTableBundle - record_metadata: pd.DataFrame - source_metadata: dict[str, Any] - - -@dataclass(frozen=True) -class FixedSpineTargetContribution: - """Fixed-spine contribution to one target.""" - - target_name: str - target_value: float - contribution: float - residual_value: float - status: str - reason: str | None = None - clamped: bool = False - - def to_dict(self) -> dict[str, Any]: - return asdict(self) - - -@dataclass(frozen=True) -class FixedSpineResidualizationResult: - """Residualized targets and diagnostic contribution records.""" - - targets: TargetSet - contributions: tuple[FixedSpineTargetContribution, ...] 
- - def diagnostics(self) -> list[dict[str, Any]]: - return [contribution.to_dict() for contribution in self.contributions] - - -def read_forbes_fixed_spine_records(path: str | Path) -> pd.DataFrame: - """Read normalized Forbes spine records from JSON, JSONL, or CSV.""" - - input_path = Path(path) - suffixes = tuple(suffix.lower() for suffix in input_path.suffixes) - if suffixes[-1:] == (".csv",): - return pd.read_csv(input_path) - if suffixes[-1:] == (".jsonl",): - return pd.read_json(input_path, lines=True) - if suffixes[-1:] == (".json",): - return pd.read_json(input_path) - raise ValueError( - f"Forbes fixed-spine records must be .csv, .json, or .jsonl; got {input_path}" - ) - - -def build_forbes_fixed_spine( - records: pd.DataFrame | str | Path, - *, - config: ForbesFixedSpineConfig | None = None, - source_path: str | Path | None = None, - source_metadata: Mapping[str, Any] | None = None, -) -> ForbesFixedSpine: - """Build deterministic PolicyEngine entity tables for Forbes fixed units. - - The returned entity tables contain only model-facing variables. Source fields - such as names, ranks, and replicate provenance are kept in ``record_metadata`` - and ``source_metadata`` so they can be audited without entering the exported - PolicyEngine dataset. 
- """ - - resolved_config = config or ForbesFixedSpineConfig() - if isinstance(records, (str, Path)): - source_path = records if source_path is None else source_path - records = read_forbes_fixed_spine_records(records) - normalized = _normalize_forbes_records(records, resolved_config) - - household_rows: list[dict[str, Any]] = [] - person_rows: list[dict[str, Any]] = [] - tax_unit_rows: list[dict[str, Any]] = [] - spm_unit_rows: list[dict[str, Any]] = [] - family_rows: list[dict[str, Any]] = [] - marital_unit_rows: list[dict[str, Any]] = [] - metadata_rows: list[dict[str, Any]] = [] - - for unit_position, (_, unit) in enumerate(normalized.iterrows()): - unit_weight = _numeric_or_default( - unit.get(resolved_config.unit_weight_column), - resolved_config.default_unit_weight, - ) - replicate_weight = unit_weight / float(resolved_config.replicates_per_unit) - for replicate_index in range(resolved_config.replicates_per_unit): - row_index = ( - unit_position * resolved_config.replicates_per_unit + replicate_index - ) - household_id = resolved_config.household_id_start + row_index - person_id = resolved_config.person_id_start + row_index - tax_unit_id = resolved_config.tax_unit_id_start + row_index - spm_unit_id = resolved_config.spm_unit_id_start + row_index - family_id = resolved_config.family_id_start + row_index - marital_unit_id = resolved_config.marital_unit_id_start + row_index - state_fips = _normalize_state_fips(unit.get("state_fips")) - - household = { - "household_id": household_id, - "household_weight": replicate_weight, - "state_fips": state_fips, - } - for variable in FORBES_HOUSEHOLD_VARIABLES: - if variable == "state_fips": - continue - if variable in unit: - household[variable] = _numeric_or_default(unit.get(variable), 0.0) - household_rows.append(household) - - person = { - "person_id": person_id, - "household_id": household_id, - "tax_unit_id": tax_unit_id, - "spm_unit_id": spm_unit_id, - "family_id": family_id, - "marital_unit_id": marital_unit_id, - 
"state_fips": state_fips, - } - for variable in FORBES_PERSON_VARIABLES: - if variable == "state_fips": - continue - if variable in unit: - person[variable] = _numeric_or_default(unit.get(variable), 0.0) - person_rows.append(person) - - tax_unit_rows.append( - { - "tax_unit_id": tax_unit_id, - "household_id": household_id, - "state_fips": state_fips, - } - ) - spm_unit_rows.append( - { - "spm_unit_id": spm_unit_id, - "household_id": household_id, - "state_fips": state_fips, - } - ) - family_rows.append( - { - "family_id": family_id, - "household_id": household_id, - "state_fips": state_fips, - } - ) - marital_unit_rows.append( - { - "marital_unit_id": marital_unit_id, - "household_id": household_id, - "state_fips": state_fips, - } - ) - metadata_rows.append( - { - "forbes_unit_id": unit[resolved_config.unit_id_column], - "forbes_name": unit.get(resolved_config.name_column), - "forbes_rank": _optional_int(unit.get(resolved_config.rank_column)), - "forbes_snapshot_id": resolved_config.snapshot_id, - "replicate_index": replicate_index, - "replicate_count": resolved_config.replicates_per_unit, - "replicate_weight": replicate_weight, - "household_id": household_id, - "person_id": person_id, - "tax_unit_id": tax_unit_id, - "spm_unit_id": spm_unit_id, - "family_id": family_id, - "marital_unit_id": marital_unit_id, - } - ) - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame(household_rows), - persons=pd.DataFrame(person_rows), - tax_units=pd.DataFrame(tax_unit_rows), - spm_units=pd.DataFrame(spm_unit_rows), - families=pd.DataFrame(family_rows), - marital_units=pd.DataFrame(marital_unit_rows), - ) - metadata = { - **dict(source_metadata or {}), - "source": "forbes_fixed_spine", - "snapshot_id": resolved_config.snapshot_id, - "period": resolved_config.period, - "unit_count": int(len(normalized)), - "replicates_per_unit": int(resolved_config.replicates_per_unit), - "record_count": int(len(household_rows)), - "source_path": str(source_path) if source_path 
is not None else None, - "source_sha256": _sha256_file(source_path) if source_path is not None else None, - } - return ForbesFixedSpine( - tables=tables, - record_metadata=pd.DataFrame( - metadata_rows, columns=FORBES_SOURCE_METADATA_COLUMNS - ), - source_metadata=metadata, - ) - - -def append_forbes_fixed_spine_tables( - tables: PolicyEngineUSEntityTableBundle, - fixed_spine: ForbesFixedSpine, -) -> PolicyEngineUSEntityTableBundle: - """Append fixed-spine entity rows after ordinary calibration.""" - - fixed_tables = _with_person_weights(fixed_spine.tables) - return PolicyEngineUSEntityTableBundle( - households=_append_entity_table( - tables.households, - fixed_tables.households, - id_column="household_id", - ), - persons=_append_entity_table( - tables.persons, - fixed_tables.persons, - id_column="person_id", - ), - tax_units=_append_entity_table( - tables.tax_units, - fixed_tables.tax_units, - id_column="tax_unit_id", - ), - spm_units=_append_entity_table( - tables.spm_units, - fixed_tables.spm_units, - id_column="spm_unit_id", - ), - families=_append_entity_table( - tables.families, - fixed_tables.families, - id_column="family_id", - ), - marital_units=_append_entity_table( - tables.marital_units, - fixed_tables.marital_units, - id_column="marital_unit_id", - ), - ) - - -def forbes_fixed_spine_variable_bindings( - tables: PolicyEngineUSEntityTableBundle, -) -> dict[str, PolicyEngineUSVariableBinding]: - """Return variable bindings for scoring fixed-spine target contributions.""" - - bindings = dict(DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS) - household_columns = set(tables.households.columns) - person_columns = set(tables.persons.columns if tables.persons is not None else ()) - - for variable in FORBES_HOUSEHOLD_VARIABLES: - if variable in household_columns: - bindings[variable] = PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column=variable, - ) - for variable in FORBES_PERSON_VARIABLES: - if variable in person_columns and variable not in 
bindings: - bindings[variable] = PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column=variable, - ) - return bindings - - -def residualize_targets_for_fixed_spine( - targets: TargetSet | Iterable[TargetSpec], - fixed_spine_tables: PolicyEngineUSEntityTableBundle, - *, - variable_bindings: Mapping[str, PolicyEngineUSVariableBinding] | None = None, - clamp_negative_residuals: bool = True, -) -> FixedSpineResidualizationResult: - """Subtract fixed-spine contributions from additive calibration targets.""" - - target_list = list(targets.targets if isinstance(targets, TargetSet) else targets) - bindings = { - **forbes_fixed_spine_variable_bindings(fixed_spine_tables), - **dict(variable_bindings or {}), - } - residualized_targets: list[TargetSpec] = [] - contributions: list[FixedSpineTargetContribution] = [] - - for target in target_list: - contribution = _fixed_spine_target_contribution( - target, - fixed_spine_tables, - variable_bindings=bindings, - ) - if contribution.status != "supported": - residualized_targets.append(target) - contributions.append(contribution) - continue - - residual_value = float(target.value) - contribution.contribution - clamped = False - if clamp_negative_residuals and residual_value < 0.0: - residual_value = 0.0 - clamped = True - metadata = dict(target.metadata) - metadata["fixed_spine_residualization"] = { - "original_value": float(target.value), - "fixed_spine_contribution": contribution.contribution, - "residual_value": residual_value, - "clamped": clamped, - } - residualized_targets.append( - replace(target, value=residual_value, metadata=metadata) - ) - contributions.append( - replace(contribution, residual_value=residual_value, clamped=clamped) - ) - - return FixedSpineResidualizationResult( - targets=TargetSet(residualized_targets), - contributions=tuple(contributions), - ) - - -def fixed_spine_contribution_diagnostics_json( - result: FixedSpineResidualizationResult, -) -> str: - """Serialize fixed-spine residualization 
diagnostics for artifact manifests.""" - - return json.dumps(result.diagnostics(), indent=2, sort_keys=True) - - -def _fixed_spine_target_contribution( - target: TargetSpec, - tables: PolicyEngineUSEntityTableBundle, - *, - variable_bindings: Mapping[str, PolicyEngineUSVariableBinding], -) -> FixedSpineTargetContribution: - if target.aggregation is TargetAggregation.MEAN: - return FixedSpineTargetContribution( - target_name=target.name, - target_value=float(target.value), - contribution=0.0, - residual_value=float(target.value), - status="unsupported", - reason="mean targets are not additive residualization targets", - ) - - weights = _household_weights(tables) - try: - supported, unsupported, constraints = ( - compile_supported_policyengine_us_household_linear_constraints( - [target], - tables, - variable_bindings=dict(variable_bindings), - ) - ) - except (KeyError, ValueError) as error: - return FixedSpineTargetContribution( - target_name=target.name, - target_value=float(target.value), - contribution=0.0, - residual_value=float(target.value), - status="unsupported", - reason=str(error), - ) - - if unsupported or not supported or not constraints: - return FixedSpineTargetContribution( - target_name=target.name, - target_value=float(target.value), - contribution=0.0, - residual_value=float(target.value), - status="unsupported", - reason="target cannot be compiled against fixed-spine entity tables", - ) - - contribution = float(np.dot(weights, constraints[0].coefficients)) - return FixedSpineTargetContribution( - target_name=target.name, - target_value=float(target.value), - contribution=contribution, - residual_value=float(target.value) - contribution, - status="supported", - ) - - -def _normalize_forbes_records( - records: pd.DataFrame, - config: ForbesFixedSpineConfig, -) -> pd.DataFrame: - result = records.copy() - if result.empty: - raise ValueError("Forbes fixed spine requires at least one source record") - if "net_worth" not in result.columns: - raise 
ValueError("Forbes fixed spine records must include 'net_worth'") - if config.unit_id_column not in result.columns: - result[config.unit_id_column] = np.arange(1, len(result) + 1) - if config.name_column not in result.columns: - result[config.name_column] = result[config.unit_id_column].astype(str) - if config.rank_column not in result.columns: - result[config.rank_column] = np.arange(1, len(result) + 1) - if config.unit_weight_column not in result.columns: - result[config.unit_weight_column] = config.default_unit_weight - if "state_fips" not in result.columns: - result["state_fips"] = None - return result - - -def _household_weights(tables: PolicyEngineUSEntityTableBundle) -> np.ndarray: - households = tables.households - if "household_weight" in households.columns: - return households["household_weight"].to_numpy(dtype=float, copy=False) - if "weight" in households.columns: - return households["weight"].to_numpy(dtype=float, copy=False) - raise ValueError("Fixed-spine households must include household_weight or weight") - - -def _normalize_state_fips(value: Any) -> int: - if value is None or pd.isna(value): - return 0 - if isinstance(value, str): - stripped = value.strip() - if not stripped: - return 0 - numeric = pd.to_numeric(pd.Series([stripped]), errors="coerce").iloc[0] - if pd.isna(numeric): - raise ValueError(f"Invalid state_fips value for Forbes spine: {value!r}") - return int(numeric) - return int(value) - - -def _numeric_or_default(value: Any, default: float) -> float: - numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] - if pd.isna(numeric): - return float(default) - return float(numeric) - - -def _optional_int(value: Any) -> int | None: - numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] - if pd.isna(numeric): - return None - return int(numeric) - - -def _sha256_file(path: str | Path | None) -> str | None: - if path is None: - return None - digest = hashlib.sha256() - with Path(path).open("rb") as file: - for 
chunk in iter(lambda: file.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def _with_person_weights( - tables: PolicyEngineUSEntityTableBundle, -) -> PolicyEngineUSEntityTableBundle: - if tables.persons is None or "weight" in tables.persons.columns: - return tables - household_weights = tables.households.set_index("household_id")["household_weight"] - persons = tables.persons.copy() - persons["weight"] = persons["household_id"].map(household_weights).astype(float) - return PolicyEngineUSEntityTableBundle( - households=tables.households, - persons=persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - families=tables.families, - marital_units=tables.marital_units, - ) - - -def _append_entity_table( - base: pd.DataFrame | None, - fixed: pd.DataFrame | None, - *, - id_column: str, -) -> pd.DataFrame | None: - if fixed is None: - return base - if base is None: - return fixed.copy() - if id_column in base.columns and id_column in fixed.columns: - overlap = set(base[id_column].dropna()) & set(fixed[id_column].dropna()) - if overlap: - sample = sorted(overlap)[:5] - raise ValueError( - f"Forbes fixed spine {id_column} values overlap existing table: {sample}" - ) - return pd.concat([base, fixed], ignore_index=True, sort=False) diff --git a/src/microplex_us/data_sources/psid.py b/src/microplex_us/data_sources/psid.py deleted file mode 100644 index 8276fef3..00000000 --- a/src/microplex_us/data_sources/psid.py +++ /dev/null @@ -1,548 +0,0 @@ -"""PSID (Panel Study of Income Dynamics) data source for microplex. - -Provides loading, processing, and transition rate extraction from PSID panel data. -PSID is the longest-running longitudinal household survey in the world (1968-present), -making it ideal for calibrating demographic transition models. - -Key features: -- Load PSID panel data via the `psid` Python package -- Extract empirical transition rates (marriage, divorce, etc.) 
-- Calibrate microplex transition models with PSID data -- Use as a source in MultiSourceFusion for coverage evaluation - -Example: - >>> from microplex.data_sources.psid import load_psid_panel, extract_transition_rates - >>> - >>> # Load PSID data - >>> dataset = load_psid_panel(data_dir="./psid_data") - >>> - >>> # Extract transition rates for model calibration - >>> transitions = psid.get_household_transitions(dataset.panel) - >>> rates = extract_transition_rates(transitions) - >>> - >>> # Calibrate marriage model - >>> from microplex.transitions import MarriageTransition ->>> marriage_rates = calibrate_marriage_rates(rates["marriage_by_age"]) ->>> model = MarriageTransition(base_rates={"male": 0.05, "female": 0.06}) -""" - -from __future__ import annotations - -from collections.abc import Callable -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceArchetype, - SourceDescriptor, - SourceQuery, - TimeStructure, - apply_source_query, -) - -# Variable mapping from PSID names to microplex conventions -PSID_TO_MICROPLEX_VARS = { - # Demographics - "age": "age", - "sex": "is_male", # Will need transformation (1=male in PSID) - "marital_status": "marital_status", - "education": "education", - "race": "race", - - # Income - "total_family_income": "total_income", - "head_labor_income": "head_labor_income", - "wife_labor_income": "spouse_labor_income", - - # Wealth - "total_wealth": "total_wealth", - - # Identifiers - "person_id": "person_id", - "interview_number": "household_id", - "year": "year", - "relationship": "relationship", -} - - -@dataclass -class PSIDDataset: - """Container for PSID panel data. 
- - Attributes: - persons: DataFrame with person-year observations - source: Data source identifier (path or "mock") - panel: Optional Panel object from psid package - """ - - persons: pd.DataFrame - source: str - panel: Any = None - - @property - def n_persons(self) -> int: - """Number of unique persons in dataset.""" - if "person_id" in self.persons.columns: - return self.persons["person_id"].nunique() - return 0 - - @property - def n_observations(self) -> int: - """Total number of person-year observations.""" - return len(self.persons) - - @property - def years(self) -> list[int]: - """List of years in the dataset.""" - if "year" in self.persons.columns: - return sorted(self.persons["year"].unique().tolist()) - return [] - - def summary(self) -> dict: - """Return summary statistics.""" - return { - "n_persons": self.n_persons, - "n_observations": self.n_observations, - "years": self.years, - "source": self.source, - } - - -def load_psid_panel( - data_dir: str | Path, - years: list[int] | None = None, - family_vars: dict[str, str] | None = None, - individual_vars: dict[str, str] | None = None, - sample: str | None = None, -) -> PSIDDataset: - """Load PSID panel data using the psid package. - - Args: - data_dir: Directory containing PSID data files - years: List of survey years to include (None = all available) - family_vars: Dict mapping variable names to crosswalk lookups - individual_vars: Dict mapping individual-level variables - sample: Sample type filter ("SRC", "SEO", "IMMIGRANT", or None for all) - - Returns: - PSIDDataset with loaded panel data - - Raises: - FileNotFoundError: If data_dir doesn't exist - ValueError: If psid package is not installed - """ - data_dir = Path(data_dir) - - if not data_dir.exists(): - raise FileNotFoundError(f"PSID data directory not found: {data_dir}") - - try: - import psid - except ImportError: - raise ValueError( - "psid package not installed. 
Install with: pip install psid" - ) - - # Build panel using psid package - panel = psid.build_panel( - data_dir=str(data_dir), - years=years, - family_vars=family_vars, - individual_vars=individual_vars, - sample=sample, - ) - - # Convert to DataFrame with microplex variable names - df = panel.data.copy() - - # Rename columns to microplex conventions - rename_map = {} - for psid_name, microplex_name in PSID_TO_MICROPLEX_VARS.items(): - if psid_name in df.columns: - rename_map[psid_name] = microplex_name - - df = df.rename(columns=rename_map) - - # Transform sex to is_male boolean - if "is_male" in df.columns: - df["is_male"] = df["is_male"] == 1 - - return PSIDDataset( - persons=df, - source=str(data_dir), - panel=panel, - ) - - -def extract_transition_rates( - transitions_df: pd.DataFrame, - transition_types: list[str] | None = None, -) -> dict[str, float]: - """Extract overall transition rates from PSID transition data. - - Args: - transitions_df: DataFrame from psid.get_household_transitions() - transition_types: Types to extract (None = all available) - - Returns: - Dict mapping transition type to annual probability - """ - if "type" not in transitions_df.columns: - raise ValueError("transitions_df must have 'type' column") - - total = len(transitions_df) - if total == 0: - return {} - - counts = transitions_df["type"].value_counts() - - if transition_types is None: - transition_types = counts.index.tolist() - - rates = {} - for t_type in transition_types: - if t_type in counts.index: - rates[t_type] = counts[t_type] / total - else: - rates[t_type] = 0.0 - - return rates - - -def get_age_specific_rates( - transitions_df: pd.DataFrame, - transition_type: str, - age_bins: list[tuple[int, int]], - age_col: str = "age_from", -) -> dict[tuple[int, int], float]: - """Extract age-specific transition rates. 
- - Args: - transitions_df: DataFrame from psid.get_household_transitions() - transition_type: Type of transition (e.g., "marriage", "divorce") - age_bins: List of (age_min, age_max) tuples - age_col: Column name for age - - Returns: - Dict mapping age range to rate - """ - if age_col not in transitions_df.columns: - return {} - - rates = {} - - for age_min, age_max in age_bins: - # Filter to age bin - mask = (transitions_df[age_col] >= age_min) & (transitions_df[age_col] <= age_max) - bin_data = transitions_df[mask] - - if len(bin_data) == 0: - rates[(age_min, age_max)] = 0.0 - continue - - # Count transitions of specified type - type_count = (bin_data["type"] == transition_type).sum() - rates[(age_min, age_max)] = type_count / len(bin_data) - - return rates - - -def calibrate_marriage_rates( - psid_rates: dict[tuple[int, int], float], - gender_adjustment: dict[str, float] | None = None, -) -> dict[tuple[int, int], float]: - """Convert PSID-derived rates to MarriageTransition format. - - Args: - psid_rates: Dict from get_age_specific_rates() for marriage - gender_adjustment: Optional {"male": factor, "female": factor} - - Returns: - Dict compatible with MarriageTransition base_rates - """ - # PSID rates are already in the right format: (age_min, age_max) -> rate - calibrated = {} - - for age_range, rate in psid_rates.items(): - # Ensure rate is a valid probability - calibrated[age_range] = float(np.clip(rate, 0.0, 1.0)) - - return calibrated - - -def calibrate_divorce_rates( - psid_rates: dict[tuple[int, int], float], -) -> dict[tuple[int, int], float]: - """Convert PSID-derived rates to DivorceTransition format. 
- - Args: - psid_rates: Dict from get_age_specific_rates() for divorce - - Returns: - Dict compatible with DivorceTransition age_effects - """ - # Same format as marriage rates - calibrated = {} - - for age_range, rate in psid_rates.items(): - calibrated[age_range] = float(np.clip(rate, 0.0, 1.0)) - - return calibrated - - -def create_psid_fusion_source( - dataset: PSIDDataset, - source_vars: list[str] | None = None, -) -> dict: - """Create configuration for adding PSID to MultiSourceFusion. - - Args: - dataset: PSIDDataset from load_psid_panel() - source_vars: Variables to include (None = common set) - - Returns: - Dict with parameters for fusion.add_source() - """ - if source_vars is None: - # Default to variables commonly available in PSID - source_vars = ["age", "total_income"] - - # Add others if present - optional = ["is_male", "marital_status", "education", "total_wealth"] - for var in optional: - if var in dataset.persons.columns: - source_vars.append(var) - - # Determine number of periods per person - if "year" in dataset.persons.columns and "person_id" in dataset.persons.columns: - periods_per_person = dataset.persons.groupby("person_id")["year"].nunique() - n_periods = int(periods_per_person.median()) - else: - n_periods = 1 - - return { - "name": "psid", - "data": dataset.persons, - "source_vars": source_vars, - "n_periods": n_periods, - "person_id_col": "person_id", - "period_col": "year", - } - - -def _sample_psid_households( - persons: pd.DataFrame, - *, - sample_n: int | None, - random_seed: int, -) -> pd.DataFrame: - """Sample linked households after selecting one survey year.""" - if sample_n is None: - return persons.reset_index(drop=True) - household_ids = persons["household_id"].drop_duplicates() - if sample_n >= len(household_ids): - return persons.reset_index(drop=True) - sampled_households = household_ids.sample( - n=sample_n, - random_state=random_seed, - replace=False, - ) - return 
persons[persons["household_id"].isin(set(sampled_households))].reset_index( - drop=True - ) - - -def _normalize_psid_cross_section( - persons: pd.DataFrame, - *, - survey_year: int, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Project PSID panel data to one cross-sectional frame.""" - cross_section = persons.copy() - if "year" in cross_section.columns: - cross_section = cross_section[cross_section["year"] == survey_year].copy() - if cross_section.empty: - raise ValueError(f"PSID data has no observations for survey year {survey_year}") - - raw_household = cross_section["household_id"].astype(str) - raw_person = cross_section["person_id"].astype(str) - cross_section["household_id"] = f"{survey_year}:" + raw_household - cross_section["person_id"] = f"{survey_year}:" + raw_person - cross_section["year"] = survey_year - - def numeric_series(column: str, default: float = 0.0) -> pd.Series: - if column in cross_section.columns: - return pd.to_numeric(cross_section[column], errors="coerce") - return pd.Series(default, index=cross_section.index, dtype=float) - - income = numeric_series("total_income", default=np.nan) - if income.isna().all(): - income = numeric_series("head_labor_income").fillna(0) + numeric_series( - "spouse_labor_income" - ).fillna(0) - cross_section["income"] = income.fillna(0.0).astype(float) - if "is_male" in cross_section.columns: - cross_section["sex"] = np.where(cross_section["is_male"], 1, 2) - else: - cross_section["sex"] = 0 - cross_section["education"] = numeric_series("education").fillna(0).astype(int) - cross_section["employment_status"] = ( - cross_section["income"].astype(float) > 0 - ).astype(int) - cross_section["age"] = numeric_series("age").fillna(0).astype(int) - cross_section["weight"] = numeric_series("weight", default=1.0).fillna(1.0) - - households = ( - cross_section.groupby("household_id", as_index=False) - .agg({"year": "first", "weight": "sum"}) - .rename(columns={"weight": "household_weight"}) - ) - households["state_fips"] = 
0 - households["tenure"] = 0 - - return households, cross_section - - -def _build_psid_observation_frame( - *, - households: pd.DataFrame, - persons: pd.DataFrame, - source_name: str, - shareability: Shareability, -) -> ObservationFrame: - """Build an observation frame from a PSID cross section.""" - descriptor = SourceDescriptor( - name=source_name, - shareability=shareability, - time_structure=TimeStructure.PANEL, - archetype=SourceArchetype.LONGITUDINAL_SOCIOECONOMIC, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=tuple( - column - for column in persons.columns - if column not in {"person_id", "household_id", "weight", "year"} - ), - weight_column="weight", - period_column="year", - ), - ), - ) - frame = ObservationFrame( - source=descriptor, - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - -@dataclass -class PSIDSourceProvider: - """Source-provider wrapper around PSID panel extracts.""" - - data_dir: str | Path - survey_year: int | None = None - years: list[int] | None = None - sample: str | None = None - shareability: Shareability = Shareability.RESTRICTED - loader: Callable[..., PSIDDataset] | None = None - _descriptor_cache: SourceDescriptor | None = None - - @property - def descriptor(self) -> SourceDescriptor: - if self._descriptor_cache is not None: - return self._descriptor_cache - return SourceDescriptor( - name="psid", - shareability=self.shareability, - time_structure=TimeStructure.PANEL, - 
archetype=SourceArchetype.LONGITUDINAL_SOCIOECONOMIC, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=("age", "sex", "education", "employment_status", "income"), - weight_column="weight", - period_column="year", - ), - ), - ) - - def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame: - query = query or SourceQuery() - provider_filters = query.provider_filters - loader = self.loader or load_psid_panel - dataset = loader( - data_dir=provider_filters.get("data_dir", self.data_dir), - years=provider_filters.get("years", self.years), - sample=provider_filters.get("sample", self.sample), - ) - persons = dataset.persons.copy() - available_years = sorted(persons["year"].dropna().astype(int).unique().tolist()) - survey_year = int( - provider_filters.get( - "survey_year", - query.period - if query.period is not None - else ( - self.survey_year if self.survey_year is not None else available_years[-1] - ), - ) - ) - households, persons = _normalize_psid_cross_section( - persons, - survey_year=survey_year, - ) - persons = _sample_psid_households( - persons, - sample_n=provider_filters.get("sample_n"), - random_seed=int(provider_filters.get("random_seed", 0)), - ) - households = households[ - households["household_id"].isin(set(persons["household_id"])) - ].reset_index(drop=True) - frame = _build_psid_observation_frame( - households=households, - persons=persons, - source_name=f"psid_{survey_year}", - shareability=self.shareability, - ) - self._descriptor_cache = frame.source - return apply_source_query(frame, query) diff --git a/src/microplex_us/data_sources/puf.py b/src/microplex_us/data_sources/puf.py deleted file mode 100644 index dc0f6ef1..00000000 --- a/src/microplex_us/data_sources/puf.py +++ /dev/null @@ 
-1,2919 +0,0 @@ -"""IRS Public Use File (PUF) loader, processing, and source-provider wrapper. - -Downloads PUF from HuggingFace, uprates 2015 → target year, -and maps to common variable schema for multi-survey fusion. -""" - -from __future__ import annotations - -import re -from collections.abc import Callable -from dataclasses import dataclass, field -from functools import cache, lru_cache -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceDescriptor, - SourceQuery, - TimeStructure, - apply_source_query, -) - -from microplex_us.data_sources.cps import load_cps_asec -from microplex_us.data_sources.share_imputation import ( - GroupedShareModel, - fit_grouped_share_model, - predict_grouped_component_shares, -) -from microplex_us.pipelines.pe_native_scores import ( - resolve_policyengine_us_data_repo_root, -) -from microplex_us.source_manifests import load_us_source_manifest -from microplex_us.source_registry import resolve_source_variable_capabilities -from microplex_us.variables import normalize_dividend_columns - -try: - from huggingface_hub import hf_hub_download - - HF_AVAILABLE = True -except ImportError: - HF_AVAILABLE = False - -PUF_VARIABLE_MAP = { - column_spec.raw_column: column_spec.canonical_name - for column_spec in load_us_source_manifest("puf") - .observation_for(EntityType.TAX_UNIT) - .columns -} - -PUF_UPRATING_MODE_INTERPOLATED = "interpolated" -PUF_UPRATING_MODE_PE_SOI = "pe_soi" - -PE_ITMDED_GROW_RATE = 0.02 -PE_PUF_SOI_END_YEAR = 2021 -PE_UPRATING_FACTOR_ALIASES = { - "weight": "household_weight", - "gross_social_security": "social_security", -} -PE_SOI_TO_PUF_STRAIGHT_RENAMES = { - "employment_income": "E00200", - "capital_gains_distributions": "E01100", - "taxable_interest_income": "E00300", - "exempt_interest": "E00400", - "ordinary_dividends": 
"E00600", - "qualified_dividends": "E00650", - "ira_distributions": "E01400", - "total_pension_income": "E01500", - "taxable_pension_income": "E01700", - "unemployment_compensation": "E02300", - "total_social_security": "E02400", - "taxable_social_security": "E02500", - "medical_expense_deductions_uncapped": "E17500", - "itemized_state_income_tax_deductions": "E18400", - "itemized_real_estate_tax_deductions": "E18500", - "interest_paid_deductions": "E19200", - "charitable_contributions_deductions": "E19800", -} -PE_SOI_TO_PUF_POS_ONLY_RENAMES = { - "business_net_profits": "E00900", - "capital_gains_gross": "E01000", - "partnership_and_s_corp_income": "E26270", -} -PE_SOI_TO_PUF_NEG_ONLY_RENAMES = { - "business_net_losses": "E00900", - "capital_gains_losses": "E01000", - "partnership_and_s_corp_losses": "E26270", -} -PUF_AGGREGATE_RECIDS = (999996, 999997, 999998, 999999) -PUF_SYNTHETIC_RECID_START = 1_000_000 -PUF_AGGREGATE_SCREENED_FIELDS = ( - "E00200", # Wages - "P23250", # Long-term capital gains - "P22250", # Short-term capital gains - "E00650", # Qualified dividends - "E00300", # Taxable interest - "E26270", # Partnership / S-corp - "E00900", # Business income - "E02100", # Farm income - "E00400", # Tax-exempt interest - "E00600", # Ordinary dividends -) -PUF_AGGREGATE_BUCKET_BOUNDS = { - 999996: (-np.inf, 0.0), - 999997: (0.0, 10_000_000.0), - 999998: (10_000_000.0, 100_000_000.0), - 999999: (100_000_000.0, np.inf), -} -PUF_AGGREGATE_STRUCTURAL_COLUMNS = ("MARS", "XTOT", "DSI", "EIC") -_PUF_AMOUNT_COLUMN_PATTERN = re.compile(r"^(?:[EPT]\d+|S\d{5})$") -_PUF_AGGREGATE_AGI_CAP_100M_PLUS = 1_250_000_000.0 -_PUF_AGGREGATE_MAX_AGI_DOMINANCE = 0.20 -_PUF_AGGREGATE_SELECTION_POWER = 24 -_PUF_NUMERIC_TOL = 1e-9 -PE_PUF_REMAINING_RAW_COLUMNS = ( - "E03500", - "E00800", - "E20500", - "E32800", - "E20100", - "E03240", - "E03400", - "E03220", - "E26390", - "E26400", - "T27800", - "E27200", - "E03290", - "P23250", - "E24518", - "E20400", - "E26270", - "E03230", - 
"E25850", - "E25860", - "E00900", - "E03270", - "E03300", - "P22250", - "E03210", - "E03150", - "E24515", - "E07300", - "E62900", - "E01200", - "E00700", - "E58990", - "E07400", - "E07600", - "E11200", - "E87521", - "E07260", - "E09900", - "P08000", - "E07240", - "E09700", - "E09800", -) - -# SOI growth factors for uprating 2015 → 2024 -# Based on IRS SOI aggregate growth rates -# These should be updated with actual SOI data -UPRATING_FACTORS = { - "employment_income": 1.45, # ~4.5% annual wage growth - "self_employment_income": 1.35, - "farm_income": 1.20, - "taxable_interest_income": 2.50, # Interest rates rose significantly - "tax_exempt_interest_income": 1.80, - "ordinary_dividend_income": 1.60, - "qualified_dividend_income": 1.60, - "short_term_capital_gains": 1.80, - "long_term_capital_gains": 2.20, # Stock market growth - "non_sch_d_capital_gains": 1.80, - "partnership_s_corp_income": 1.50, - "rental_income_positive": 1.40, - "rental_income_negative": 1.40, - "ira_distributions": 1.60, - "total_pension_income": 1.40, - "taxable_pension_income": 1.40, - "gross_social_security": 1.45, - "taxable_social_security": 1.45, - "unemployment_compensation": 0.30, # Down from COVID peak - "alimony_income": 0.50, # Declining due to tax law change - "medical_expense_agi_floor": 1.50, - "state_income_tax_paid": 1.40, - "real_estate_tax_paid": 1.35, - "mortgage_interest_paid": 1.30, - "charitable_cash": 1.40, - "charitable_noncash": 1.40, - "student_loan_interest": 1.20, -} - -MINIMUM_SOCIAL_SECURITY_RETIREMENT_AGE = 62 -SOCIAL_SECURITY_SHARE_AGE_BINS = (-np.inf, 18.0, 30.0, 45.0, 62.0, 75.0, np.inf) -SOCIAL_SECURITY_SHARE_AGE_LABELS = ( - "under_18", - "18_to_29", - "30_to_44", - "45_to_61", - "62_to_74", - "75_plus", -) -SOCIAL_SECURITY_SHARE_EXPLICIT_COMPONENTS = ( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", -) -SOCIAL_SECURITY_SHARE_IMPLICIT_COMPONENT = "social_security_dependents" -SOCIAL_SECURITY_SHARE_COMPONENTS = ( 
- *SOCIAL_SECURITY_SHARE_EXPLICIT_COMPONENTS, - SOCIAL_SECURITY_SHARE_IMPLICIT_COMPONENT, -) -SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE = "grouped_share" -SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF = "pe_qrf" -SOCIAL_SECURITY_SPLIT_STRATEGY_AGE_HEURISTIC = "age_heuristic" -PE_STYLE_SOCIAL_SECURITY_QRF_PREDICTORS = ( - "age", - "is_male", - "tax_unit_is_joint", - "is_tax_unit_head", - "is_tax_unit_dependent", -) -MIN_PE_STYLE_SOCIAL_SECURITY_QRF_TRAINING_RECORDS = 100 -PE_PUF_PERSON_EXPANSION_RANDOM_SEED = 64 -QBI_PUF_RANDOM_SEED = 64 -QBI_SIMULATION_RANDOM_SEED = 42 - -QBI_QUALIFICATION_PROBABILITIES = { - "self_employment_income": 0.8, - "farm_operations_income": 0.95, - "farm_rent_income": 0.5, - "rental_income": 0.4, - "estate_income": 0.5, - "partnership_s_corp_income": 0.85, -} -QBI_SSTB_PROBABILITY_BY_RAW_COLUMN = { - "E00900": 0.20, - "E26270": 0.15, - "E26390": 0.10, - "E26400": 0.10, -} -QBI_REIT_PTP_PROBABILITY = 0.07 -QBI_REIT_PTP_LOG_NORMAL_MU = 8.04 -QBI_REIT_PTP_LOG_NORMAL_SIGMA = 1.20 -QBI_BDC_PROBABILITY = 0.003 -QBI_BDC_LOG_NORMAL_MU = 8.71 -QBI_BDC_LOG_NORMAL_SIGMA = 1.00 -QBI_PROFIT_MARGIN_BETA_A = 2.0 -QBI_PROFIT_MARGIN_BETA_B = 3.0 -QBI_PROFIT_MARGIN_SCALE = 0.20 -QBI_PROFIT_MARGIN_SHIFT = 0.05 -QBI_HAS_EMPLOYEES_LOGIT_INTERCEPT = -3.1 -QBI_HAS_EMPLOYEES_LOGIT_SLOPE_PER_DOLLAR = 1.2e-6 -QBI_RENTAL_LABOR_RATIO_BETA_A = 1.5 -QBI_RENTAL_LABOR_RATIO_BETA_B = 8.0 -QBI_RENTAL_LABOR_RATIO_SCALE = 0.08 -QBI_NON_RENTAL_LABOR_RATIO_BETA_A = 2.0 -QBI_NON_RENTAL_LABOR_RATIO_BETA_B = 2.0 -QBI_NON_RENTAL_LABOR_RATIO_SCALE = 0.25 -QBI_DEPRECIATION_PROXY_SIGMA = 0.8 -QBI_UBIA_MULTIPLE_OF_QBI = 4.0 -QBI_UBIA_SIGMA = 1.0 -QBI_QUALIFICATION_COLUMNS = tuple( - f"{variable}_would_be_qualified" for variable in QBI_QUALIFICATION_PROBABILITIES -) -QBI_BOOLEAN_COLUMNS = ( - "business_is_sstb", - "sstb_self_employment_income_would_be_qualified", - *QBI_QUALIFICATION_COLUMNS, -) - -JOINT_HEAD_SHARE_ALLOCATION = { - "employment_income": 0.6, - "self_employment_income": 0.6, 
- "sstb_self_employment_income": 0.6, - "sstb_self_employment_income_before_lsr": 0.6, -} - -JOINT_EQUAL_SHARE_ALLOCATION = ( - "farm_income", - "taxable_interest_income", - "tax_exempt_interest_income", - "ordinary_dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "short_term_capital_gains", - "long_term_capital_gains", - "non_sch_d_capital_gains", - "partnership_s_corp_income", - "rental_income", - "ira_distributions", - "total_pension_income", - "taxable_pension_income", - "gross_social_security", - "taxable_social_security", - "unemployment_compensation", - "alimony_income", - "medical_expense_agi_floor", - "state_income_tax_paid", - "real_estate_tax_paid", - "mortgage_interest_paid", - "charitable_cash", - "charitable_noncash", - "ira_deduction", - "student_loan_interest", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "sstb_w2_wages_from_qualified_business", - "sstb_unadjusted_basis_qualified_property", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", -) - -PUF_DEMOGRAPHIC_HELPER_COLUMNS = ( - "_puf_recid", - "_puf_agerange", - "_puf_earnsplit", - "_puf_gender", - "_puf_agedp1", - "_puf_agedp2", - "_puf_agedp3", -) - -PUF_PERSON_EXPANSION_PRESERVE_COLUMNS = { - "weight", - "household_weight", - "year", - "household_id", - "state_fips", - "tenure", - "filing_status", - "filing_status_code", - "exemptions_count", - "eitc_children", - "ctc_children", - "age", - "is_male", - "employment_status", - "income", - "interest_income", - "dividend_income", - "capital_gains", - "pension_income", - "social_security", - "social_security_retirement", - *QBI_BOOLEAN_COLUMNS, - *PUF_DEMOGRAPHIC_HELPER_COLUMNS, -} - -MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS = { - "health_insurance_premiums_without_medicare_part_b": 0.453, - "other_medical_expenses": 0.325, - "medicare_part_b_premiums": 0.137, - "over_the_counter_health_expenses": 0.085, -} - - -@dataclass(frozen=True) -class PEStyleQRFShareModel: - 
"""PE-style QRF share model for PUF Social Security components.""" - - predictors: tuple[str, ...] - component_columns: tuple[str, ...] - share_prediction_columns: tuple[str, ...] - fitted_model: Any - - -SocialSecurityShareModel = GroupedShareModel | PEStyleQRFShareModel - - -@dataclass(frozen=True) -class PEStyleQRFImputationModel: - """PE-style QRF imputation model for direct PUF variable imputation.""" - - predictors: tuple[str, ...] - imputed_variable: str - fitted_model: Any - - -PUF_DEMOGRAPHIC_VARIABLES = ( - "AGEDP1", - "AGEDP2", - "AGEDP3", - "AGERANGE", - "EARNSPLIT", - "GENDER", -) - -PUF_DEMOGRAPHIC_PREDICTORS = ( - "E00200", - "MARS", - "DSI", - "EIC", - "XTOT", -) - - -def download_puf(cache_dir: Path | None = None) -> Path: - """Download PUF from HuggingFace. - - Returns path to downloaded CSV file. - """ - if cache_dir is None: - cache_dir = Path.home() / ".cache" / "microplex" - cache_dir.mkdir(parents=True, exist_ok=True) - puf_path = cache_dir / "puf_2015.csv" - demo_path = cache_dir / "demographics_2015.csv" - - # Prefer an already-present local copy over any remote resolution. 
- if puf_path.exists(): - return puf_path, demo_path - - if not HF_AVAILABLE: - raise ImportError("huggingface_hub required: pip install huggingface_hub") - - # Download PUF 2015 - puf_path = hf_hub_download( - repo_id="policyengine/irs-soi-puf", - filename="puf_2015.csv", - repo_type="model", - local_dir=cache_dir, - ) - - # Download demographics file - demo_path = hf_hub_download( - repo_id="policyengine/irs-soi-puf", - filename="demographics_2015.csv", - repo_type="model", - local_dir=cache_dir, - ) - - return Path(puf_path), Path(demo_path) - - -def _resolve_policyengine_repo_local_puf_paths( - policyengine_us_data_repo: str | Path | None, -) -> tuple[Path, Path | None] | None: - """Resolve raw PUF CSVs from a local policyengine-us-data checkout.""" - - if policyengine_us_data_repo is None: - return None - try: - repo_root = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - except FileNotFoundError: - return None - candidate_dirs = ( - repo_root / "policyengine_us_data" / "storage", - repo_root / "data" / "raw", - ) - for candidate_dir in candidate_dirs: - puf_path = candidate_dir / "puf_2015.csv" - demographics_path = candidate_dir / "demographics_2015.csv" - if puf_path.exists(): - return puf_path, (demographics_path if demographics_path.exists() else None) - return None - - -def _puf_aggregate_amount_columns(columns: pd.Index | list[str]) -> list[str]: - return [column for column in columns if _PUF_AMOUNT_COLUMN_PATTERN.match(column)] - - -def _puf_aggregate_bucket_mask(df: pd.DataFrame, recid: int) -> pd.Series: - if "E00100" not in df.columns: - return pd.Series(True, index=df.index) - agi = pd.to_numeric(df["E00100"], errors="coerce").fillna(0.0) - lower, upper = PUF_AGGREGATE_BUCKET_BOUNDS.get(recid, (-np.inf, np.inf)) - return agi.ge(lower) & agi.lt(upper) - - -def _puf_aggregate_eligibility_scores( - df: pd.DataFrame, - reference: pd.DataFrame | None = None, -) -> pd.Series: - reference_frame = df if reference is None else reference - 
present_fields = [ - field - for field in PUF_AGGREGATE_SCREENED_FIELDS - if field in df.columns and field in reference_frame.columns - ] - if not present_fields: - return pd.Series(0.0, index=df.index, dtype=float) - - scores = np.zeros(len(df), dtype=float) - for raw_field in present_fields: - values = pd.to_numeric(df[raw_field], errors="coerce").fillna(0.0) - reference_values = pd.to_numeric( - reference_frame[raw_field], errors="coerce" - ).fillna(0.0) - field_scores = np.zeros(len(df), dtype=float) - - positive_mask = values.gt(0) - reference_positive = np.sort( - reference_values[reference_values.gt(0)].to_numpy() - ) - if bool(positive_mask.any()) and len(reference_positive) > 0: - field_scores[positive_mask.to_numpy()] = np.searchsorted( - reference_positive, - values[positive_mask].to_numpy(), - side="right", - ) / len(reference_positive) - - negative_mask = values.lt(0) - reference_negative = np.sort( - (-reference_values[reference_values.lt(0)]).to_numpy() - ) - if bool(negative_mask.any()) and len(reference_negative) > 0: - negative_scores = np.searchsorted( - reference_negative, - (-values[negative_mask]).to_numpy(), - side="right", - ) / len(reference_negative) - field_scores[negative_mask.to_numpy()] = np.maximum( - field_scores[negative_mask.to_numpy()], - negative_scores, - ) - - scores = np.maximum(scores, field_scores) - return pd.Series(scores, index=df.index, dtype=float) - - -def _choose_puf_aggregate_synthetic_count(pop_weight: float) -> int: - total_weight = max(1, int(round(pop_weight))) - target_count = max(20, round(pop_weight / 10)) - return int(min(40, total_weight, target_count)) - - -def _assign_puf_aggregate_weights( - pop_weight: float, - n_records: int, - rng: np.random.Generator, -) -> np.ndarray: - total_weight = max(1, int(round(pop_weight))) - n_records = max(1, min(int(n_records), total_weight)) - weights = np.ones(n_records, dtype=int) - remainder = total_weight - n_records - if remainder > 0: - base_extra = remainder // 
n_records - weights += base_extra - leftover = remainder - base_extra * n_records - if leftover: - weights[rng.choice(n_records, size=leftover, replace=False)] += 1 - return weights - - -def _project_puf_weighted_sum_to_bounds( - values: np.ndarray, - weights: np.ndarray, - target_total: float, - lower: np.ndarray, - upper: np.ndarray, - max_iter: int = 50, -) -> np.ndarray: - projected = np.clip(values.astype(float), lower, upper) - - for _ in range(max_iter): - residual = float(target_total - np.dot(projected, weights)) - if abs(residual) <= 1e-6: - return projected - - slack = upper - projected if residual > 0 else projected - lower - free = slack > _PUF_NUMERIC_TOL - if not bool(free.any()): - break - - basis = np.abs(projected[free]) - if basis.sum() <= _PUF_NUMERIC_TOL: - basis = np.ones(free.sum(), dtype=float) - denom = float(np.dot(weights[free], basis)) - if denom <= _PUF_NUMERIC_TOL: - basis = np.ones(free.sum(), dtype=float) - denom = float(weights[free].sum()) - - delta = residual * basis / denom - if residual > 0: - delta = np.minimum(delta, slack[free]) - else: - delta = -np.minimum(-delta, slack[free]) - projected[free] += delta - projected = np.clip(projected, lower, upper) - - return projected - - -def _allocate_puf_weighted_values( - base_values: np.ndarray, - weights: np.ndarray, - target_total: float, - lower: np.ndarray | float | None = None, - upper: np.ndarray | float | None = None, -) -> np.ndarray: - base_values = np.asarray(base_values, dtype=float) - weights = np.asarray(weights, dtype=float) - n = len(base_values) - if abs(target_total) <= 1e-6: - return np.zeros(n, dtype=float) - - if target_total > 0 and np.any(base_values > 0): - active = base_values > 0 - elif target_total < 0 and np.any(base_values < 0): - active = base_values < 0 - elif np.any(np.abs(base_values) > _PUF_NUMERIC_TOL): - active = np.abs(base_values) > _PUF_NUMERIC_TOL - else: - active = np.ones(n, dtype=bool) - - allocated = np.zeros(n, dtype=float) - magnitudes = 
np.abs(base_values[active]) - if magnitudes.sum() <= _PUF_NUMERIC_TOL: - magnitudes = np.ones(active.sum(), dtype=float) - denom = float(np.dot(weights[active], magnitudes)) - if denom <= _PUF_NUMERIC_TOL: - magnitudes = np.ones(active.sum(), dtype=float) - denom = float(weights[active].sum()) - - allocated[active] = np.sign(target_total) * magnitudes * abs(target_total) / denom - if lower is None and upper is None: - return allocated - - if lower is None: - lower_array = np.full(n, -np.inf, dtype=float) - elif np.isscalar(lower): - lower_array = np.full(n, float(lower), dtype=float) - else: - lower_array = np.asarray(lower, dtype=float) - - if upper is None: - upper_array = np.full(n, np.inf, dtype=float) - elif np.isscalar(upper): - upper_array = np.full(n, float(upper), dtype=float) - else: - upper_array = np.asarray(upper, dtype=float) - - return _project_puf_weighted_sum_to_bounds( - allocated, - weights, - target_total, - lower_array, - upper_array, - ) - - -def _allocate_puf_aggregate_agi( - donor_agi: np.ndarray, - weights: np.ndarray, - recid: int, - target_total: float, -) -> np.ndarray: - donor_agi = np.asarray(donor_agi, dtype=float) - weights = np.asarray(weights, dtype=float) - dominance_cap = _PUF_AGGREGATE_MAX_AGI_DOMINANCE * abs(target_total) / weights - n = len(donor_agi) - - if recid == 999996: - lower = -dominance_cap - upper = np.zeros(n, dtype=float) - else: - bucket_lower, bucket_upper = PUF_AGGREGATE_BUCKET_BOUNDS[recid] - if np.isinf(bucket_upper): - bucket_upper = _PUF_AGGREGATE_AGI_CAP_100M_PLUS - lower = np.full(n, max(float(bucket_lower), 0.0), dtype=float) - upper = np.minimum(np.full(n, float(bucket_upper), dtype=float), dominance_cap) - - return _allocate_puf_weighted_values( - base_values=np.abs(donor_agi), - weights=weights, - target_total=target_total, - lower=lower, - upper=upper, - ) - - -def _sample_puf_aggregate_donors( - donor_bucket: pd.DataFrame, - donor_scores: pd.Series, - target_mean_agi: float, - n_records: int, - rng: 
np.random.Generator, -) -> pd.DataFrame: - scores = donor_scores.loc[donor_bucket.index].to_numpy(dtype=float) - score_mass = np.clip(scores, 1e-6, None) ** _PUF_AGGREGATE_SELECTION_POWER - donor_abs_agi = np.abs(donor_bucket["E00100"].to_numpy(dtype=float)) - target_abs_agi = max(abs(float(target_mean_agi)), 1.0) - agi_distance = np.abs(np.log1p(donor_abs_agi) - np.log1p(target_abs_agi)) - probabilities = score_mass * np.sqrt(1.0 / (1.0 + agi_distance)) - if not np.isfinite(probabilities).all() or probabilities.sum() <= 0: - probabilities = np.ones(len(donor_bucket), dtype=float) - probabilities = probabilities / probabilities.sum() - - selected_index = rng.choice( - donor_bucket.index.to_numpy(), - size=n_records, - replace=len(donor_bucket) < n_records, - p=probabilities, - ) - return donor_bucket.loc[selected_index].reset_index(drop=True).copy() - - -def _disaggregate_puf_aggregate_bucket( - recid: int, - row: pd.Series, - regular: pd.DataFrame, - amount_columns: list[str], - donor_scores: pd.Series, - next_recid: int, - rng: np.random.Generator, -) -> pd.DataFrame: - pop_weight = float(row["S006"]) / 100.0 - target_mean_agi = float(row["E00100"]) - target_total_agi = pop_weight * target_mean_agi - donor_bucket = regular[_puf_aggregate_bucket_mask(regular, recid)].copy() - if donor_bucket.empty: - donor_bucket = regular.copy() - - total_weight = max(1, int(round(pop_weight))) - n_records = min(_choose_puf_aggregate_synthetic_count(pop_weight), total_weight) - synthetic_weights = _assign_puf_aggregate_weights( - pop_weight, n_records, rng - ).astype(float) - selected = _sample_puf_aggregate_donors( - donor_bucket=donor_bucket, - donor_scores=donor_scores, - target_mean_agi=target_mean_agi, - n_records=n_records, - rng=rng, - ) - selected = selected.astype( - {column: float for column in amount_columns if column in selected.columns} - ) - - synthetic = selected.copy() - synthetic["RECID"] = np.arange(next_recid, next_recid + n_records, dtype=int) - 
synthetic["S006"] = (synthetic_weights.astype(int) * 100).astype(int) - - for column in PUF_AGGREGATE_STRUCTURAL_COLUMNS: - if column in synthetic.columns: - synthetic[column] = selected[column].round().astype(int) - if "MARS" in synthetic.columns and "XTOT" in synthetic.columns: - joint_mask = synthetic["MARS"] == 2 - synthetic.loc[joint_mask, "XTOT"] = np.maximum( - synthetic.loc[joint_mask, "XTOT"], - 2, - ) - synthetic["XTOT"] = synthetic["XTOT"].clip(lower=0, upper=5).astype(int) - - synthetic["E00100"] = _allocate_puf_aggregate_agi( - donor_agi=selected["E00100"].to_numpy(dtype=float), - weights=synthetic_weights, - recid=recid, - target_total=target_total_agi, - ) - for column in amount_columns: - if column == "E00100": - continue - target_value = float(row.get(column, 0.0)) - if not np.isfinite(target_value): - target_value = 0.0 - synthetic[column] = _allocate_puf_weighted_values( - base_values=selected[column].to_numpy(dtype=float), - weights=synthetic_weights, - target_total=pop_weight * target_value, - ) - return synthetic - - -def disaggregate_puf_aggregate_records( - puf: pd.DataFrame, - *, - seed: int = 42, -) -> pd.DataFrame: - """Replace IRS aggregate PUF rows with calibrated synthetic donor records.""" - if not {"RECID", "MARS", "S006", "E00100"}.issubset(puf.columns): - return puf - - recid = pd.to_numeric(puf["RECID"], errors="coerce").astype("Int64") - aggregate_mask = recid.isin(PUF_AGGREGATE_RECIDS) | pd.to_numeric( - puf["MARS"], errors="coerce" - ).eq(0) - if not bool(aggregate_mask.any()): - return puf - - regular = puf.loc[~aggregate_mask].copy() - if regular.empty: - return puf.loc[~pd.to_numeric(puf["MARS"], errors="coerce").eq(0)].copy() - - aggregate_rows = puf.loc[aggregate_mask].copy() - amount_columns = _puf_aggregate_amount_columns(puf.columns) - donor_scores = _puf_aggregate_eligibility_scores(regular) - rng = np.random.default_rng(seed) - next_recid = PUF_SYNTHETIC_RECID_START - synthetic_rows: list[pd.DataFrame] = [] - - for _, 
row in aggregate_rows.sort_values("RECID").iterrows(): - row_recid = int(row["RECID"]) - if row_recid not in PUF_AGGREGATE_BUCKET_BOUNDS: - continue - synthetic = _disaggregate_puf_aggregate_bucket( - recid=row_recid, - row=row, - regular=regular, - amount_columns=amount_columns, - donor_scores=donor_scores, - next_recid=next_recid, - rng=rng, - ) - next_recid += len(synthetic) - synthetic_rows.append(synthetic[puf.columns]) - - if not synthetic_rows: - return regular.reset_index(drop=True) - - synthetic_frame = pd.concat(synthetic_rows, ignore_index=True) - print( - f"Disaggregated {int(aggregate_mask.sum())} aggregate PUF records into " - f"{len(synthetic_frame)} synthetic records" - ) - return pd.concat([regular, synthetic_frame], ignore_index=True) - - -def load_puf_raw(puf_path: Path, demographics_path: Path | None = None) -> pd.DataFrame: - """Load raw PUF data from CSV.""" - print(f"Loading PUF from {puf_path}...") - puf = pd.read_csv(puf_path) - - puf = disaggregate_puf_aggregate_records(puf) - - print(f" Raw records: {len(puf):,}") - - # Load and merge demographics if available - if demographics_path and demographics_path.exists(): - print(f"Loading demographics from {demographics_path}...") - demo = pd.read_csv(demographics_path) - - # Demographics file has RECID to match - if "RECID" in puf.columns and "RECID" in demo.columns: - puf = puf.merge(demo, on="RECID", how="left", suffixes=("", "_demo")) - print(f" After demographics merge: {len(puf):,}") - puf = _impute_missing_puf_demographics(puf) - - return puf - - -def _normalize_puf_uprating_mode(mode: str | None) -> str: - resolved = (mode or PUF_UPRATING_MODE_INTERPOLATED).strip().lower() - allowed = { - PUF_UPRATING_MODE_INTERPOLATED, - PUF_UPRATING_MODE_PE_SOI, - } - if resolved not in allowed: - raise ValueError( - f"puf uprating mode must be one of {sorted(allowed)}; got {mode!r}" - ) - return resolved - - -def _resolve_pe_soi_path( - *, - policyengine_us_data_repo: str | Path | None = None, - 
soi_path: str | Path | None = None, -) -> Path: - if soi_path is not None: - resolved = Path(soi_path) - elif policyengine_us_data_repo is not None: - resolved = ( - Path(policyengine_us_data_repo) - / "policyengine_us_data" - / "storage" - / "soi.csv" - ) - else: - raise ValueError( - "PE SOI uprating requires soi_path or policyengine_us_data_repo" - ) - if not resolved.exists(): - raise FileNotFoundError(f"Could not find PE SOI file at {resolved}") - return resolved - - -def _resolve_pe_uprating_factors_path( - *, - policyengine_us_data_repo: str | Path | None = None, -) -> Path: - if policyengine_us_data_repo is None: - raise ValueError("PE forward uprating requires policyengine_us_data_repo") - resolved = ( - Path(policyengine_us_data_repo) - / "policyengine_us_data" - / "storage" - / "uprating_factors.csv" - ) - if not resolved.exists(): - raise FileNotFoundError(f"Could not find PE uprating factors at {resolved}") - return resolved - - -@cache -def _load_pe_soi_table(soi_path: str) -> pd.DataFrame: - return pd.read_csv(soi_path) - - -@cache -def _load_pe_uprating_factors_table(uprating_factors_path: str) -> pd.DataFrame: - return pd.read_csv(uprating_factors_path) - - -def _get_pe_soi_aggregate( - soi_table: pd.DataFrame, - variable: str, - year: int, - *, - is_count: bool, -) -> float: - lookup_variable = ( - "count" if variable == "adjusted_gross_income" and is_count else variable - ) - rows = soi_table[ - (soi_table["Variable"] == lookup_variable) - & (soi_table["Year"] == year) - & (soi_table["Filing status"] == "All") - & (soi_table["AGI lower bound"] == -np.inf) - & (soi_table["AGI upper bound"] == np.inf) - & (soi_table["Count"] == is_count) - & (~soi_table["Taxable only"]) - ] - if rows.empty: - raise ValueError( - f"Missing SOI aggregate for variable={lookup_variable!r}, year={year}, is_count={is_count}" - ) - return float(rows.iloc[0]["Value"]) - - -def _get_pe_soi_growth( - soi_table: pd.DataFrame, - variable: str, - from_year: int, - to_year: int, 
-) -> float: - start_value = _get_pe_soi_aggregate( - soi_table, - variable, - from_year, - is_count=False, - ) - end_value = _get_pe_soi_aggregate( - soi_table, - variable, - to_year, - is_count=False, - ) - start_population = _get_pe_soi_aggregate( - soi_table, - "count", - from_year, - is_count=True, - ) - end_population = _get_pe_soi_aggregate( - soi_table, - "count", - to_year, - is_count=True, - ) - return (end_value / start_value) / (end_population / start_population) - - -def uprate_raw_puf_pe_style( - puf: pd.DataFrame, - *, - from_year: int = 2015, - to_year: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - soi_path: str | Path | None = None, -) -> pd.DataFrame: - """Uprate raw PUF columns using the PE SOI growth contract.""" - if from_year == to_year: - return puf.copy() - resolved_soi_path = _resolve_pe_soi_path( - policyengine_us_data_repo=policyengine_us_data_repo, - soi_path=soi_path, - ) - soi_table = _load_pe_soi_table(str(resolved_soi_path.resolve())) - result = puf.copy() - - for variable, puf_column in PE_SOI_TO_PUF_STRAIGHT_RENAMES.items(): - if puf_column not in result.columns: - continue - growth = _get_pe_soi_growth(soi_table, variable, from_year, to_year) - if variable in { - "medical_expense_deductions_uncapped", - "itemized_state_income_tax_deductions", - "itemized_real_estate_tax_deductions", - "interest_paid_deductions", - "charitable_contributions_deductions", - }: - growth = (1.0 + PE_ITMDED_GROW_RATE) ** (to_year - from_year) - values = pd.to_numeric(result[puf_column], errors="coerce").fillna(0.0) - result[puf_column] = values * growth - - for variable, puf_column in PE_SOI_TO_PUF_POS_ONLY_RENAMES.items(): - if puf_column not in result.columns: - continue - growth = _get_pe_soi_growth(soi_table, variable, from_year, to_year) - values = pd.to_numeric(result[puf_column], errors="coerce").fillna(0.0) - result[puf_column] = values.where(values <= 0.0, values * growth) - - for variable, puf_column in 
PE_SOI_TO_PUF_NEG_ONLY_RENAMES.items(): - if puf_column not in result.columns: - continue - growth = _get_pe_soi_growth(soi_table, variable, from_year, to_year) - values = pd.to_numeric(result[puf_column], errors="coerce").fillna(0.0) - result[puf_column] = values.where(values >= 0.0, values * growth) - - agi_growth = _get_pe_soi_growth( - soi_table, - "adjusted_gross_income", - from_year, - to_year, - ) - for puf_column in PE_PUF_REMAINING_RAW_COLUMNS: - if puf_column not in result.columns: - continue - values = pd.to_numeric(result[puf_column], errors="coerce").fillna(0.0) - result[puf_column] = values * agi_growth - - if "S006" in result.columns: - returns_start = _get_pe_soi_aggregate( - soi_table, - "count", - from_year, - is_count=True, - ) - returns_end = _get_pe_soi_aggregate( - soi_table, - "count", - to_year, - is_count=True, - ) - weights = pd.to_numeric(result["S006"], errors="coerce").fillna(0.0) - result["S006"] = weights * (returns_end / returns_start) - - return result - - -def uprate_mapped_puf_with_pe_factors( - puf: pd.DataFrame, - *, - from_year: int = PE_PUF_SOI_END_YEAR, - to_year: int = 2024, - policyengine_us_data_repo: str | Path | None = None, -) -> pd.DataFrame: - """Uprate mapped PUF variables using PE's forward factor table.""" - if to_year <= from_year: - return puf.copy() - uprating_factors_path = _resolve_pe_uprating_factors_path( - policyengine_us_data_repo=policyengine_us_data_repo, - ) - factors = _load_pe_uprating_factors_table(str(uprating_factors_path.resolve())) - start_column = str(from_year) - end_column = str(to_year) - if start_column not in factors.columns or end_column not in factors.columns: - raise ValueError(f"PE uprating factors do not cover {from_year} -> {to_year}") - factor_lookup = factors.set_index("Variable") - result = puf.copy() - for column in result.columns: - factor_variable = PE_UPRATING_FACTOR_ALIASES.get(column, column) - if factor_variable not in factor_lookup.index: - continue - start_value = 
float(factor_lookup.at[factor_variable, start_column]) - end_value = float(factor_lookup.at[factor_variable, end_column]) - growth = end_value / start_value - values = pd.to_numeric(result[column], errors="coerce").fillna(0.0) - result[column] = values * growth - if { - "qualified_dividend_income", - "non_qualified_dividend_income", - }.issubset(result.columns): - result["ordinary_dividend_income"] = result["qualified_dividend_income"].fillna( - 0.0 - ) + result["non_qualified_dividend_income"].fillna(0.0) - if { - "taxable_pension_income", - "tax_exempt_pension_income", - }.issubset(result.columns): - result["total_pension_income"] = result["taxable_pension_income"].fillna( - 0.0 - ) + result["tax_exempt_pension_income"].fillna(0.0) - return result - - -def _impute_missing_puf_demographics(puf: pd.DataFrame) -> pd.DataFrame: - if not set(PUF_DEMOGRAPHIC_VARIABLES).issubset(puf.columns): - return puf - - missing_mask = puf.loc[:, PUF_DEMOGRAPHIC_VARIABLES].isna().all(axis=1) - if not bool(missing_mask.any()): - return puf - - observed_mask = ~missing_mask - if int(observed_mask.sum()) < 100: - return puf - - try: - from microimpute.models.qrf import QRF - except ImportError: - return puf - - train = ( - puf.loc[ - observed_mask, [*PUF_DEMOGRAPHIC_PREDICTORS, *PUF_DEMOGRAPHIC_VARIABLES] - ] - .copy() - .fillna(0) - ) - if len(train) > 10_000: - train = train.sample(n=10_000, random_state=0) - - qrf = QRF(log_level="WARNING", memory_efficient=True) - fitted_model = qrf.fit( - X_train=train, - predictors=list(PUF_DEMOGRAPHIC_PREDICTORS), - imputed_variables=list(PUF_DEMOGRAPHIC_VARIABLES), - n_jobs=1, - ) - - predicted = fitted_model.predict( - X_test=puf.loc[missing_mask, list(PUF_DEMOGRAPHIC_PREDICTORS)].copy().fillna(0) - ) - - result = puf.copy() - bounds = { - "AGEDP1": (0, 7), - "AGEDP2": (0, 7), - "AGEDP3": (0, 7), - "AGERANGE": (0, 7), - "EARNSPLIT": (0, 4), - "GENDER": (1, 2), - } - for column in PUF_DEMOGRAPHIC_VARIABLES: - if column not in 
predicted.columns: - continue - values = pd.to_numeric(predicted[column], errors="coerce").fillna(0.0) - lower, upper = bounds[column] - values = values.round().clip(lower=lower, upper=upper) - result.loc[missing_mask, column] = values.to_numpy() - return result - - -def map_puf_variables( - puf: pd.DataFrame, - *, - random_seed: int = 42, - impute_pre_tax_contributions: bool = False, - pre_tax_contribution_model: PEStyleQRFImputationModel | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - pre_tax_training_year: int = 2024, - require_pre_tax_contribution_model: bool = False, -) -> pd.DataFrame: - """Map PUF variable codes to common names.""" - result = pd.DataFrame(index=puf.index) - manifest = load_us_source_manifest("puf") - observation = manifest.observation_for(EntityType.TAX_UNIT) - - for column_spec in observation.columns: - if column_spec.raw_column in puf.columns: - result[column_spec.canonical_name] = puf[column_spec.raw_column].fillna(0) - else: - result[column_spec.canonical_name] = 0 - - # Fix weight (PUF stores in hundredths) - if "weight" in result.columns: - result["weight"] = result["weight"] / 100 - - # Preserve rental losses as negative values so downstream PE targets can - # recover rent-and-royalty loss cells. 
- result["rental_income"] = result.get("rental_income_positive", 0).fillna( - 0 - ) + -result.get("rental_income_negative", 0).fillna(0) - if {"E00600", "E00650"}.issubset(set(puf.columns)): - result["non_qualified_dividend_income"] = puf["E00600"].fillna(0) - puf[ - "E00650" - ].fillna(0) - if {"E26190", "E26180", "E25980", "E25960"}.issubset(set(puf.columns)): - s_corp_income = puf["E26190"].fillna(0) - puf["E26180"].fillna(0) - partnership_income = puf["E25980"].fillna(0) - puf["E25960"].fillna(0) - result["partnership_s_corp_income"] = s_corp_income + partnership_income - if { - "E30400", - "E30500", - "E00900", - "E02100", - "E25940", - "E25980", - "E25920", - "E25960", - }.issubset(set(puf.columns)): - se_deduction_factor = 0.9235 - taxable_se = puf["E30400"].fillna(0) + puf["E30500"].fillna(0) - gross_se = taxable_se / se_deduction_factor - schedule_c_f_income = puf["E00900"].fillna(0) + puf["E02100"].fillna(0) - has_partnership = ( - puf["E25940"].fillna(0) - + puf["E25980"].fillna(0) - - puf["E25920"].fillna(0) - - puf["E25960"].fillna(0) - ) != 0 - result["partnership_se_income"] = np.where( - has_partnership, - gross_se - schedule_c_f_income, - 0.0, - ) - if "T27800" in puf.columns: - result["farm_income"] = puf["T27800"].fillna(0) - if {"E26390", "E26400"}.issubset(set(puf.columns)): - result["estate_income"] = puf["E26390"].fillna(0) - puf["E26400"].fillna(0) - if {"E01500", "E01700"}.issubset(set(puf.columns)): - result["tax_exempt_pension_income"] = puf["E01500"].fillna(0) - puf[ - "E01700" - ].fillna(0) - medical_expense_floor = result.get("medical_expense_agi_floor") - if medical_expense_floor is not None: - for variable, fraction in MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS.items(): - result[variable] = medical_expense_floor.fillna(0) * fraction - result = _add_puf_qbi_simulation_columns( - result, - raw_puf=puf, - random_seed=random_seed, - ) - - # Map filing status code to string - filing_status_map = { - 1: "SINGLE", - 2: "JOINT", - 3: "SEPARATE", - 
4: "HEAD_OF_HOUSEHOLD", - 5: "SURVIVING_SPOUSE", - } - result["filing_status"] = ( - result["filing_status_code"].map(filing_status_map).fillna("UNKNOWN") - ) - filing_status_code = ( - pd.to_numeric(result["filing_status_code"], errors="coerce") - .fillna(0) - .astype(int) - ) - result["is_surviving_spouse"] = filing_status_code.eq(5) - - # Add age from demographics if available - if "age" in puf.columns: - result["age"] = puf["age"] - elif "AGE_HEAD" in puf.columns: - result["age"] = puf["AGE_HEAD"] - else: - # Impute age based on income patterns - result["age"] = _impute_age(result, random_seed=random_seed) - - # Add sex from demographics if available - if "is_male" in puf.columns: - result["is_male"] = puf["is_male"] - elif "GENDER" in puf.columns: - result["is_male"] = (puf["GENDER"] == 1).astype(float) - else: - # Unknown - will be learned from CPS - result["is_male"] = np.nan - - if "RECID" in puf.columns: - result["_puf_recid"] = pd.to_numeric(puf["RECID"], errors="coerce") - if "AGERANGE" in puf.columns: - result["_puf_agerange"] = pd.to_numeric(puf["AGERANGE"], errors="coerce") - if "EARNSPLIT" in puf.columns: - result["_puf_earnsplit"] = pd.to_numeric(puf["EARNSPLIT"], errors="coerce") - if "GENDER" in puf.columns: - result["_puf_gender"] = pd.to_numeric(puf["GENDER"], errors="coerce") - for dependent_idx in range(1, 4): - raw_column = f"AGEDP{dependent_idx}" - if raw_column in puf.columns: - result[f"_puf_agedp{dependent_idx}"] = pd.to_numeric( - puf[raw_column], - errors="coerce", - ) - - if impute_pre_tax_contributions: - model = pre_tax_contribution_model - if model is None: - try: - model = _default_pe_style_puf_pre_tax_contribution_model( - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - pre_tax_training_year=pre_tax_training_year, - ) - except (FileNotFoundError, ImportError, ValueError): - if require_pre_tax_contribution_model: - raise - model = None - if model is not None: - 
predictor_frame = result.loc[:, model.predictors].copy() - predictor_frame = predictor_frame.apply( - lambda column: pd.to_numeric(column, errors="coerce").fillna(0.0) - ) - try: - predictions = model.fitted_model.predict(X_test=predictor_frame) - except ( - FileNotFoundError, - ImportError, - ValueError, - ): - if require_pre_tax_contribution_model: - raise - model = None - else: - result[model.imputed_variable] = pd.to_numeric( - predictions[model.imputed_variable], - errors="coerce", - ).fillna(0.0) - elif require_pre_tax_contribution_model: - raise RuntimeError( - "pre_tax_contributions imputation was requested but no PE-style " - "pre-tax contribution model was available" - ) - - # Mark survey source - result["_survey"] = "puf" - - return result - - -def _impute_age( - df: pd.DataFrame, - *, - random_seed: int = 42, -) -> pd.Series: - """Simple age imputation based on income patterns. - - This is a rough heuristic. The masked MAF will learn - better age distributions from CPS. - """ - # Base age on Social Security receipt and pension income - age = pd.Series(40, index=df.index) # Default - - # Social Security recipients tend to be older - has_ss = df.get("gross_social_security", 0) > 0 - age = age.where(~has_ss, 68) - - # Pension recipients also older - has_pension = df.get("taxable_pension_income", 0) > 0 - age = age.where(~has_pension | has_ss, 62) - - # IRA distributions suggest retirement age - has_ira = df.get("ira_distributions", 0) > 0 - age = age.where(~has_ira | has_ss | has_pension, 60) - - # High earners tend to be prime working age - high_wage = df.get("employment_income", 0) > 200_000 - age = age.where(~high_wage, 45) - - # Add some noise - rng = np.random.default_rng(random_seed) - noise = rng.normal(0, 5, len(age)) - age = (age + noise).clip(18, 95).astype(int) - - return age - - -def _decode_puf_filer_age( - age_range: int | float | None, - *, - fallback: float = 40.0, - rng: np.random.Generator | None = None, -) -> int: - resolved_fallback = 
40.0 if fallback is None or pd.isna(fallback) else fallback - if age_range is None or pd.isna(age_range): - return int(resolved_fallback) - age_code = int(age_range) - if age_code == 0: - return int(resolved_fallback) - age_decode = { - 1: 18, - 2: 26, - 3: 35, - 4: 45, - 5: 55, - 6: 65, - 7: 80, - } - lower = age_decode.get(age_code) - upper = age_decode.get(age_code + 1) - if lower is None: - return int(resolved_fallback) - if upper is None or upper <= lower: - if age_code == 7: - if rng is not None: - return int(rng.integers(low=lower, high=90, endpoint=False)) - return int(lower + (90 - lower) / 2) - return int(lower) - return int(lower + (upper - lower) / 2) - - -def _decode_puf_dependent_age( - age_range: int | float | None, - *, - rng: np.random.Generator | None = None, -) -> int: - if age_range is None or pd.isna(age_range): - return 0 - age_code = int(age_range) - if age_code == 0: - return 0 - age_decode = { - 0: 0, - 1: 0, - 2: 5, - 3: 13, - 4: 17, - 5: 19, - 6: 25, - 7: 30, - } - lower = age_decode.get(age_code, 0) - upper = age_decode.get(age_code + 1, lower) - if upper <= lower: - return int(lower) - if rng is not None: - return int(rng.integers(low=lower, high=upper, endpoint=False)) - return int(lower + (upper - lower) / 2) - - -def _puf_joint_head_share( - row: pd.Series, - *, - default: float = 0.6, - rng: np.random.Generator | None = None, -) -> float: - earnsplit = row.get("_puf_earnsplit") - if earnsplit is None or pd.isna(earnsplit): - return default - split_code = int(earnsplit) - if split_code <= 0: - return 1.0 - split_decodes = { - 1: 0.0, - 2: 0.25, - 3: 0.75, - 4: 1.0, - 5: 1.0, - } - lower = split_decodes.get(split_code) - upper = split_decodes.get(split_code + 1) - if lower is None or upper is None: - return default - if rng is not None: - frac = (upper - lower) * rng.random() + lower - return float(1.0 - frac) - return float(1.0 - ((lower + upper) / 2.0)) - - -def _puf_spouse_is_male( - gender_code: int | float | None, - *, - rng: 
np.random.Generator | None = None, -) -> float: - if gender_code is None or pd.isna(gender_code): - return 0.0 - resolved_gender = int(gender_code) - if rng is None: - return float(resolved_gender != 1) - is_opposite_gender = bool(rng.random() < 0.96) - opposite_gender_code = 0.0 if resolved_gender == 1 else 1.0 - same_gender_code = 1.0 - opposite_gender_code - return opposite_gender_code if is_opposite_gender else same_gender_code - - -def _puf_dependent_is_male(*, rng: np.random.Generator | None = None) -> float: - if rng is None: - return 0.0 - return float(rng.choice([0, 1])) - - -def _is_puf_numeric_split_column(df: pd.DataFrame, column: str) -> bool: - if column in PUF_PERSON_EXPANSION_PRESERVE_COLUMNS: - return False - if column.startswith("_"): - return False - return pd.api.types.is_numeric_dtype(df[column]) - - -def uprate_puf( - df: pd.DataFrame, from_year: int = 2015, to_year: int = 2024 -) -> pd.DataFrame: - """Uprate PUF income variables from one year to another. - - Uses SOI-based growth factors. 
- """ - if from_year == to_year: - return df - - # Simple scaling - in production, use year-specific factors - year_factor = (to_year - from_year) / (2024 - 2015) - - result = df.copy() - - for var, factor in UPRATING_FACTORS.items(): - if var in result.columns: - # Interpolate factor based on years - scaled_factor = 1 + (factor - 1) * year_factor - result[var] = result[var] * scaled_factor - - print(f"Uprated PUF from {from_year} to {to_year}") - - return result - - -def _numeric_series(df: pd.DataFrame, column: str) -> pd.Series: - if column not in df.columns: - return pd.Series(0.0, index=df.index, dtype=float) - return df[column].fillna(0).astype(float) - - -def _bernoulli_lognormal_sample( - n: int, - *, - probability: float, - log_mean: float, - log_sigma: float, - rng: np.random.Generator, -) -> np.ndarray: - positive = rng.binomial(1, probability, size=n).astype(bool) - return np.where( - positive, - rng.lognormal(mean=log_mean, sigma=log_sigma, size=n), - 0.0, - ) - - -def _conditional_lognormal_sample( - flag: np.ndarray, - target_mean: pd.Series | np.ndarray, - *, - log_sigma: float, - rng: np.random.Generator, -) -> np.ndarray: - flag_array = np.asarray(flag, dtype=bool) - target = np.maximum(np.asarray(target_mean, dtype=float), 0.0) - positive_target = target > 0.0 - mu = np.zeros_like(target, dtype=float) - mu[positive_target] = np.log(target[positive_target]) - (log_sigma**2 / 2.0) - draws = rng.lognormal(mean=mu, sigma=log_sigma, size=len(target)) - return np.where(flag_array & positive_target, draws, 0.0) - - -def _simulate_qbi_w2_wages_and_ubia( - puf: pd.DataFrame, - *, - seed: int = QBI_SIMULATION_RANDOM_SEED, -) -> tuple[np.ndarray, np.ndarray]: - """Simulate PE-US-data-style Section 199A W-2 wages and UBIA support.""" - rng = np.random.default_rng(seed) - qbi = sum( - _numeric_series(puf, income_type) * probability - for income_type, probability in QBI_QUALIFICATION_PROBABILITIES.items() - ).to_numpy(dtype=float) - - margins = ( - 
rng.beta(QBI_PROFIT_MARGIN_BETA_A, QBI_PROFIT_MARGIN_BETA_B, qbi.size) - * QBI_PROFIT_MARGIN_SCALE - + QBI_PROFIT_MARGIN_SHIFT - ) - revenues = np.maximum(qbi, 0.0) / margins - logit = ( - QBI_HAS_EMPLOYEES_LOGIT_INTERCEPT - + QBI_HAS_EMPLOYEES_LOGIT_SLOPE_PER_DOLLAR * revenues - ) - employee_probability = np.where( - revenues == 0.0, - 0.0, - 1.0 / (1.0 + np.exp(-logit)), - ) - has_employees = rng.binomial(1, employee_probability) - - rental_income = _numeric_series(puf, "rental_income") - is_rental = rental_income.to_numpy(dtype=float) > 0.0 - labor_ratios = np.where( - is_rental, - rng.beta( - QBI_RENTAL_LABOR_RATIO_BETA_A, - QBI_RENTAL_LABOR_RATIO_BETA_B, - qbi.size, - ) - * QBI_RENTAL_LABOR_RATIO_SCALE, - rng.beta( - QBI_NON_RENTAL_LABOR_RATIO_BETA_A, - QBI_NON_RENTAL_LABOR_RATIO_BETA_B, - qbi.size, - ) - * QBI_NON_RENTAL_LABOR_RATIO_SCALE, - ) - w2_wages = revenues * labor_ratios * has_employees - - depreciation_proxy = _conditional_lognormal_sample( - is_rental, - rental_income, - log_sigma=QBI_DEPRECIATION_PROXY_SIGMA, - rng=rng, - ) - is_capital_intensive = is_rental | (depreciation_proxy > 0.0) - ubia = _conditional_lognormal_sample( - is_capital_intensive, - QBI_UBIA_MULTIPLE_OF_QBI * np.maximum(qbi, 0.0), - log_sigma=QBI_UBIA_SIGMA, - rng=rng, - ) - return w2_wages, ubia - - -def _add_puf_qbi_simulation_columns( - mapped_puf: pd.DataFrame, - *, - raw_puf: pd.DataFrame, - random_seed: int, -) -> pd.DataFrame: - """Populate PE-US-data-style Section 199A support columns from PUF.""" - result = mapped_puf.copy() - rng = np.random.default_rng(QBI_PUF_RANDOM_SEED + int(random_seed)) - for variable, probability in QBI_QUALIFICATION_PROBABILITIES.items(): - result[f"{variable}_would_be_qualified"] = rng.random(len(result)) < probability - - w2_wages, ubia = _simulate_qbi_w2_wages_and_ubia( - result, - seed=QBI_SIMULATION_RANDOM_SEED, - ) - result["w2_wages_from_qualified_business"] = w2_wages - result["unadjusted_basis_qualified_property"] = ubia - - 
raw_sstb_sources = pd.DataFrame( - { - column: pd.to_numeric(raw_puf[column], errors="coerce").fillna(0.0) - if column in raw_puf.columns - else pd.Series(0.0, index=result.index, dtype=float) - for column in QBI_SSTB_PROBABILITY_BY_RAW_COLUMN - }, - index=result.index, - ) - largest_qbi_source = raw_sstb_sources.idxmax(axis=1) - has_any_qbi_source = raw_sstb_sources.abs().sum(axis=1).gt(0.0) - probability_sstb = ( - largest_qbi_source.map(QBI_SSTB_PROBABILITY_BY_RAW_COLUMN) - .fillna(0.0) - .where(has_any_qbi_source, 0.0) - ) - is_sstb = rng.binomial(n=1, p=probability_sstb).astype(bool) - result["business_is_sstb"] = is_sstb - - legacy_self_employment_income = _numeric_series(result, "self_employment_income") - result["sstb_self_employment_income_before_lsr"] = np.where( - is_sstb, - legacy_self_employment_income, - 0.0, - ) - result["sstb_self_employment_income"] = result[ - "sstb_self_employment_income_before_lsr" - ] - result["self_employment_income"] = np.where( - is_sstb, - 0.0, - legacy_self_employment_income, - ) - result["sstb_self_employment_income_would_be_qualified"] = np.where( - is_sstb, - result["self_employment_income_would_be_qualified"].astype(bool), - False, - ) - result["sstb_w2_wages_from_qualified_business"] = np.where( - is_sstb, - w2_wages, - 0.0, - ) - result["sstb_unadjusted_basis_qualified_property"] = np.where( - is_sstb, - ubia, - 0.0, - ) - - result["qualified_reit_and_ptp_income"] = _bernoulli_lognormal_sample( - len(result), - probability=QBI_REIT_PTP_PROBABILITY, - log_mean=QBI_REIT_PTP_LOG_NORMAL_MU, - log_sigma=QBI_REIT_PTP_LOG_NORMAL_SIGMA, - rng=rng, - ) - result["qualified_bdc_income"] = _bernoulli_lognormal_sample( - len(result), - probability=QBI_BDC_PROBABILITY, - log_mean=QBI_BDC_LOG_NORMAL_MU, - log_sigma=QBI_BDC_LOG_NORMAL_SIGMA, - rng=rng, - ) - return result - - -def _default_cps_reference_year(target_year: int) -> int: - return min(max(target_year - 1, 2021), 2023) - - -def _social_security_age_bucket(ages: pd.Series) 
-> pd.Series: - return pd.cut( - pd.to_numeric(ages, errors="coerce"), - bins=SOCIAL_SECURITY_SHARE_AGE_BINS, - labels=SOCIAL_SECURITY_SHARE_AGE_LABELS, - right=False, - include_lowest=True, - ) - - -def _normalize_social_security_split_strategy(strategy: str | None) -> str: - resolved = ( - (strategy or SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE).strip().lower() - ) - allowed = { - SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF, - SOCIAL_SECURITY_SPLIT_STRATEGY_AGE_HEURISTIC, - } - if resolved not in allowed: - raise ValueError( - "social_security_split_strategy must be one of " - f"{sorted(allowed)}; got {strategy!r}" - ) - return resolved - - -def _build_pe_style_social_security_predictor_frame( - frame: pd.DataFrame, -) -> pd.DataFrame: - result = pd.DataFrame(index=frame.index) - if "age" in frame.columns: - result["age"] = pd.to_numeric(frame["age"], errors="coerce").astype(float) - if "is_male" in frame.columns: - result["is_male"] = pd.to_numeric(frame["is_male"], errors="coerce").astype( - float - ) - elif "sex" in frame.columns: - sex = pd.to_numeric(frame["sex"], errors="coerce") - result["is_male"] = pd.Series( - np.where(sex == 1, 1.0, np.where(sex == 2, 0.0, np.nan)), - index=frame.index, - dtype=float, - ) - if "tax_unit_is_joint" in frame.columns: - result["tax_unit_is_joint"] = pd.to_numeric( - frame["tax_unit_is_joint"], errors="coerce" - ).astype(float) - elif "filing_status" in frame.columns: - filing_status = frame["filing_status"].astype(str) - result["tax_unit_is_joint"] = (filing_status == "JOINT").astype(float) - if "is_tax_unit_head" in frame.columns: - result["is_tax_unit_head"] = pd.to_numeric( - frame["is_tax_unit_head"], errors="coerce" - ).astype(float) - elif "is_head" in frame.columns: - result["is_tax_unit_head"] = pd.to_numeric( - frame["is_head"], errors="coerce" - ).astype(float) - if "is_tax_unit_dependent" in frame.columns: - result["is_tax_unit_dependent"] = pd.to_numeric( - 
frame["is_tax_unit_dependent"], errors="coerce" - ).astype(float) - elif "is_dependent" in frame.columns: - result["is_tax_unit_dependent"] = pd.to_numeric( - frame["is_dependent"], errors="coerce" - ).astype(float) - return result - - -def _fit_puf_social_security_share_model_from_reference( - reference_persons: pd.DataFrame, -) -> GroupedShareModel: - work = reference_persons.copy() - if "weight" not in work.columns: - work["weight"] = 1.0 - work["age_bucket"] = _social_security_age_bucket( - work.get("age", pd.Series(np.nan, index=work.index)) - ) - return fit_grouped_share_model( - work, - explicit_component_columns=SOCIAL_SECURITY_SHARE_EXPLICIT_COMPONENTS, - implicit_component_column=SOCIAL_SECURITY_SHARE_IMPLICIT_COMPONENT, - feature_sets=(("age_bucket",),), - weight_column="weight", - ) - - -def _default_puf_social_security_share_model( - *, - cps_reference_year: int, - cache_dir: Path | None, -) -> GroupedShareModel: - cps_dataset = load_cps_asec( - year=cps_reference_year, - cache_dir=cache_dir, - download=True, - ) - return _fit_puf_social_security_share_model_from_reference( - cps_dataset.persons.to_pandas() - ) - - -def _fit_pe_style_puf_social_security_qrf_model_from_reference( - reference_persons: pd.DataFrame, - *, - min_training_records: int = MIN_PE_STYLE_SOCIAL_SECURITY_QRF_TRAINING_RECORDS, -) -> PEStyleQRFShareModel: - from microimpute.models.qrf import QRF - - total_social_security = _numeric_series(reference_persons, "social_security") - has_social_security = total_social_security > 0.0 - if int(has_social_security.sum()) < min_training_records: - raise ValueError( - "PE-style QRF Social Security split requires at least " - f"{min_training_records} positive training rows" - ) - - predictor_frame = _build_pe_style_social_security_predictor_frame( - reference_persons.loc[has_social_security] - ) - available_predictors = tuple( - predictor - for predictor in PE_STYLE_SOCIAL_SECURITY_QRF_PREDICTORS - if predictor in predictor_frame.columns - and 
predictor_frame[predictor].notna().any() - ) - if not available_predictors: - raise ValueError( - "PE-style QRF Social Security split requires at least one predictor" - ) - - train = predictor_frame.loc[:, available_predictors].copy() - total = total_social_security.loc[has_social_security].to_numpy(dtype=float) - share_prediction_columns: list[str] = [] - for component in SOCIAL_SECURITY_SHARE_COMPONENTS: - share_column = f"{component}_share" - share_prediction_columns.append(share_column) - component_values = _numeric_series( - reference_persons.loc[has_social_security], - component, - ).to_numpy(dtype=float) - train[share_column] = np.where(total > 0.0, component_values / total, 0.0) - - qrf = QRF(log_level="WARNING", memory_efficient=True) - fitted_model = qrf.fit( - X_train=train.loc[:, [*available_predictors, *share_prediction_columns]], - predictors=list(available_predictors), - imputed_variables=share_prediction_columns, - n_jobs=1, - ) - return PEStyleQRFShareModel( - predictors=available_predictors, - component_columns=SOCIAL_SECURITY_SHARE_COMPONENTS, - share_prediction_columns=tuple(share_prediction_columns), - fitted_model=fitted_model, - ) - - -def _default_pe_style_puf_social_security_share_model( - *, - cps_reference_year: int, - cache_dir: Path | None, -) -> PEStyleQRFShareModel: - cps_dataset = load_cps_asec( - year=cps_reference_year, - cache_dir=cache_dir, - download=True, - ) - return _fit_pe_style_puf_social_security_qrf_model_from_reference( - cps_dataset.persons.to_pandas() - ) - - -def _load_pe_extended_cps_pre_tax_training_frame( - *, - policyengine_us_data_repo: str | Path, - training_year: int, -) -> pd.DataFrame: - import h5py - - repo_root = Path(policyengine_us_data_repo).expanduser().resolve() - storage_dir = repo_root / "policyengine_us_data" / "storage" - candidate_paths = ( - storage_dir / f"extended_cps_{int(training_year)}.h5", - storage_dir / "extended_cps_2024.h5", - ) - dataset_path = next((path for path in candidate_paths if 
path.exists()), None) - if dataset_path is None: - raise FileNotFoundError( - "Could not locate an extended CPS training artifact for PE-style " - f"pre-tax contributions under {storage_dir}" - ) - - with h5py.File(dataset_path, "r") as h5: - train = pd.DataFrame( - { - "employment_income": np.asarray( - h5["employment_income"][str(int(training_year))], dtype=float - ) - if str(int(training_year)) in h5["employment_income"] - else np.asarray( - h5["employment_income"][sorted(h5["employment_income"].keys())[-1]], - dtype=float, - ), - "age": np.asarray(h5["age"][str(int(training_year))], dtype=float) - if str(int(training_year)) in h5["age"] - else np.asarray( - h5["age"][sorted(h5["age"].keys())[-1]], - dtype=float, - ), - "is_male": 1.0 - - np.asarray(h5["is_female"][str(int(training_year))], dtype=float) - if str(int(training_year)) in h5["is_female"] - else 1.0 - - np.asarray( - h5["is_female"][sorted(h5["is_female"].keys())[-1]], - dtype=float, - ), - "pre_tax_contributions": np.asarray( - h5["pre_tax_contributions"][str(int(training_year))], dtype=float - ) - if str(int(training_year)) in h5["pre_tax_contributions"] - else np.asarray( - h5["pre_tax_contributions"][ - sorted(h5["pre_tax_contributions"].keys())[-1] - ], - dtype=float, - ), - } - ) - if len(train) > 10_000: - train = train.sample(n=10_000, random_state=0) - return train - - -def _load_microplex_cps_pre_tax_training_frame( - *, - training_year: int, -) -> pd.DataFrame: - cps = load_cps_asec(year=int(training_year)) - persons = cps.persons.to_pandas() - index = persons.index - employment_income = pd.to_numeric( - persons.get("employment_income", persons.get("wage_income", 0.0)), - errors="coerce", - ).fillna(0.0) - age = pd.to_numeric(persons.get("age", 0.0), errors="coerce").fillna(0.0) - if "is_male" in persons.columns: - is_male = pd.to_numeric(persons["is_male"], errors="coerce").fillna(0.0) - elif "sex" in persons.columns: - is_male = ( - pd.to_numeric(persons["sex"], errors="coerce") - 
.fillna(0) - .astype(int) - .eq(1) - .astype(float) - ) - else: - is_male = pd.Series(0.0, index=index) - - if "pre_tax_contributions" in persons.columns: - pre_tax_contributions = pd.to_numeric( - persons["pre_tax_contributions"], - errors="coerce", - ).fillna(0.0) - else: - pre_tax_contributions = pd.Series(0.0, index=index) - for column in ( - "traditional_401k_contributions", - "traditional_401k_contributions_desired", - "traditional_403b_contributions", - "traditional_403b_contributions_desired", - "pre_tax_health_insurance_premiums", - "health_savings_account_payroll_contributions", - ): - if column in persons.columns: - pre_tax_contributions = pre_tax_contributions.add( - pd.to_numeric(persons[column], errors="coerce").fillna(0.0), - fill_value=0.0, - ) - - train = pd.DataFrame( - { - "employment_income": employment_income, - "age": age, - "is_male": is_male, - "pre_tax_contributions": pre_tax_contributions, - }, - index=index, - ) - train = train.apply( - lambda column: pd.to_numeric(column, errors="coerce").fillna(0.0) - ) - if len(train) > 10_000: - train = train.sample(n=10_000, random_state=0) - return train - - -@lru_cache(maxsize=4) -def _default_pe_style_puf_pre_tax_contribution_model( - *, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - pre_tax_training_year: int = 2024, -) -> PEStyleQRFImputationModel: - predictors = ("employment_income", "age", "is_male") - if policyengine_us_data_repo is not None: - try: - train = _load_pe_extended_cps_pre_tax_training_frame( - policyengine_us_data_repo=policyengine_us_data_repo, - training_year=pre_tax_training_year, - ) - except (FileNotFoundError, KeyError, OSError, ValueError): - train = _load_microplex_cps_pre_tax_training_frame( - training_year=pre_tax_training_year, - ) - - from microimpute.models.qrf import QRF - - train = train.apply( - lambda column: pd.to_numeric(column, errors="coerce").fillna(0.0) - ) - qrf = QRF(log_level="WARNING", 
memory_efficient=True) - fitted_model = qrf.fit( - X_train=train, - predictors=list(predictors), - imputed_variables=["pre_tax_contributions"], - n_jobs=1, - ) - return PEStyleQRFImputationModel( - predictors=predictors, - imputed_variable="pre_tax_contributions", - fitted_model=fitted_model, - ) - - from microimpute.models.qrf import QRF - - train = _load_microplex_cps_pre_tax_training_frame( - training_year=pre_tax_training_year - ) - - qrf = QRF(log_level="WARNING", memory_efficient=True) - fitted_model = qrf.fit( - X_train=train, - predictors=list(predictors), - imputed_variables=["pre_tax_contributions"], - n_jobs=1, - ) - return PEStyleQRFImputationModel( - predictors=predictors, - imputed_variable="pre_tax_contributions", - fitted_model=fitted_model, - ) - - -def _strategy_social_security_share_model_loader( - strategy: str, -) -> Callable[[int, Path | None], SocialSecurityShareModel]: - if strategy == SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF: - return lambda year, cache_dir: ( - _default_pe_style_puf_social_security_share_model( - cps_reference_year=year, - cache_dir=cache_dir, - ) - ) - if strategy == SOCIAL_SECURITY_SPLIT_STRATEGY_AGE_HEURISTIC: - return lambda year, cache_dir: _age_heuristic_puf_social_security_share_model() - return lambda year, cache_dir: _default_puf_social_security_share_model( - cps_reference_year=year, - cache_dir=cache_dir, - ) - - -def _age_heuristic_puf_social_security_share_model() -> GroupedShareModel: - reference = pd.DataFrame( - { - "age_bucket": list(SOCIAL_SECURITY_SHARE_AGE_LABELS), - "weight": [1.0] * len(SOCIAL_SECURITY_SHARE_AGE_LABELS), - "social_security_retirement": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0], - "social_security_disability": [0.0, 1.0, 1.0, 1.0, 0.0, 0.0], - "social_security_survivors": [0.0] * len(SOCIAL_SECURITY_SHARE_AGE_LABELS), - "social_security_dependents": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0], - } - ) - return fit_grouped_share_model( - reference, - 
explicit_component_columns=SOCIAL_SECURITY_SHARE_EXPLICIT_COMPONENTS, - implicit_component_column=SOCIAL_SECURITY_SHARE_IMPLICIT_COMPONENT, - feature_sets=(("age_bucket",),), - weight_column="weight", - ) - - -def _predict_puf_social_security_component_shares( - persons: pd.DataFrame, - *, - share_model: SocialSecurityShareModel, -) -> pd.DataFrame: - if isinstance(share_model, GroupedShareModel): - features = persons.loc[:, []].copy() - features["age_bucket"] = _social_security_age_bucket( - persons.get("age", pd.Series(np.nan, index=persons.index)) - ) - return predict_grouped_component_shares(features, share_model) - - predictors = _build_pe_style_social_security_predictor_frame(persons) - X_test = pd.DataFrame(index=persons.index) - for predictor in share_model.predictors: - if predictor in predictors.columns: - X_test[predictor] = ( - pd.to_numeric(predictors[predictor], errors="coerce") - .fillna(0.0) - .astype(float) - ) - else: - X_test[predictor] = 0.0 - predictions = share_model.fitted_model.predict(X_test=X_test) - - shares = pd.DataFrame(index=persons.index) - total = np.zeros(len(persons), dtype=float) - for component, share_column in zip( - share_model.component_columns, - share_model.share_prediction_columns, - strict=True, - ): - source_column = ( - share_column if share_column in predictions.columns else component - ) - if source_column in predictions.columns: - values = np.clip( - pd.to_numeric(predictions[source_column], errors="coerce") - .fillna(0.0) - .to_numpy(dtype=float), - 0.0, - 1.0, - ) - else: - values = np.zeros(len(persons), dtype=float) - shares[component] = values - total += values - - positive_total = total > 0.0 - for component in share_model.component_columns: - shares[component] = np.where( - positive_total, - shares[component].to_numpy(dtype=float) / total, - 0.0, - ) - return shares - - -def _impute_puf_social_security_components( - persons: pd.DataFrame, - *, - share_model: SocialSecurityShareModel, -) -> pd.DataFrame: - 
result = persons.copy() - total_social_security = _numeric_series(result, "social_security") - if float(total_social_security.sum()) <= 0.0: - for component in SOCIAL_SECURITY_SHARE_COMPONENTS: - result[component] = 0.0 - return result - - shares = _predict_puf_social_security_component_shares( - result, - share_model=share_model, - ) - for component in SOCIAL_SECURITY_SHARE_COMPONENTS: - result[component] = total_social_security * shares[component] - return result - - -def _add_derived_income_columns(df: pd.DataFrame) -> pd.DataFrame: - result = df.copy() - result = normalize_dividend_columns(result) - employment_income = _numeric_series(result, "employment_income") - self_employment_income = _numeric_series(result, "self_employment_income") - sstb_self_employment_income = _numeric_series( - result, - "sstb_self_employment_income", - ) - taxable_interest_income = _numeric_series(result, "taxable_interest_income") - ordinary_dividend_income = _numeric_series(result, "ordinary_dividend_income") - short_term_capital_gains = _numeric_series(result, "short_term_capital_gains") - long_term_capital_gains = _numeric_series(result, "long_term_capital_gains") - taxable_pension_income = _numeric_series(result, "taxable_pension_income") - gross_social_security = _numeric_series(result, "gross_social_security") - if "age" in result.columns: - ages = pd.to_numeric(result["age"], errors="coerce").fillna(0.0).astype(float) - else: - ages = pd.Series(0.0, index=result.index, dtype=float) - rental_income = _numeric_series(result, "rental_income") - unemployment_compensation = _numeric_series( - result, - "unemployment_compensation", - ) - alimony_income = _numeric_series(result, "alimony_income") - - result["interest_income"] = taxable_interest_income - result["dividend_income"] = ordinary_dividend_income - result["capital_gains"] = short_term_capital_gains + long_term_capital_gains - result["pension_income"] = taxable_pension_income - result["social_security"] = 
gross_social_security - result["social_security_retirement"] = gross_social_security.where( - ages >= MINIMUM_SOCIAL_SECURITY_RETIREMENT_AGE, 0.0 - ).astype(float) - result["income"] = ( - employment_income - + self_employment_income - + sstb_self_employment_income - + result["interest_income"] - + result["dividend_income"] - + rental_income - + result["social_security"] - + result["pension_income"] - + unemployment_compensation - + alimony_income - ) - result["employment_status"] = ( - (employment_income + self_employment_income) > 0 - ).astype(int) - return result - - -def _allocate_joint_tax_unit_amounts( - row: pd.Series, - head: pd.Series, - spouse: pd.Series, -) -> tuple[pd.Series, pd.Series]: - for variable, head_share in JOINT_HEAD_SHARE_ALLOCATION.items(): - if variable not in row.index: - continue - amount = float(row[variable]) - head[variable] = amount * head_share - spouse[variable] = amount * (1.0 - head_share) - - for variable in JOINT_EQUAL_SHARE_ALLOCATION: - if variable not in row.index: - continue - amount = float(row[variable]) - head[variable] = amount * 0.5 - spouse[variable] = amount * 0.5 - - return head, spouse - - -def expand_to_persons(df: pd.DataFrame) -> pd.DataFrame: - """Expand tax unit records to person-level records. - - Each tax unit becomes 1-2 persons (filer + spouse if joint). - This enables stacking with CPS person-level data. 
- """ - records = [] - split_columns = [ - column for column in df.columns if _is_puf_numeric_split_column(df, column) - ] - pe_rng = np.random.default_rng(PE_PUF_PERSON_EXPANSION_RANDOM_SEED) - pe_age_rng = np.random.default_rng(PE_PUF_PERSON_EXPANSION_RANDOM_SEED + 1) - - for idx, row in df.iterrows(): - filing_status = row.get("filing_status", "SINGLE") - exemptions = int( - pd.to_numeric(row.get("exemptions_count", 1), errors="coerce") or 1 - ) - has_pe_demographics = "_puf_agerange" in row.index and not pd.isna( - row.get("_puf_agerange") - ) - tax_unit_id = row.get("_puf_recid") - if tax_unit_id is None or pd.isna(tax_unit_id): - tax_unit_id = idx - pe_tax_unit_id = str(int(tax_unit_id)) if pd.notna(tax_unit_id) else str(idx) - - # Create head record - head = row.copy() - head["is_head"] = 1 - head["is_spouse"] = 0 - head["is_dependent"] = 0 - head["person_id"] = ( - f"{pe_tax_unit_id}:1" if has_pe_demographics else f"{idx}_head" - ) - head["tax_unit_id"] = pe_tax_unit_id if has_pe_demographics else str(idx) - if has_pe_demographics: - head["age"] = _decode_puf_filer_age( - row.get("_puf_agerange"), - fallback=row.get("age", 40.0), - rng=pe_age_rng, - ) - if pd.notna(row.get("_puf_gender")): - head["is_male"] = float(int(row.get("_puf_gender")) == 1) - records.append(head) - - # Create spouse record if joint filing - if filing_status == "JOINT": - spouse = row.copy() - spouse["is_head"] = 0 - spouse["is_spouse"] = 1 - spouse["is_dependent"] = 0 - spouse["person_id"] = ( - f"{pe_tax_unit_id}:2" if has_pe_demographics else f"{idx}_spouse" - ) - spouse["tax_unit_id"] = pe_tax_unit_id if has_pe_demographics else str(idx) - spouse["is_surviving_spouse"] = False - - if has_pe_demographics: - spouse["age"] = _decode_puf_filer_age( - row.get("_puf_agerange"), - fallback=row.get("age", 40.0), - rng=pe_age_rng, - ) - if pd.notna(row.get("_puf_gender")): - spouse["is_male"] = _puf_spouse_is_male( - row.get("_puf_gender"), - ) - head_share = _puf_joint_head_share(row, 
rng=pe_rng) - for column in split_columns: - amount = float( - pd.to_numeric(row.get(column), errors="coerce") or 0.0 - ) - head[column] = amount * head_share - spouse[column] = amount * (1.0 - head_share) - else: - head, spouse = _allocate_joint_tax_unit_amounts(row, head, spouse) - # Spouse weight is same as head (we'll deduplicate in calibration) - records.append(spouse) - exemptions -= 1 - - exemptions -= 1 - if has_pe_demographics: - for dependent_idx in range(min(3, max(exemptions, 0))): - dependent = row.copy() - dependent["is_head"] = 0 - dependent["is_spouse"] = 0 - dependent["is_dependent"] = 1 - dependent["person_id"] = f"{pe_tax_unit_id}:{dependent_idx + 3}" - dependent["tax_unit_id"] = pe_tax_unit_id - dependent["age"] = _decode_puf_dependent_age( - row.get(f"_puf_agedp{dependent_idx + 1}"), - ) - dependent["is_male"] = _puf_dependent_is_male() - dependent["is_surviving_spouse"] = False - for column in split_columns: - dependent[column] = 0.0 - records.append(dependent) - - result = pd.DataFrame(records).reset_index(drop=True) - helper_columns = [ - column for column in result.columns if column in PUF_DEMOGRAPHIC_HELPER_COLUMNS - ] - if helper_columns: - result = result.drop(columns=helper_columns) - result = _add_derived_income_columns(result) - print(f"Expanded {len(df):,} tax units to {len(result):,} persons") - - return result - - -def load_puf( - target_year: int = 2024, - expand_persons: bool = True, - cache_dir: Path | None = None, - social_security_split_strategy: str = SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - uprating_mode: str = PUF_UPRATING_MODE_INTERPOLATED, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - impute_pre_tax_contributions: bool = False, - pre_tax_training_year: int = 2024, - soi_path: str | Path | None = None, - require_pre_tax_contribution_model: bool = False, -) -> pd.DataFrame: - """Load and process PUF for multi-survey fusion. 
- - Args: - target_year: Year to uprate to - expand_persons: If True, expand tax units to person records - cache_dir: Directory to cache downloaded files - - Returns: - DataFrame with common variable names, ready for stacking with CPS - """ - # Prefer a repo-local raw PUF copy when available to avoid remote auth/cache - # requirements during rebuild runs. - local_repo_paths = _resolve_policyengine_repo_local_puf_paths( - policyengine_us_data_repo - ) - if local_repo_paths is not None: - puf_path, demo_path = local_repo_paths - print(f"Using repo-local PUF from {puf_path}...") - else: - puf_path, demo_path = download_puf(cache_dir) - - # Load raw data - raw = load_puf_raw(puf_path, demo_path) - resolved_uprating_mode = _normalize_puf_uprating_mode(uprating_mode) - if resolved_uprating_mode == PUF_UPRATING_MODE_PE_SOI: - raw_uprating_year = min(int(target_year), PE_PUF_SOI_END_YEAR) - raw = uprate_raw_puf_pe_style( - raw, - from_year=2015, - to_year=raw_uprating_year, - policyengine_us_data_repo=policyengine_us_data_repo, - soi_path=soi_path, - ) - - # Map to common variables - df = map_puf_variables( - raw, - impute_pre_tax_contributions=impute_pre_tax_contributions, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - pre_tax_training_year=pre_tax_training_year, - require_pre_tax_contribution_model=require_pre_tax_contribution_model, - ) - - # Uprate to target year - if resolved_uprating_mode == PUF_UPRATING_MODE_PE_SOI: - if target_year > PE_PUF_SOI_END_YEAR: - df = uprate_mapped_puf_with_pe_factors( - df, - from_year=PE_PUF_SOI_END_YEAR, - to_year=target_year, - policyengine_us_data_repo=policyengine_us_data_repo, - ) - else: - df = uprate_puf(df, from_year=2015, to_year=target_year) - - # Expand to persons if requested - if expand_persons: - df = expand_to_persons(df) - strategy = _normalize_social_security_split_strategy( - social_security_split_strategy - ) - try: - if strategy == 
SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF: - share_model = _default_pe_style_puf_social_security_share_model( - cps_reference_year=_default_cps_reference_year(target_year), - cache_dir=cache_dir, - ) - elif strategy == SOCIAL_SECURITY_SPLIT_STRATEGY_AGE_HEURISTIC: - share_model = _age_heuristic_puf_social_security_share_model() - else: - share_model = _default_puf_social_security_share_model( - cps_reference_year=_default_cps_reference_year(target_year), - cache_dir=cache_dir, - ) - except (FileNotFoundError, ImportError, ValueError): - share_model = _age_heuristic_puf_social_security_share_model() - df = _impute_puf_social_security_components(df, share_model=share_model) - - print(f"\nPUF loaded: {len(df):,} records") - print(f" Weight sum: {df['weight'].sum():,.0f}") - - return df - - -# Variables that PUF has but CPS doesn't (will be NaN in CPS) -PUF_EXCLUSIVE_VARS = [ - "pre_tax_contributions", - "short_term_capital_gains", - "long_term_capital_gains", - "non_sch_d_capital_gains", - "partnership_s_corp_income", - "qualified_dividend_income", - "tax_exempt_interest_income", - "charitable_cash", - "charitable_noncash", - "mortgage_interest_paid", - "state_income_tax_paid", - "real_estate_tax_paid", - "student_loan_interest", - "ira_deduction", -] - -# Variables that both surveys have (may differ in quality) -SHARED_VARS = [ - "employment_income", - "self_employment_income", - "taxable_interest_income", - "ordinary_dividend_income", - "rental_income", - "gross_social_security", - "taxable_pension_income", - "unemployment_compensation", - "age", - "filing_status", -] - - -def _sample_tax_units( - tax_units: pd.DataFrame, - *, - sample_n: int | None, - random_seed: int, -) -> pd.DataFrame: - """Sample tax units before expanding them to persons.""" - if sample_n is None or sample_n >= len(tax_units): - return tax_units.reset_index(drop=True) - sample_weights: pd.Series | None = None - weight_column = next( - ( - candidate - for candidate in ("weight", "S006", 
"household_weight") - if candidate in tax_units.columns - ), - None, - ) - if weight_column is not None: - candidate_weights = ( - pd.to_numeric(tax_units[weight_column], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - if ( - candidate_weights.sum() > 0.0 - and int((candidate_weights > 0.0).sum()) >= sample_n - ): - sample_weights = candidate_weights - try: - return tax_units.sample( - n=sample_n, - random_state=random_seed, - replace=False, - weights=sample_weights, - ).reset_index(drop=True) - except ValueError: - # Match CPS behavior: if weighted sampling without replacement is - # infeasible at high sample sizes, fall back to deterministic uniform - # sampling instead of failing the run. - return tax_units.sample( - n=sample_n, - random_state=random_seed, - replace=False, - weights=None, - ).reset_index(drop=True) - - -def _build_puf_tax_units( - *, - raw: pd.DataFrame, - target_year: int, - random_seed: int = 42, - uprating_mode: str = PUF_UPRATING_MODE_INTERPOLATED, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - impute_pre_tax_contributions: bool = False, - pre_tax_training_year: int = 2024, - soi_path: str | Path | None = None, - require_pre_tax_contribution_model: bool = False, -) -> pd.DataFrame: - """Map raw PUF records into a normalized tax-unit table.""" - resolved_uprating_mode = _normalize_puf_uprating_mode(uprating_mode) - if resolved_uprating_mode == PUF_UPRATING_MODE_PE_SOI: - raw_uprating_year = min(int(target_year), PE_PUF_SOI_END_YEAR) - raw = uprate_raw_puf_pe_style( - raw, - from_year=2015, - to_year=raw_uprating_year, - policyengine_us_data_repo=policyengine_us_data_repo, - soi_path=soi_path, - ) - tax_units = map_puf_variables( - raw, - random_seed=random_seed, - impute_pre_tax_contributions=impute_pre_tax_contributions, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - 
pre_tax_training_year=pre_tax_training_year, - require_pre_tax_contribution_model=require_pre_tax_contribution_model, - ) - if resolved_uprating_mode == PUF_UPRATING_MODE_PE_SOI: - if target_year > PE_PUF_SOI_END_YEAR: - tax_units = uprate_mapped_puf_with_pe_factors( - tax_units, - from_year=PE_PUF_SOI_END_YEAR, - to_year=target_year, - policyengine_us_data_repo=policyengine_us_data_repo, - ) - else: - tax_units = uprate_puf(tax_units, from_year=2015, to_year=target_year) - identifier = ( - raw["RECID"].astype(str).reset_index(drop=True) - if "RECID" in raw.columns - else pd.Series(np.arange(len(raw)).astype(str)) - ) - tax_units = tax_units.reset_index(drop=True) - tax_units["household_id"] = identifier - tax_units["year"] = target_year - tax_units["state_fips"] = 0 - tax_units["tenure"] = 0 - tax_units["household_weight"] = tax_units["weight"].astype(float) - tax_units = _add_derived_income_columns(tax_units) - is_male = tax_units.get("is_male", pd.Series(np.nan, index=tax_units.index)).fillna( - 0 - ) - tax_units["sex"] = np.where(is_male > 0, 1, np.where(is_male == 0, 2, 0)) - tax_units["education"] = 0 - return tax_units - - -def _tax_units_to_persons( - tax_units: pd.DataFrame, - *, - expand_persons_flag: bool, -) -> pd.DataFrame: - """Expand tax units into a person table.""" - if expand_persons_flag: - persons = expand_to_persons(tax_units) - else: - persons = tax_units.copy() - persons["is_head"] = 1 - persons["is_spouse"] = 0 - persons["is_dependent"] = 0 - persons["person_id"] = persons["household_id"].astype(str) + ":head" - persons["tax_unit_id"] = persons["household_id"].astype(str) - persons = persons.reset_index(drop=True) - persons["person_id"] = persons["person_id"].astype(str) - persons["household_id"] = persons["household_id"].astype(str) - persons["year"] = tax_units["year"].iloc[0] if not tax_units.empty else 2024 - if "income" not in persons.columns: - persons["income"] = tax_units["income"] - if "employment_status" not in persons.columns: - 
persons["employment_status"] = tax_units["employment_status"] - if "education" not in persons.columns: - persons["education"] = 0 - if "age" not in persons.columns: - persons["age"] = 0 - if "sex" not in persons.columns: - persons["sex"] = 0 - return persons - - -def _build_puf_observation_frame( - *, - tax_units: pd.DataFrame, - persons: pd.DataFrame, - source_name: str, - shareability: Shareability, -) -> ObservationFrame: - """Build an observation frame from normalized PUF tax units.""" - manifest = load_us_source_manifest("puf") - households = tax_units[ - ["household_id", "year", "state_fips", "tenure", "household_weight"] - ].copy() - person_variable_names = tuple( - column - for column in persons.columns - if column not in {"person_id", "household_id", "weight", "year"} - ) - descriptor = SourceDescriptor( - name=source_name, - shareability=shareability, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=manifest.archetype, - population=manifest.population, - description=manifest.description, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_variable_names, - weight_column="weight" if "weight" in persons.columns else None, - period_column="year", - ), - ), - variable_capabilities=resolve_source_variable_capabilities( - source_name, - ("state_fips", "tenure", *person_variable_names), - ), - ) - frame = ObservationFrame( - source=descriptor, - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - 
-@dataclass -class PUFSourceProvider: - """Source-provider wrapper around the IRS SOI PUF.""" - - target_year: int = 2024 - cache_dir: Path | None = None - puf_path: str | Path | None = None - demographics_path: str | Path | None = None - expand_persons: bool = True - uprating_mode: str = PUF_UPRATING_MODE_INTERPOLATED - cps_reference_year: int | None = None - policyengine_us_data_repo: str | Path | None = None - policyengine_us_data_python: str | Path | None = None - impute_pre_tax_contributions: bool = False - pre_tax_training_year: int = 2024 - soi_path: str | Path | None = None - require_pre_tax_contribution_model: bool = False - social_security_split_strategy: str = SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE - shareability: Shareability = Shareability.PUBLIC - loader: Callable[[Path | None], tuple[Path, Path | None]] | None = None - social_security_share_model_loader: ( - Callable[[int, Path | None], SocialSecurityShareModel] | None - ) = None - _descriptor_cache: SourceDescriptor | None = None - _social_security_share_model_cache: dict[ - tuple[int, str], SocialSecurityShareModel - ] = field( - default_factory=dict, - init=False, - repr=False, - ) - - @property - def descriptor(self) -> SourceDescriptor: - if self._descriptor_cache is not None: - return self._descriptor_cache - manifest = load_us_source_manifest("puf") - person_variables = ("age", "sex", "income") - return SourceDescriptor( - name="irs_soi_puf", - shareability=self.shareability, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=manifest.archetype, - population=manifest.population, - description=manifest.description, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_variables, - weight_column="weight", - 
period_column="year", - ), - ), - variable_capabilities=resolve_source_variable_capabilities( - "irs_soi_puf", - ("state_fips", "tenure", *person_variables), - ), - ) - - def load_frame(self, query: SourceQuery | None = None) -> ObservationFrame: - query = query or SourceQuery() - provider_filters = query.provider_filters - target_year = int(provider_filters.get("target_year", self.target_year)) - expand_persons_flag = bool( - provider_filters.get("expand_persons", self.expand_persons) - ) - cps_reference_year = int( - provider_filters.get( - "cps_reference_year", - self.cps_reference_year or _default_cps_reference_year(target_year), - ) - ) - uprating_mode = _normalize_puf_uprating_mode( - provider_filters.get("uprating_mode", self.uprating_mode) - ) - social_security_split_strategy = _normalize_social_security_split_strategy( - provider_filters.get( - "social_security_split_strategy", - self.social_security_split_strategy, - ) - ) - puf_path = provider_filters.get("puf_path", self.puf_path) - demographics_path = provider_filters.get( - "demographics_path", - self.demographics_path, - ) - policyengine_us_data_repo = provider_filters.get( - "policyengine_us_data_repo", - self.policyengine_us_data_repo, - ) - policyengine_us_data_python = provider_filters.get( - "policyengine_us_data_python", - self.policyengine_us_data_python, - ) - impute_pre_tax_contributions = bool( - provider_filters.get( - "impute_pre_tax_contributions", - self.impute_pre_tax_contributions, - ) - ) - pre_tax_training_year = int( - provider_filters.get( - "pre_tax_training_year", - self.pre_tax_training_year, - ) - ) - soi_path = provider_filters.get("soi_path", self.soi_path) - require_pre_tax_contribution_model = bool( - provider_filters.get( - "require_pre_tax_contribution_model", - self.require_pre_tax_contribution_model, - ) - ) - if puf_path is None: - local_repo_paths = _resolve_policyengine_repo_local_puf_paths( - policyengine_us_data_repo - ) - if local_repo_paths is not None: - 
loaded_puf_path, loaded_demographics_path = local_repo_paths - print(f"Using repo-local PUF from {loaded_puf_path}...") - else: - loader = self.loader or download_puf - loaded_puf_path, loaded_demographics_path = loader(self.cache_dir) - puf_path = loaded_puf_path - if demographics_path is None: - demographics_path = loaded_demographics_path - - raw = load_puf_raw( - Path(puf_path), - Path(demographics_path) if demographics_path is not None else None, - ) - raw = _sample_tax_units( - raw, - sample_n=provider_filters.get("sample_n"), - random_seed=int(provider_filters.get("random_seed", 0)), - ) - tax_units = _build_puf_tax_units( - raw=raw, - target_year=target_year, - random_seed=int(provider_filters.get("random_seed", 0)), - uprating_mode=uprating_mode, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - impute_pre_tax_contributions=impute_pre_tax_contributions, - pre_tax_training_year=pre_tax_training_year, - soi_path=soi_path, - require_pre_tax_contribution_model=require_pre_tax_contribution_model, - ) - persons = _tax_units_to_persons( - tax_units, - expand_persons_flag=expand_persons_flag, - ) - persons = _impute_puf_social_security_components( - persons, - share_model=self._load_social_security_share_model( - cps_reference_year, - social_security_split_strategy, - ), - ) - frame = _build_puf_observation_frame( - tax_units=tax_units, - persons=persons, - source_name=f"irs_soi_puf_{target_year}", - shareability=self.shareability, - ) - self._descriptor_cache = frame.source - return apply_source_query(frame, query) - - def _load_social_security_share_model( - self, - cps_reference_year: int, - strategy: str, - ) -> SocialSecurityShareModel: - cache_key = (cps_reference_year, strategy) - cached = self._social_security_share_model_cache.get(cache_key) - if cached is not None: - return cached - loader = ( - self.social_security_share_model_loader - or 
_strategy_social_security_share_model_loader(strategy) - ) - try: - model = loader(cps_reference_year, self.cache_dir) - except (FileNotFoundError, ImportError, ValueError): - model = _age_heuristic_puf_social_security_share_model() - self._social_security_share_model_cache[cache_key] = model - return model - - -if __name__ == "__main__": - # Test loading - df = load_puf(target_year=2024) - print("\nSample of loaded PUF:") - print(df.head()) - - print("\nIncome variable sums:") - income_vars = [ - "employment_income", - "self_employment_income", - "long_term_capital_gains", - "partnership_s_corp_income", - "gross_social_security", - "taxable_pension_income", - ] - for var in income_vars: - if var in df.columns: - total = (df[var] * df["weight"]).sum() / 1e9 - print(f" {var}: ${total:.1f}B") diff --git a/src/microplex_us/data_sources/sampling.py b/src/microplex_us/data_sources/sampling.py deleted file mode 100644 index 7c7debc2..00000000 --- a/src/microplex_us/data_sources/sampling.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Shared sampling helpers for checkpoint-scale source queries.""" - -from __future__ import annotations - -import numpy as np -import pandas as pd - - -def sample_frame_without_replacement( - frame: pd.DataFrame, - *, - sample_n: int | None, - random_seed: int, - weight_col: str | None = None, - positive_only_when_weighted: bool = False, -) -> pd.DataFrame: - """Sample rows without replacement, preserving existing weighting behavior.""" - - result = frame.copy() - if sample_n is None or sample_n >= len(result): - return result - - sample_source = result - sample_weights: pd.Series | None = None - if weight_col is not None and weight_col in result.columns: - candidate_weights = ( - pd.to_numeric(result[weight_col], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - positive_mask = candidate_weights > 0.0 - if candidate_weights.sum() > 0.0 and int(positive_mask.sum()) >= sample_n: - if positive_only_when_weighted: - sample_source = 
result.loc[positive_mask].copy() - sample_weights = candidate_weights.loc[positive_mask] - else: - sample_weights = candidate_weights - - if sample_n >= len(sample_source): - return sample_source.copy() - - try: - return sample_source.sample( - n=sample_n, - random_state=random_seed, - replace=False, - weights=sample_weights, - ).copy() - except ValueError: - return sample_source.sample( - n=sample_n, - random_state=random_seed, - replace=False, - weights=None, - ).copy() - - -def sample_frame_with_state_floor( - frame: pd.DataFrame, - *, - sample_n: int | None, - random_seed: int, - weight_col: str | None = None, - state_col: str = "state_fips", - state_floor: int | None = None, - positive_only_when_weighted: bool = False, -) -> pd.DataFrame: - """Sample rows while guaranteeing a small minimum from each observed state.""" - - result = frame.copy() - if sample_n is None or sample_n >= len(result): - return result - resolved_floor = int(state_floor or 0) - if resolved_floor <= 0 or state_col not in result.columns: - return sample_frame_without_replacement( - result, - sample_n=sample_n, - random_seed=random_seed, - weight_col=weight_col, - positive_only_when_weighted=positive_only_when_weighted, - ) - - state_values = pd.to_numeric(result[state_col], errors="coerce") - eligible = result.loc[state_values.notna()].copy() - if eligible.empty: - return sample_frame_without_replacement( - result, - sample_n=sample_n, - random_seed=random_seed, - weight_col=weight_col, - positive_only_when_weighted=positive_only_when_weighted, - ) - - eligible["_sampling_state_key"] = state_values.loc[eligible.index].astype(int) - groups = [ - group.copy() - for _, group in eligible.groupby("_sampling_state_key", sort=True, dropna=False) - ] - minimum_required = sum(min(len(group), resolved_floor) for group in groups) - if minimum_required > sample_n: - raise ValueError( - "state_floor requires more rows than sample_n allows: " - f"floor={resolved_floor}, required={minimum_required}, 
sample_n={sample_n}" - ) - - rng = np.random.default_rng(random_seed) - floor_samples: list[pd.DataFrame] = [] - for group in groups: - group_floor = min(len(group), resolved_floor) - if group_floor <= 0: - continue - floor_samples.append( - sample_frame_without_replacement( - group.drop(columns="_sampling_state_key"), - sample_n=group_floor, - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col=weight_col, - positive_only_when_weighted=positive_only_when_weighted, - ) - ) - floor_sample = ( - pd.concat(floor_samples, axis=0, ignore_index=False) - if floor_samples - else result.iloc[0:0].copy() - ) - selected_index = pd.Index(floor_sample.index.unique()) - remaining_n = int(sample_n) - len(selected_index) - if remaining_n <= 0: - return floor_sample.copy() - - remainder = result.drop(index=selected_index, errors="ignore") - if remainder.empty: - return floor_sample.copy() - remainder_sample = sample_frame_without_replacement( - remainder, - sample_n=remaining_n, - random_seed=int(rng.integers(0, np.iinfo(np.int32).max)), - weight_col=weight_col, - positive_only_when_weighted=positive_only_when_weighted, - ) - return pd.concat([floor_sample, remainder_sample], axis=0, ignore_index=False) diff --git a/src/microplex_us/data_sources/share_imputation.py b/src/microplex_us/data_sources/share_imputation.py deleted file mode 100644 index 3efa8c15..00000000 --- a/src/microplex_us/data_sources/share_imputation.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Reusable grouped-share imputation utilities for decomposable value families.""" - -from __future__ import annotations - -from dataclasses import dataclass - -import numpy as np -import pandas as pd -from pandas import CategoricalDtype - - -@dataclass -class GroupedShareModel: - """Weighted grouped-share model with hierarchical fallbacks.""" - - explicit_component_columns: tuple[str, ...] - implicit_component_column: str | None - feature_sets: tuple[tuple[str, ...], ...] 
- group_share_tables: dict[tuple[str, ...], pd.DataFrame] - overall_explicit_shares: dict[str, float] - - @property - def component_columns(self) -> tuple[str, ...]: - if self.implicit_component_column is None: - return self.explicit_component_columns - return (*self.explicit_component_columns, self.implicit_component_column) - - -def _nonnegative_series(frame: pd.DataFrame, column: str) -> pd.Series: - if column not in frame.columns: - return pd.Series(0.0, index=frame.index, dtype=float) - return pd.to_numeric(frame[column], errors="coerce").fillna(0.0).clip(lower=0.0) - - -def _normalized_feature_frame( - frame: pd.DataFrame, - feature_columns: tuple[str, ...], -) -> pd.DataFrame: - result = frame.loc[:, list(feature_columns)].copy() - for column in feature_columns: - series = result[column] - if isinstance(series.dtype, CategoricalDtype): - series = series.astype(object) - result[column] = series.where(pd.notna(series), None) - return result - - -def fit_grouped_share_model( - reference: pd.DataFrame, - *, - explicit_component_columns: tuple[str, ...], - implicit_component_column: str | None = None, - feature_sets: tuple[tuple[str, ...], ...], - weight_column: str = "weight", -) -> GroupedShareModel: - """Fit weighted grouped shares over explicit components. - - The implicit component, when provided, is computed as the remaining share. 
- """ - - if not explicit_component_columns: - raise ValueError("Grouped share model requires at least one explicit component") - all_components = explicit_component_columns + ( - (implicit_component_column,) if implicit_component_column is not None else () - ) - component_total = sum( - _nonnegative_series(reference, component) - for component in all_components - ) - positive_mask = component_total > 0.0 - if not positive_mask.any(): - raise ValueError("Grouped share model requires at least one positive training row") - - positive = reference.loc[positive_mask].copy() - weights = _nonnegative_series(positive, weight_column) - weight_sum = float(weights.sum()) - if weight_sum <= 0.0: - weights = pd.Series(1.0, index=positive.index, dtype=float) - weight_sum = float(weights.sum()) - - positive_total = component_total.loc[positive.index].where( - component_total.loc[positive.index] > 0.0, - 1.0, - ) - share_columns: list[str] = [] - work = positive.copy() - for component in explicit_component_columns: - share_column = f"__share_{component}" - share_columns.append(share_column) - work[share_column] = ( - _nonnegative_series(positive, component) / positive_total - ).astype(float) - work["__weight"] = weights.astype(float) - - overall_explicit_shares: dict[str, float] = {} - for component, share_column in zip( - explicit_component_columns, - share_columns, - strict=True, - ): - overall_explicit_shares[component] = float( - np.average(work[share_column], weights=work["__weight"]) - ) - overall_sum = sum(overall_explicit_shares.values()) - if overall_sum > 1.0: - overall_explicit_shares = { - component: value / overall_sum - for component, value in overall_explicit_shares.items() - } - - group_share_tables: dict[tuple[str, ...], pd.DataFrame] = {} - for feature_set in feature_sets: - if not feature_set: - continue - features = _normalized_feature_frame(work, feature_set) - grouped = pd.concat([features, work[share_columns], work[["__weight"]]], axis=1) - weighted_columns 
= [] - for share_column in share_columns: - weighted_column = f"{share_column}__weighted" - grouped[weighted_column] = grouped[share_column] * grouped["__weight"] - weighted_columns.append(weighted_column) - aggregated = ( - grouped.groupby(list(feature_set), dropna=False, observed=False)[ - [*weighted_columns, "__weight"] - ] - .sum() - .reset_index() - ) - for component, share_column, weighted_column in zip( - explicit_component_columns, - share_columns, - weighted_columns, - strict=True, - ): - aggregated[component] = np.where( - aggregated["__weight"] > 0.0, - aggregated[weighted_column] / aggregated["__weight"], - 0.0, - ) - group_share_tables[feature_set] = aggregated.loc[ - :, - [*feature_set, *explicit_component_columns], - ] - - return GroupedShareModel( - explicit_component_columns=explicit_component_columns, - implicit_component_column=implicit_component_column, - feature_sets=feature_sets, - group_share_tables=group_share_tables, - overall_explicit_shares=overall_explicit_shares, - ) - - -def predict_grouped_component_shares( - target: pd.DataFrame, - model: GroupedShareModel, -) -> pd.DataFrame: - """Predict MECE component shares for the target frame.""" - - result = pd.DataFrame(index=target.index) - explicit_columns = list(model.explicit_component_columns) - unresolved = pd.Series(True, index=target.index, dtype=bool) - - for feature_set in model.feature_sets: - if not feature_set: - continue - table = model.group_share_tables.get(feature_set) - if table is None or table.empty: - continue - feature_frame = _normalized_feature_frame(target.loc[unresolved], feature_set) - feature_frame["__row_id"] = feature_frame.index - merged = feature_frame.merge( - table, - on=list(feature_set), - how="left", - sort=False, - ).set_index("__row_id") - merged = merged.reindex(feature_frame.index) - matched_mask = merged[explicit_columns].notna().all(axis=1) - if not matched_mask.any(): - continue - matched_index = matched_mask.index[matched_mask] - 
result.loc[matched_index, explicit_columns] = merged.loc[ - matched_index, - explicit_columns, - ] - unresolved.loc[matched_index] = False - - for component in explicit_columns: - default_series = pd.Series( - model.overall_explicit_shares[component], - index=result.index, - dtype=float, - ) - result[component] = pd.to_numeric( - result.get(component, default_series), - errors="coerce", - ).fillna(model.overall_explicit_shares[component]) - result[component] = result[component].clip(lower=0.0, upper=1.0) - - explicit_sum = result[explicit_columns].sum(axis=1) - overfull_mask = explicit_sum > 1.0 - if overfull_mask.any(): - result.loc[overfull_mask, explicit_columns] = result.loc[ - overfull_mask, - explicit_columns, - ].div(explicit_sum.loc[overfull_mask], axis=0) - explicit_sum = result[explicit_columns].sum(axis=1) - - if model.implicit_component_column is not None: - result[model.implicit_component_column] = (1.0 - explicit_sum).clip( - lower=0.0, - upper=1.0, - ) - - return result.loc[:, list(model.component_columns)] diff --git a/src/microplex_us/geography.py b/src/microplex_us/geography.py deleted file mode 100644 index 6e89ee22..00000000 --- a/src/microplex_us/geography.py +++ /dev/null @@ -1,763 +0,0 @@ -"""US-specific Census block geography helpers.""" - -from __future__ import annotations - -from functools import lru_cache -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.geography import ( - AtomicGeographyCrosswalk, - GeographyProvider, - GeographyQuery, - ProbabilisticAtomicGeographyAssigner, - nearest_numeric_partition_key, -) - -STATE_LEN = 2 -COUNTY_LEN = 3 -TRACT_LEN = 6 -BLOCK_LEN = 4 - -STATE_GEOID_LEN = STATE_LEN -COUNTY_GEOID_LEN = STATE_LEN + COUNTY_LEN -TRACT_GEOID_LEN = STATE_LEN + COUNTY_LEN + TRACT_LEN -BLOCK_GEOID_LEN = STATE_LEN + COUNTY_LEN + TRACT_LEN + BLOCK_LEN - -PACKAGE_ROOT = Path(__file__).resolve().parents[2] -DEFAULT_DATA_DIR_CANDIDATES = ( - PACKAGE_ROOT / "data", - 
PACKAGE_ROOT.parent / "microplex" / "data", -) -DEFAULT_DATA_DIR = next( - ( - candidate - for candidate in DEFAULT_DATA_DIR_CANDIDATES - if (candidate / "block_probabilities.parquet").exists() - ), - next( - (candidate for candidate in DEFAULT_DATA_DIR_CANDIDATES if candidate.exists()), - DEFAULT_DATA_DIR_CANDIDATES[0], - ), -) -DEFAULT_BLOCK_PROBABILITIES_PATH = DEFAULT_DATA_DIR / "block_probabilities.parquet" -DEFAULT_BLOCK_PROBABILITIES_SPM_GEOGRAPHY_PATH = ( - DEFAULT_DATA_DIR / "block_probabilities_spm_geography.parquet" -) -SPM_METRO_AREA_COLUMN = "spm_metro_area" -CENSUS_CBSA_DELINEATION_URL = ( - "https://www2.census.gov/programs-surveys/metro-micro/geographies/" - "reference-files/2020/delineation-files/list1_2020.xls" -) -US_STATE_ABBR_BY_FIPS = { - "01": "AL", - "02": "AK", - "04": "AZ", - "05": "AR", - "06": "CA", - "08": "CO", - "09": "CT", - "10": "DE", - "11": "DC", - "12": "FL", - "13": "GA", - "15": "HI", - "16": "ID", - "17": "IL", - "18": "IN", - "19": "IA", - "20": "KS", - "21": "KY", - "22": "LA", - "23": "ME", - "24": "MD", - "25": "MA", - "26": "MI", - "27": "MN", - "28": "MS", - "29": "MO", - "30": "MT", - "31": "NE", - "32": "NV", - "33": "NH", - "34": "NJ", - "35": "NM", - "36": "NY", - "37": "NC", - "38": "ND", - "39": "OH", - "40": "OK", - "41": "OR", - "42": "PA", - "44": "RI", - "45": "SC", - "46": "SD", - "47": "TN", - "48": "TX", - "49": "UT", - "50": "VT", - "51": "VA", - "53": "WA", - "54": "WV", - "55": "WI", - "56": "WY", - "72": "PR", -} - - -def load_block_probabilities(path: str | Path | None = None) -> pd.DataFrame: - """Load US Census block probabilities from parquet.""" - path = default_runtime_block_probabilities_path() if path is None else Path(path) - if path is None: - path = DEFAULT_BLOCK_PROBABILITIES_PATH - if not path.exists(): - raise FileNotFoundError( - f"Block probabilities file not found at {path}.\n" - "Run the US geography preparation pipeline first." 
- ) - return pd.read_parquet(path) - - -def default_runtime_block_probabilities_path() -> Path | None: - """Return the preferred runtime block probabilities file when available.""" - if DEFAULT_BLOCK_PROBABILITIES_SPM_GEOGRAPHY_PATH.exists(): - return DEFAULT_BLOCK_PROBABILITIES_SPM_GEOGRAPHY_PATH - if DEFAULT_BLOCK_PROBABILITIES_PATH.exists(): - return DEFAULT_BLOCK_PROBABILITIES_PATH - return None - - -def normalize_us_state_fips(value: Any) -> str: - """Normalize US state FIPS values to two-character strings.""" - return str(int(round(float(value)))).zfill(2) - - -def _normalize_us_state_fips_series(values: pd.Series) -> pd.Series: - numeric = pd.to_numeric(values, errors="coerce") - normalized = numeric.round().astype("Int64").astype("string").str.zfill(2) - return normalized.mask(numeric.isna()) - - -def normalize_us_county_fips(value: Any) -> str | None: - """Normalize US county FIPS values to five-character strings.""" - try: - if pd.isna(value): - return None - text = str(value).strip() - if not text: - return None - numeric = pd.to_numeric(pd.Series([text]), errors="coerce").iloc[0] - if pd.notna(numeric): - return str(int(round(float(numeric)))).zfill(COUNTY_GEOID_LEN) - digits = "".join(character for character in text if character.isdigit()) - return digits.zfill(COUNTY_GEOID_LEN) if digits else None - except (TypeError, ValueError, OverflowError): - return None - - -def normalize_state_legislative_district_id( - value: Any, - *, - chamber: str | None = None, -) -> str | None: - """Normalize state legislative district IDs to STATE-SLDU/SLDL-NNN.""" - if value is None: - return None - try: - if pd.isna(value): - return None - except (TypeError, ValueError): - pass - raw = str(value).strip() - if not raw: - return None - - labeled = _normalize_labeled_state_legislative_district_id(raw) - if labeled is not None: - return labeled - - if raw.startswith("610U900US"): - raw = raw[-5:] - chamber = "upper" - elif raw.startswith("620L900US"): - raw = raw[-5:] - 
chamber = "lower" - - if raw.isdigit() and len(raw) >= 5 and chamber in {"upper", "lower"}: - state_fips = raw[:2] - district_code = raw[2:] - state_abbr = US_STATE_ABBR_BY_FIPS.get(state_fips, state_fips) - chamber_label = "SLDU" if chamber == "upper" else "SLDL" - return f"{state_abbr}-{chamber_label}-{_normalize_sld_district(district_code)}" - - return raw - - -def _normalize_labeled_state_legislative_district_id(raw: str) -> str | None: - parts = raw.split("-") - if len(parts) != 3: - return None - state, chamber_label, district_code = (part.strip() for part in parts) - if not state or not chamber_label or not district_code: - return None - canonical_chamber_label = { - "SLDU": "SLDU", - "SD": "SLDU", - "SLDL": "SLDL", - "HD": "SLDL", - "AD": "SLDL", - }.get(chamber_label.upper()) - if canonical_chamber_label is None: - return None - return ( - f"{state.upper()}-{canonical_chamber_label}-" - f"{_normalize_sld_district(district_code)}" - ) - - -def _normalize_sld_district(value: Any) -> str: - text = str(value).strip() - if text.endswith(".0") and text[:-2].isdigit(): - text = text[:-2] - return f"{int(text):03d}" if text.isdigit() else text - - -@lru_cache(maxsize=8) -def _spm_metro_area_codes(year: int) -> frozenset[str]: - try: - from spm_calculator.geoadj import list_metro_areas - except ImportError: - return frozenset() - return frozenset(str(area["code"]) for area in list_metro_areas(year)) - - -def state_nonmetro_spm_area_code( - state_fips: Any, - *, - year: int = 2024, -) -> str | None: - """Return the Census SPM state-nonmetro area code when one exists.""" - return _state_spm_area_code(state_fips, suffix=2, year=year) - - -def _state_spm_area_code( - state_fips: Any, - *, - suffix: int, - year: int, -) -> str | None: - try: - normalized_state = normalize_us_state_fips(state_fips) - except (TypeError, ValueError, OverflowError): - return None - if normalized_state == "00": - return None - code = str(int(normalized_state) * 1_000 + suffix) - return code 
if code in _spm_metro_area_codes(year) else None - - -def _state_spm_area_code_series( - values: pd.Series, - *, - suffix: int, - year: int, -) -> pd.Series: - states = _normalize_us_state_fips_series(values) - numeric_states = pd.to_numeric(states, errors="coerce") - codes = (numeric_states * 1_000 + suffix).round().astype("Int64").astype("string") - valid_codes = _spm_metro_area_codes(year) - return codes.where( - states.notna() & states.ne("00") & codes.isin(valid_codes) - ).astype("string") - - -@lru_cache(maxsize=1) -def _census_cbsa_crosswalk() -> dict[str, str] | None: - """Load county-to-CBSA crosswalk from Census' official delineation file.""" - try: - delineation = pd.read_excel( - CENSUS_CBSA_DELINEATION_URL, - header=2, - dtype=str, - ) - except Exception: - return None - - required_columns = {"CBSA Code", "FIPS State Code", "FIPS County Code"} - if not required_columns.issubset(delineation.columns): - return None - - rows = delineation.dropna( - subset=["CBSA Code", "FIPS State Code", "FIPS County Code"] - ).copy() - rows["county_fips"] = rows["FIPS State Code"].astype(str).str.zfill(2) + rows[ - "FIPS County Code" - ].astype(str).str.zfill(3) - rows["cbsa_code"] = rows["CBSA Code"].astype(str).str.strip() - return dict(zip(rows["county_fips"], rows["cbsa_code"], strict=False)) - - -def _normalize_census_area_code(value: Any) -> str | None: - if value is None or pd.isna(value): - return None - text = str(value).strip() - if not text or text.lower() in {"nan", "none", ""} or text in {"0", "00000"}: - return None - numeric = pd.to_numeric(pd.Series([text]), errors="coerce").iloc[0] - return str(int(round(float(numeric)))) if pd.notna(numeric) else text - - -def _normalize_census_area_code_series(values: pd.Series) -> pd.Series: - result = values.astype("string").str.strip() - invalid = ( - result.isna() - | result.str.lower().isin({"", "nan", "none", ""}) - | result.isin({"0", "00000"}) - ) - numeric = pd.to_numeric(result, errors="coerce") - 
numeric_codes = numeric.round().astype("Int64").astype("string") - result = result.mask(numeric.notna(), numeric_codes) - return result.mask(invalid).astype("string") - - -def _normalize_spm_area_code(value: Any, *, year: int) -> str | None: - code = _normalize_census_area_code(value) - if code is None: - return None - return code if code in _spm_metro_area_codes(year) else None - - -def add_spm_metro_area_geography( - frame: pd.DataFrame, - *, - year: int = 2024, - county_column: str = "county_fips", - state_column: str = "state_fips", - cbsa_column: str = "cbsa_code", - spm_metro_area_column: str = SPM_METRO_AREA_COLUMN, - derive_cbsa_from_primary_source: bool = True, -) -> pd.DataFrame: - """Attach Census SPM metro/nonmetro area IDs from block-derived geography. - - The final SPM threshold geography is a Census metropolitan area code when - SPM publishes one; otherwise it is a state metro/nonmetro SPM code. We only - classify blank, micropolitan, or unsupported CBSA values into a state area - when they came from a trusted CBSA source: either an existing CBSA column or - the Census delineation input. 
- """ - if frame.empty: - result = frame.copy() - if spm_metro_area_column not in result.columns: - result[spm_metro_area_column] = pd.Series(dtype="string") - return result - - result = frame.copy() - trusted_cbsa_source = cbsa_column in result.columns - if cbsa_column in result.columns: - cbsa_values = _normalize_census_area_code_series(result[cbsa_column]) - else: - cbsa_values = pd.Series(pd.NA, index=result.index, dtype="string") - - if ( - derive_cbsa_from_primary_source - and county_column in result.columns - and cbsa_values.isna().any() - ): - cbsa_crosswalk = _census_cbsa_crosswalk() - if cbsa_crosswalk is not None: - trusted_cbsa_source = True - county_values = result[county_column].map(normalize_us_county_fips) - derived_cbsa = county_values.map(cbsa_crosswalk) - cbsa_values = cbsa_values.combine_first( - _normalize_census_area_code_series(derived_cbsa) - ) - - result[cbsa_column] = cbsa_values.astype("string") - - spm_area = cbsa_values.where(cbsa_values.isin(_spm_metro_area_codes(year))).astype( - "string" - ) - if trusted_cbsa_source and state_column in result.columns: - nonmetro_codes = _state_spm_area_code_series( - result[state_column], - suffix=2, - year=year, - ) - state_metro_codes = _state_spm_area_code_series( - result[state_column], - suffix=1, - year=year, - ) - state_fallback_codes = nonmetro_codes.combine_first( - state_metro_codes.where(cbsa_values.notna()) - ) - spm_area = spm_area.combine_first(state_fallback_codes) - - result[spm_metro_area_column] = spm_area - return result - - -def derive_geographies( - block_geoids: list[str] | np.ndarray | pd.Series, - include_cd: bool = False, - include_sld: bool = False, - include_spm_metro_area: bool = False, - block_data: pd.DataFrame | None = None, - year: int = 2024, -) -> pd.DataFrame: - """Derive parent geographies from Census block GEOIDs.""" - geoids = pd.Series(block_geoids).astype(str) - result = pd.DataFrame( - { - "block_geoid": geoids, - "state_fips": geoids.str[:STATE_GEOID_LEN], - 
"county_fips": geoids.str[:COUNTY_GEOID_LEN], - "tract_geoid": geoids.str[:TRACT_GEOID_LEN], - } - ) - if include_cd or include_sld: - block_data = load_block_probabilities() if block_data is None else block_data - if include_cd: - result["cd_id"] = geoids.map( - dict(zip(block_data["geoid"], block_data["cd_id"])) - ) - if include_sld: - if "sldu_id" in block_data.columns: - result["sldu_id"] = geoids.map( - dict(zip(block_data["geoid"], block_data["sldu_id"])) - ) - if "sldl_id" in block_data.columns: - result["sldl_id"] = geoids.map( - dict(zip(block_data["geoid"], block_data["sldl_id"])) - ) - if include_spm_metro_area: - result = add_spm_metro_area_geography(result, year=year) - return result - - -class BlockGeography(GeographyProvider): - """US atomic-geography provider backed by Census blocks.""" - - def __init__( - self, - data_path: str | Path | None = None, - lazy_load: bool = True, - ): - self._data_path = data_path - self._data: pd.DataFrame | None = None - self._cd_lookup: dict[str, str] | None = None - self._sldu_lookup: dict[str, str] | None = None - self._sldl_lookup: dict[str, str] | None = None - self._state_blocks: dict[str, pd.DataFrame] | None = None - if not lazy_load: - self._load_data() - - @classmethod - def from_data(cls, data: pd.DataFrame) -> BlockGeography: - instance = cls(lazy_load=True) - instance._data = data.copy() - return instance - - def _load_data(self) -> None: - if self._data is None: - self._data = load_block_probabilities(self._data_path) - - @property - def data(self) -> pd.DataFrame: - if self._data is None: - self._load_data() - return self._data - - @staticmethod - @lru_cache(maxsize=100000) - def get_state(block_geoid: str) -> str: - return block_geoid[:STATE_GEOID_LEN] - - @staticmethod - @lru_cache(maxsize=100000) - def get_county(block_geoid: str) -> str: - return block_geoid[:COUNTY_GEOID_LEN] - - @staticmethod - @lru_cache(maxsize=100000) - def get_tract(block_geoid: str) -> str: - return 
block_geoid[:TRACT_GEOID_LEN] - - def get_cd(self, block_geoid: str) -> str | None: - if self._cd_lookup is None: - self._build_lookups() - return self._cd_lookup.get(block_geoid) - - def get_sldu(self, block_geoid: str) -> str | None: - if self._sldu_lookup is None: - self._build_lookups() - return self._sldu_lookup.get(block_geoid) - - def get_sldl(self, block_geoid: str) -> str | None: - if self._sldl_lookup is None: - self._build_lookups() - return self._sldl_lookup.get(block_geoid) - - def _build_lookups(self) -> None: - self._cd_lookup = dict(zip(self.data["geoid"], self.data["cd_id"])) - self._sldu_lookup = ( - dict(zip(self.data["geoid"], self.data["sldu_id"])) - if "sldu_id" in self.data.columns - else {} - ) - self._sldl_lookup = ( - dict(zip(self.data["geoid"], self.data["sldl_id"])) - if "sldl_id" in self.data.columns - else {} - ) - - def get_all_geographies(self, block_geoid: str) -> dict[str, str | None]: - return { - "state_fips": self.get_state(block_geoid), - "county_fips": self.get_county(block_geoid), - "tract_geoid": self.get_tract(block_geoid), - "cd_id": self.get_cd(block_geoid), - "sldu_id": self.get_sldu(block_geoid), - "sldl_id": self.get_sldl(block_geoid), - "spm_metro_area": self._get_spm_metro_area(block_geoid), - } - - def _get_spm_metro_area(self, block_geoid: str) -> str | None: - if SPM_METRO_AREA_COLUMN in self.data.columns: - match = self.data.loc[self.data["geoid"].astype(str).eq(block_geoid)] - if not match.empty: - value = match[SPM_METRO_AREA_COLUMN].iloc[0] - return None if pd.isna(value) else str(value) - - rows = { - "state_fips": [self.get_state(block_geoid)], - "county_fips": [self.get_county(block_geoid)], - } - if "cbsa_code" in self.data.columns: - match = self.data.loc[self.data["geoid"].astype(str).eq(block_geoid)] - if not match.empty: - rows["cbsa_code"] = [match["cbsa_code"].iloc[0]] - value = add_spm_metro_area_geography( - pd.DataFrame(rows), - derive_cbsa_from_primary_source=False, - )["spm_metro_area"].iloc[0] 
- return None if pd.isna(value) else str(value) - - def to_crosswalk(self) -> AtomicGeographyCrosswalk: - crosswalk = self.data.copy() - if "county_fips" not in crosswalk.columns and {"state_fips", "county"}.issubset( - crosswalk.columns - ): - crosswalk["county_fips"] = crosswalk["state_fips"].astype(str) + crosswalk[ - "county" - ].astype(str) - if "tract_geoid" not in crosswalk.columns and { - "state_fips", - "county", - "tract", - }.issubset(crosswalk.columns): - crosswalk["tract_geoid"] = ( - crosswalk["state_fips"].astype(str) - + crosswalk["county"].astype(str) - + crosswalk["tract"].astype(str) - ) - geography_columns = tuple( - column - for column in ( - "state_fips", - "county_fips", - "tract_geoid", - "cd_id", - "sldu_id", - "sldl_id", - "cbsa_code", - SPM_METRO_AREA_COLUMN, - ) - if column in crosswalk.columns - ) - return AtomicGeographyCrosswalk( - data=crosswalk.rename(columns={"geoid": "block_geoid"}), - atomic_id_column="block_geoid", - geography_columns=geography_columns, - probability_column="prob" if "prob" in crosswalk.columns else None, - ) - - def load_crosswalk( - self, query: GeographyQuery | None = None - ) -> AtomicGeographyCrosswalk: - query = query or GeographyQuery() - crosswalk = self.to_crosswalk() - if not query.geography_columns and query.probability_column is None: - return crosswalk - return AtomicGeographyCrosswalk( - data=crosswalk.data.copy(), - atomic_id_column=crosswalk.atomic_id_column, - geography_columns=tuple(query.geography_columns) - or crosswalk.geography_columns, - probability_column=query.probability_column or crosswalk.probability_column, - ) - - def load_assigner( - self, - query: GeographyQuery | None = None, - ) -> ProbabilisticAtomicGeographyAssigner: - query = query or GeographyQuery() - partition_columns = tuple(query.partition_columns) or ("state_fips",) - partition_normalizers = dict(query.partition_normalizers) - fallback_resolver = query.fallback_resolver - if partition_columns == ("state_fips",): - 
partition_normalizers.setdefault("state_fips", normalize_us_state_fips) - if fallback_resolver is None: - fallback_resolver = nearest_numeric_partition_key - return ProbabilisticAtomicGeographyAssigner( - crosswalk=self.load_crosswalk(query), - partition_columns=partition_columns, - probability_column=query.probability_column, - partition_normalizers=partition_normalizers, - fallback_resolver=fallback_resolver, - ) - - def assign( - self, - frame: pd.DataFrame, - *, - state_column: str = "state_fips", - atomic_id_column: str = "block_geoid", - random_state: int | None = None, - ) -> pd.DataFrame: - working = frame.copy() - if state_column != "state_fips": - working = working.rename(columns={state_column: "state_fips"}) - assigned = self.load_assigner().assign( - working, - atomic_id_column=atomic_id_column, - random_state=random_state, - ) - if state_column != "state_fips": - assigned = assigned.rename(columns={"state_fips": state_column}) - return assigned - - def materialize( - self, - frame: pd.DataFrame, - *, - columns: tuple[str, ...] 
| list[str] | None = None, - atomic_id_column: str = "block_geoid", - ) -> pd.DataFrame: - return self.to_crosswalk().materialize( - frame, - columns=columns, - atomic_id_column=atomic_id_column, - ) - - def sample_blocks( - self, - state_fips: str, - n: int, - replace: bool = True, - random_state: int | None = None, - ) -> np.ndarray: - if self._state_blocks is None: - self._build_state_index() - if state_fips not in self._state_blocks: - raise ValueError(f"State FIPS '{state_fips}' not found in block data.") - state_df = self._state_blocks[state_fips] - if random_state is not None: - np.random.seed(random_state) - sampled_indices = np.random.choice( - len(state_df), - size=n, - replace=replace, - p=state_df["prob"].values, - ) - geoids = state_df["geoid"].astype(str).to_numpy() - return np.asarray(geoids[sampled_indices]) - - def _build_state_index(self) -> None: - self._state_blocks = {} - for state_fips, group in self.data.groupby("state_fips"): - self._state_blocks[state_fips] = group[["geoid", "prob"]].copy() - - def sample_blocks_national( - self, - n: int, - replace: bool = True, - random_state: int | None = None, - ) -> np.ndarray: - if random_state is not None: - np.random.seed(random_state) - sampled_indices = np.random.choice( - len(self.data), - size=n, - replace=replace, - p=self.data["national_prob"].values, - ) - geoids = self.data["geoid"].astype(str).to_numpy() - return np.asarray(geoids[sampled_indices]) - - def get_blocks_in_state(self, state_fips: str) -> pd.DataFrame: - return self.data[self.data["state_fips"] == state_fips].copy() - - def get_blocks_in_county(self, county_fips: str) -> pd.DataFrame: - state = county_fips[:STATE_GEOID_LEN] - county = county_fips[STATE_GEOID_LEN:] - return self.data[ - (self.data["state_fips"] == state) & (self.data["county"] == county) - ].copy() - - def get_blocks_in_tract(self, tract_geoid: str) -> pd.DataFrame: - return self.data[self.data["tract_geoid"] == tract_geoid].copy() - - def get_blocks_in_cd(self, 
cd_id: str) -> pd.DataFrame: - return self.data[self.data["cd_id"] == cd_id].copy() - - def get_blocks_in_sldu(self, sldu_id: str) -> pd.DataFrame: - if "sldu_id" not in self.data.columns: - return pd.DataFrame() - return self.data[self.data["sldu_id"] == sldu_id].copy() - - def get_blocks_in_sldl(self, sldl_id: str) -> pd.DataFrame: - if "sldl_id" not in self.data.columns: - return pd.DataFrame() - return self.data[self.data["sldl_id"] == sldl_id].copy() - - @property - def states(self) -> list[str]: - return sorted(self.data["state_fips"].unique()) - - @property - def n_blocks(self) -> int: - return len(self.data) - - def __repr__(self) -> str: - if self._data is None: - return "BlockGeography(not loaded)" - return f"BlockGeography({self.n_blocks:,} blocks, {len(self.states)} states)" - - -__all__ = [ - "STATE_LEN", - "COUNTY_LEN", - "TRACT_LEN", - "BLOCK_LEN", - "STATE_GEOID_LEN", - "COUNTY_GEOID_LEN", - "TRACT_GEOID_LEN", - "BLOCK_GEOID_LEN", - "DEFAULT_DATA_DIR", - "DEFAULT_BLOCK_PROBABILITIES_PATH", - "DEFAULT_BLOCK_PROBABILITIES_SPM_GEOGRAPHY_PATH", - "CENSUS_CBSA_DELINEATION_URL", - "SPM_METRO_AREA_COLUMN", - "default_runtime_block_probabilities_path", - "load_block_probabilities", - "normalize_us_state_fips", - "normalize_us_county_fips", - "normalize_state_legislative_district_id", - "state_nonmetro_spm_area_code", - "add_spm_metro_area_geography", - "derive_geographies", - "BlockGeography", -] diff --git a/src/microplex_us/hierarchical.py b/src/microplex_us/hierarchical.py deleted file mode 100644 index 51b14668..00000000 --- a/src/microplex_us/hierarchical.py +++ /dev/null @@ -1,31 +0,0 @@ -"""US-specific preprocessing helpers around the generic hierarchical synthesizer.""" - -from __future__ import annotations - -import pandas as pd - - -def prepare_cps_for_hierarchical( - cps_person_data: pd.DataFrame, - hh_id_col: str = "household_id", -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Aggregate CPS person rows into household summaries for hierarchical 
synthesis.""" - persons = cps_person_data.copy() - - household_summary = persons.groupby(hh_id_col).agg( - { - "age": ["count", lambda values: (values >= 18).sum(), lambda values: (values < 18).sum()], - } - ) - household_summary.columns = ["n_persons", "n_adults", "n_children"] - household_summary = household_summary.reset_index() - - for variable in ["state_fips", "tenure", "hh_weight"]: - if variable in persons.columns: - first_values = persons.groupby(hh_id_col)[variable].first() - household_summary[variable] = household_summary[hh_id_col].map(first_values) - - return household_summary, persons - - -__all__ = ["prepare_cps_for_hierarchical"] diff --git a/src/microplex_us/pipelines/ecps_export_contract.json b/src/microplex_us/manifests/ecps_export_contract.json similarity index 100% rename from src/microplex_us/pipelines/ecps_export_contract.json rename to src/microplex_us/manifests/ecps_export_contract.json diff --git a/src/microplex_us/pipelines/frozen_production_ecps_2024_benchmark_manifest.json b/src/microplex_us/manifests/frozen_production_ecps_2024_benchmark_manifest.json similarity index 100% rename from src/microplex_us/pipelines/frozen_production_ecps_2024_benchmark_manifest.json rename to src/microplex_us/manifests/frozen_production_ecps_2024_benchmark_manifest.json diff --git a/src/microplex_us/microdata_roles.py b/src/microplex_us/microdata_roles.py deleted file mode 100644 index 3f4519c3..00000000 --- a/src/microplex_us/microdata_roles.py +++ /dev/null @@ -1,203 +0,0 @@ -"""Source-specific microdata variable role metadata. - -This is the Microplex-side bridge to the richer Arch source-data contract: -Arch preserves what a source says, while Microplex decides which source columns -are model inputs versus source-reported outputs or diagnostics. 
-""" - -from __future__ import annotations - -from enum import Enum - - -class MicrodataVariableRole(Enum): - """How Microplex should treat one source-native microdata variable.""" - - SOURCE_INPUT = "source_input" - REPORTED_RETURN_LINE_INPUT = "reported_return_line_input" - CALCULATED_TAX_OUTPUT = "calculated_tax_output" - - -class PolicyEngineUSVariableRole(Enum): - """How Microplex should treat a PolicyEngine US variable at export time.""" - - PRESERVED_INPUT = "preserved_input" - TAKEUP_INPUT = "takeup_input" - REPORTED_OUTPUT = "reported_output" - CALCULATED_OUTPUT = "calculated_output" - - -PUF_CALCULATED_TAX_OUTPUT_VARIABLES: frozenset[str] = frozenset( - { - "american_opportunity_credit", - "amt_foreign_tax_credit", - "early_withdrawal_penalty", - "energy_efficient_home_improvement_credit", - "excess_withheld_payroll_tax", - "foreign_tax_credit", - "general_business_credit", - "other_credits", - "prior_year_minimum_tax_credit", - "recapture_of_investment_credit", - "savers_credit", - "state_and_local_sales_or_income_tax", - "state_income_tax_paid", - "taxable_social_security", - "taxable_unemployment_compensation", - "unreported_payroll_tax", - } -) - -POLICYENGINE_US_TAKEUP_INPUT_VARIABLES: frozenset[str] = frozenset( - { - "takes_up_aca_if_eligible", - "takes_up_early_head_start_if_eligible", - "takes_up_eitc", - "takes_up_head_start_if_eligible", - "takes_up_housing_assistance_if_eligible", - "takes_up_medicaid_if_eligible", - "takes_up_medicare_if_eligible", - "takes_up_snap_if_eligible", - "takes_up_ssi_if_eligible", - "takes_up_tanf_if_eligible", - "would_claim_wic", - "would_file_taxes_voluntarily", - } -) - -POLICYENGINE_US_REPORTED_BENEFIT_AMOUNT_VARIABLES: frozenset[str] = frozenset( - { - "snap_reported", - "ssi_reported", - "tanf_reported", - } -) - -POLICYENGINE_US_REPORTED_TAX_OUTPUT_VARIABLES: frozenset[str] = frozenset( - PUF_CALCULATED_TAX_OUTPUT_VARIABLES - | { - "state_income_tax_reported", - } -) - 
-POLICYENGINE_US_REPORTED_OUTPUT_VARIABLES: frozenset[str] = frozenset( - POLICYENGINE_US_REPORTED_BENEFIT_AMOUNT_VARIABLES - | POLICYENGINE_US_REPORTED_TAX_OUTPUT_VARIABLES -) - -POLICYENGINE_US_CALCULATED_OUTPUT_VARIABLES: frozenset[str] = frozenset( - { - "aca_ptc", - "additional_ctc", - "assigned_aca_ptc", - "loss_limited_net_capital_gains", - "net_capital_gains", - "chip_enrolled", - "ctc", - "early_head_start", - "eitc", - "filing_status", - "head_start", - "income_tax", - "income_tax_positive", - "is_aca_ptc_eligible", - "medicaid", - "medicaid_cost", - "medicaid_enrolled", - "non_refundable_ctc", - "premium_tax_credit", - "refundable_ctc", - "rent", - "snap", - "ssi", - "state_income_tax", - "tanf", - "total_income_tax", - "wic", - } -) - -POLICYENGINE_US_CONSTRUCTION_INPUT_VARIABLES: frozenset[str] = frozenset() - -POLICYENGINE_US_DIRECT_EXPORT_BLOCKED_VARIABLES: frozenset[str] = frozenset( - POLICYENGINE_US_CALCULATED_OUTPUT_VARIABLES - | POLICYENGINE_US_REPORTED_OUTPUT_VARIABLES -) - - -def source_name_matches_prefix(source_name: str, prefix: str) -> bool: - """Return whether a source name is an exact or year-suffixed source prefix.""" - return source_name == prefix or source_name.startswith(f"{prefix}_") - - -def microdata_variable_role( - source_name: str, - variable_name: str, -) -> MicrodataVariableRole: - """Resolve the source-specific role for one microdata variable.""" - if ( - source_name_matches_prefix(source_name, "irs_soi_puf") - and variable_name in PUF_CALCULATED_TAX_OUTPUT_VARIABLES - ): - return MicrodataVariableRole.CALCULATED_TAX_OUTPUT - return MicrodataVariableRole.SOURCE_INPUT - - -def is_model_input_microdata_variable( - source_name: str, - variable_name: str, -) -> bool: - """Return whether a source column should enter model-ready microdata.""" - return microdata_variable_role( - source_name, - variable_name, - ) is not MicrodataVariableRole.CALCULATED_TAX_OUTPUT - - -def non_model_input_microdata_variables( - source_name: str, - 
variable_names: list[str] | tuple[str, ...] | set[str] | frozenset[str], -) -> tuple[str, ...]: - """Return source columns that should stay out of model-ready microdata.""" - return tuple( - variable_name - for variable_name in variable_names - if not is_model_input_microdata_variable(source_name, variable_name) - ) - - -def policyengine_us_variable_role(variable_name: str) -> PolicyEngineUSVariableRole: - """Resolve the Microplex role for a PolicyEngine US variable name.""" - if variable_name in POLICYENGINE_US_CONSTRUCTION_INPUT_VARIABLES: - return PolicyEngineUSVariableRole.PRESERVED_INPUT - if variable_name in POLICYENGINE_US_CALCULATED_OUTPUT_VARIABLES: - return PolicyEngineUSVariableRole.CALCULATED_OUTPUT - if variable_name in POLICYENGINE_US_REPORTED_OUTPUT_VARIABLES: - return PolicyEngineUSVariableRole.REPORTED_OUTPUT - if variable_name in POLICYENGINE_US_TAKEUP_INPUT_VARIABLES: - return PolicyEngineUSVariableRole.TAKEUP_INPUT - return PolicyEngineUSVariableRole.PRESERVED_INPUT - - -def is_policyengine_us_direct_export_blocked(variable_name: str) -> bool: - """Return whether a source column may not override a PE-US variable.""" - return ( - policyengine_us_variable_role(variable_name) - in { - PolicyEngineUSVariableRole.CALCULATED_OUTPUT, - PolicyEngineUSVariableRole.REPORTED_OUTPUT, - } - ) - - -def blocked_policyengine_us_direct_export_variables( - variable_names: list[str] | tuple[str, ...] 
| set[str] | frozenset[str], -) -> tuple[str, ...]: - """Return requested direct overrides that violate the variable contract.""" - return tuple( - sorted( - variable_name - for variable_name in variable_names - if is_policyengine_us_direct_export_blocked(variable_name) - ) - ) diff --git a/src/microplex_us/pe_source_impute_engine.py b/src/microplex_us/pe_source_impute_engine.py deleted file mode 100644 index 8803985f..00000000 --- a/src/microplex_us/pe_source_impute_engine.py +++ /dev/null @@ -1,348 +0,0 @@ -"""Execution helpers for PE source-impute donor blocks.""" - -from __future__ import annotations - -from collections.abc import Callable -from dataclasses import dataclass - -import numpy as np -import pandas as pd -from microplex.core import EntityType - -from microplex_us.pe_source_impute_specs import ( - PESourceImputeBlockSpec, - load_pe_source_impute_block_specs, - prepare_pe_source_impute_condition_frame, - resolve_pe_source_impute_block_key, -) -from microplex_us.variables import ( - DonorImputationBlockSpec, - apply_donor_variable_semantics, - is_projected_condition_var_compatible, -) - -DonorConditionCompatibilityFn = Callable[[pd.Series, pd.Series], bool] -DonorImputerBuilderFn = Callable[[list[str], tuple[str, ...]], object] -DonorRankMatcherFn = Callable[..., pd.Series] -CanProjectToEntityFn = Callable[[pd.DataFrame, pd.DataFrame, EntityType], bool] -ProjectFrameToEntityFn = Callable[..., pd.DataFrame] -EntityKeyFn = Callable[[EntityType], str | None] - - -@dataclass(frozen=True) -class PESourceImputeConditionSurface: - """Prepared donor/current condition frames for one PE donor block.""" - - spec: PESourceImputeBlockSpec - donor_frame: pd.DataFrame - current_frame: pd.DataFrame - - def compatible_predictors( - self, - *, - compatibility_fn: DonorConditionCompatibilityFn, - ) -> list[str]: - """Return the manifest predictor surface filtered to compatible columns.""" - return [ - variable - for variable in self.spec.predictors - if variable in 
self.donor_frame.columns - and variable in self.current_frame.columns - and compatibility_fn(self.donor_frame[variable], self.current_frame[variable]) - ] - - -@dataclass(frozen=True) -class PESourceImputeBlockRunRequest: - """Inputs needed to execute one PE donor block once its surface is resolved.""" - - donor_block_spec: DonorImputationBlockSpec - donor_fit_source: pd.DataFrame - current_generation_source: pd.DataFrame - current_frame: pd.DataFrame - entity_key: str | None - - -@dataclass(frozen=True) -class PESourceImputeBlockRunResult: - """Updated seed frame after executing one PE donor block.""" - - updated_frame: pd.DataFrame - integrated_variables: tuple[str, ...] - condition_vars: tuple[str, ...] - - -@dataclass(frozen=True) -class PESourceImputePreparedBlockInputs: - """Prepared PE donor-block inputs before imputation execution.""" - - donor_fit_source: pd.DataFrame - current_generation_source: pd.DataFrame - raw_shared_vars: tuple[str, ...] - shared_vars_after_model_exclusion: tuple[str, ...] - shared_vars_for_block: tuple[str, ...] - entity_compatible_shared_vars: tuple[str, ...] - projection_applied: bool - entity_key: str | None - condition_surface: PESourceImputeConditionSurface | None - - -@dataclass(frozen=True) -class PESourceImputeConditionedBlockRunRequest: - """Inputs needed to execute one donor block after conditions are selected.""" - - block_request: PESourceImputeBlockRunRequest - donor_condition_source: pd.DataFrame - current_condition_source: pd.DataFrame - condition_vars: tuple[str, ...] 
- - -@dataclass(frozen=True) -class PESourceImputeBlockEngine: - """Centralized resolver for PE donor-block specs and condition surfaces.""" - - specs: dict[str, PESourceImputeBlockSpec] - - @classmethod - def default(cls) -> PESourceImputeBlockEngine: - return cls(specs=load_pe_source_impute_block_specs()) - - def resolve_spec( - self, - *, - donor_source_name: str | None, - donor_block: tuple[str, ...], - ) -> PESourceImputeBlockSpec | None: - """Resolve one donor source/block pair to a PE source-impute spec.""" - key = resolve_pe_source_impute_block_key( - donor_source_name=donor_source_name, - donor_block=donor_block, - ) - if key is None: - return None - return self.specs[key] - - def prepare_condition_surface( - self, - *, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - donor_source_name: str | None, - donor_block: tuple[str, ...], - ) -> PESourceImputeConditionSurface | None: - """Prepare the PE prespecified donor/current condition frames for one block.""" - spec = self.resolve_spec( - donor_source_name=donor_source_name, - donor_block=donor_block, - ) - if spec is None: - return None - return PESourceImputeConditionSurface( - spec=spec, - donor_frame=prepare_pe_source_impute_condition_frame(donor_frame, spec), - current_frame=prepare_pe_source_impute_condition_frame(current_frame, spec), - ) - - def prepare_block_inputs( - self, - *, - donor_seed: pd.DataFrame, - current_frame: pd.DataFrame, - shared_vars: list[str], - donor_block_spec: DonorImputationBlockSpec, - donor_source_name: str | None, - prepare_pe_surface: bool, - can_project_to_entity: CanProjectToEntityFn, - project_frame_to_entity: ProjectFrameToEntityFn, - entity_key_fn: EntityKeyFn, - ) -> PESourceImputePreparedBlockInputs: - """Prepare one PE donor block's working frames and optional PE surface.""" - donor_working = donor_seed.copy() - if donor_block_spec.prepare_frame is not None: - donor_working = donor_block_spec.prepare_frame(donor_working) - - shared_vars_for_block = [ - 
variable - for variable in shared_vars - if variable not in donor_block_spec.model_variables - ] - shared_vars_after_model_exclusion = tuple(shared_vars_for_block) - entity_compatible_shared_vars: tuple[str, ...] = () - donor_fit_source = donor_working - current_generation_source = current_frame - entity_key = entity_key_fn(donor_block_spec.native_entity) - projection_applied = False - - if can_project_to_entity( - current_frame, - donor_working, - donor_block_spec.native_entity, - ): - projection_applied = True - entity_compatible_shared_vars = [ - variable - for variable in shared_vars - if is_projected_condition_var_compatible( - variable, - projected_entity=donor_block_spec.native_entity, - allowed_condition_entities=donor_block_spec.condition_entities, - ) - ] - if entity_compatible_shared_vars: - shared_vars_for_block = entity_compatible_shared_vars - entity_compatible_shared_vars = tuple(entity_compatible_shared_vars) - donor_fit_source = project_frame_to_entity( - donor_working, - entity=donor_block_spec.native_entity, - variables=( - set(shared_vars_for_block) - | set(donor_block_spec.model_variables) - | {"hh_weight"} - ), - ) - current_generation_source = project_frame_to_entity( - current_frame, - entity=donor_block_spec.native_entity, - variables=set(shared_vars_for_block), - ) - - condition_surface = None - if prepare_pe_surface: - condition_surface = self.prepare_condition_surface( - donor_frame=donor_fit_source, - current_frame=current_generation_source, - donor_source_name=donor_source_name, - donor_block=donor_block_spec.model_variables, - ) - - return PESourceImputePreparedBlockInputs( - donor_fit_source=donor_fit_source, - current_generation_source=current_generation_source, - raw_shared_vars=tuple(shared_vars), - shared_vars_after_model_exclusion=shared_vars_after_model_exclusion, - shared_vars_for_block=tuple(shared_vars_for_block), - entity_compatible_shared_vars=entity_compatible_shared_vars, - projection_applied=projection_applied, - 
entity_key=entity_key, - condition_surface=condition_surface, - ) - - def run_prepared_block( - self, - *, - surface: PESourceImputeConditionSurface, - request: PESourceImputeBlockRunRequest, - build_imputer: DonorImputerBuilderFn, - rank_match: DonorRankMatcherFn, - compatibility_fn: DonorConditionCompatibilityFn, - fit_kwargs: dict[str, int | float | bool], - seed: int, - rng: np.random.Generator, - ) -> PESourceImputeBlockRunResult | None: - """Run one PE prespecified donor block from fit through matched assignment.""" - condition_vars = surface.compatible_predictors( - compatibility_fn=compatibility_fn, - ) - return self.run_conditioned_block( - request=PESourceImputeConditionedBlockRunRequest( - block_request=request, - donor_condition_source=surface.donor_frame, - current_condition_source=surface.current_frame, - condition_vars=tuple(condition_vars), - ), - build_imputer=build_imputer, - rank_match=rank_match, - fit_kwargs=fit_kwargs, - seed=seed, - rng=rng, - ) - - def run_conditioned_block( - self, - *, - request: PESourceImputeConditionedBlockRunRequest, - build_imputer: DonorImputerBuilderFn, - rank_match: DonorRankMatcherFn, - fit_kwargs: dict[str, int | float | bool], - seed: int, - rng: np.random.Generator, - ) -> PESourceImputeBlockRunResult | None: - """Run one donor block after the conditioning surface is already selected.""" - condition_vars = list(request.condition_vars) - if not condition_vars: - return None - - block_request = request.block_request - fit_frame = request.donor_condition_source[ - condition_vars + list(block_request.donor_block_spec.model_variables) + ["hh_weight"] - ].copy() - fit_frame = fit_frame.rename(columns={"hh_weight": "weight"}) - imputer = build_imputer( - condition_vars=condition_vars, - target_vars=block_request.donor_block_spec.model_variables, - ) - imputer.fit( - fit_frame, - weight_col="weight", - **fit_kwargs, - ) - generated = imputer.generate( - request.current_condition_source[condition_vars].copy(), - 
seed=seed, - ) - updated = block_request.current_frame.copy() - for variable in block_request.donor_block_spec.model_variables: - donor_support = ( - pd.to_numeric(block_request.donor_fit_source[variable], errors="coerce") - .replace([np.inf, -np.inf], np.nan) - .dropna() - ) - generated_scores = pd.to_numeric( - generated[variable], - errors="coerce", - ).replace([np.inf, -np.inf], np.nan) - if donor_support.empty: - updated[variable] = generated_scores.fillna(0.0).astype(float) - continue - donor_weights = pd.to_numeric( - block_request.donor_fit_source.loc[donor_support.index, "hh_weight"], - errors="coerce", - ).fillna(0.0) - matched_values = rank_match( - generated_scores.fillna(float(donor_support.median())).astype(float), - donor_values=donor_support.astype(float), - donor_weights=donor_weights.astype(float), - rng=rng, - strategy=block_request.donor_block_spec.strategy_for(variable), - ) - if ( - block_request.entity_key is not None - and block_request.entity_key in block_request.current_generation_source.columns - ): - entity_values = pd.Series( - matched_values.to_numpy(dtype=float), - index=block_request.current_generation_source[ - block_request.entity_key - ].to_numpy(), - dtype=float, - ) - updated[variable] = pd.to_numeric( - updated[block_request.entity_key].map(entity_values), - errors="coerce", - ).fillna(0.0) - else: - updated[variable] = matched_values - if block_request.donor_block_spec.restore_frame is not None: - updated = block_request.donor_block_spec.restore_frame(updated) - updated = apply_donor_variable_semantics( - updated, - block_request.donor_block_spec.restored_variables, - ) - return PESourceImputeBlockRunResult( - updated_frame=updated, - integrated_variables=block_request.donor_block_spec.restored_variables, - condition_vars=tuple(condition_vars), - ) - - -PE_SOURCE_IMPUTE_BLOCK_ENGINE = PESourceImputeBlockEngine.default() diff --git a/src/microplex_us/pe_source_impute_specs.py b/src/microplex_us/pe_source_impute_specs.py deleted 
file mode 100644 index ea43ed15..00000000 --- a/src/microplex_us/pe_source_impute_specs.py +++ /dev/null @@ -1,423 +0,0 @@ -"""Shared PE source-impute donor block specs loaded from manifest data.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass -from functools import cache -from pathlib import Path -from typing import Any - -import pandas as pd -from microplex.core import SourceArchetype - - -@dataclass(frozen=True) -class PERawIndicatorSpec: - """One manifest-backed raw indicator rule.""" - - column: str - equals: str | int | float | bool - - -@dataclass(frozen=True) -class PESourceImputeRawLoaderSpec: - """Declarative raw-file extraction contract for one donor block.""" - - filename: str - delimiter: str | None - usecols: tuple[str, ...] - direct_columns: dict[str, str] - sum_columns_contains: dict[str, str] - indicator_columns: dict[str, PERawIndicatorSpec] - int_columns: tuple[str, ...] - household_id_parts: tuple[str, ...] - person_id_parts: tuple[str, ...] - constant_columns: dict[str, str | int | float | bool] - copy_columns: dict[str, str] - - -@dataclass(frozen=True) -class PEPolicyengineDatasetLoaderSpec: - """Declarative subprocess dataset-loader contract for one donor block.""" - - module: str - class_name: str - builder_kind: str - household_index_key: str | None - person_household_key: str | None - person_id_key: str | None - length_source_key: str | None - direct_person_columns: dict[str, str] - boolean_person_columns: dict[str, str] - row_indexed_person_columns: dict[str, str] - mapped_row_person_columns: dict[str, str] - mapped_value_tables: dict[str, dict[str, int]] - fallback_person_columns: dict[str, tuple[str, ...]] - copy_person_columns: dict[str, str] - constant_person_columns: dict[str, str | int | float | bool] - income_sum_columns: tuple[str, ...] - int_person_columns: tuple[str, ...] 
- sex_from_boolean_source: str | None - sex_true_value: int | None - sex_false_value: int | None - generated_household_ids: bool - person_id_from_household_id: bool - - -@dataclass(frozen=True) -class PESourceImputeBlockSpec: - """Declarative contract for one PE donor-survey block.""" - - key: str - survey_name: str - block_name: str | None - default_year: int - archetype: SourceArchetype | None - dataset_loader: PEPolicyengineDatasetLoaderSpec | None - raw_loader: PESourceImputeRawLoaderSpec | None - required_monthcode: int | None - annualized_variables: tuple[str, ...] - household_count_variables: tuple[str, ...] - household_variables: tuple[str, ...] - person_variables: tuple[str, ...] - target_variables: tuple[str, ...] - predictors: tuple[str, ...] - - @property - def descriptor_name(self) -> str: - if self.block_name is None: - return self.survey_name - return f"{self.survey_name}_{self.block_name}" - - def source_name(self, year: int) -> str: - return f"{self.descriptor_name}_{year}" - - def matches_source_name(self, source_name: str | None) -> bool: - """Return whether one runtime source name matches this donor block.""" - normalized = (source_name or "").strip().lower() - descriptor_name = self.descriptor_name.lower() - return normalized == descriptor_name or normalized.startswith(f"{descriptor_name}_") - - -def _manifest_path() -> Path: - return Path(__file__).resolve().parent / "manifests" / "pe_source_impute_blocks.json" - - -def _archetype_from_name(value: str | None) -> SourceArchetype | None: - if value is None: - return None - return SourceArchetype(value) - - -def _raw_indicator_from_payload(payload: dict[str, Any]) -> PERawIndicatorSpec: - return PERawIndicatorSpec( - column=str(payload["column"]), - equals=payload["equals"], - ) - - -def _raw_loader_from_payload( - payload: dict[str, Any] | None, -) -> PESourceImputeRawLoaderSpec | None: - if payload is None: - return None - return PESourceImputeRawLoaderSpec( - filename=str(payload["filename"]), 
- delimiter=payload.get("delimiter"), - usecols=tuple(payload.get("usecols", ())), - direct_columns={str(key): str(value) for key, value in payload.get("direct_columns", {}).items()}, - sum_columns_contains={ - str(key): str(value) - for key, value in payload.get("sum_columns_contains", {}).items() - }, - indicator_columns={ - str(key): _raw_indicator_from_payload(value) - for key, value in payload.get("indicator_columns", {}).items() - }, - int_columns=tuple(payload.get("int_columns", ())), - household_id_parts=tuple(payload.get("household_id_parts", ())), - person_id_parts=tuple(payload.get("person_id_parts", ())), - constant_columns={ - str(key): value - for key, value in payload.get("constant_columns", {}).items() - }, - copy_columns={str(key): str(value) for key, value in payload.get("copy_columns", {}).items()}, - ) - - -def _dataset_loader_from_payload( - payload: dict[str, Any] | None, -) -> PEPolicyengineDatasetLoaderSpec | None: - if payload is None: - return None - return PEPolicyengineDatasetLoaderSpec( - module=str(payload["module"]), - class_name=str(payload["class_name"]), - builder_kind=str(payload["builder_kind"]), - household_index_key=payload.get("household_index_key"), - person_household_key=payload.get("person_household_key"), - person_id_key=payload.get("person_id_key"), - length_source_key=payload.get("length_source_key"), - direct_person_columns={ - str(key): str(value) - for key, value in payload.get("direct_person_columns", {}).items() - }, - boolean_person_columns={ - str(key): str(value) - for key, value in payload.get("boolean_person_columns", {}).items() - }, - row_indexed_person_columns={ - str(key): str(value) - for key, value in payload.get("row_indexed_person_columns", {}).items() - }, - mapped_row_person_columns={ - str(key): str(value) - for key, value in payload.get("mapped_row_person_columns", {}).items() - }, - mapped_value_tables={ - str(key): { - str(mapped_key): int(mapped_value) - for mapped_key, mapped_value in 
value.items() - } - for key, value in payload.get("mapped_value_tables", {}).items() - }, - fallback_person_columns={ - str(key): tuple(str(item) for item in value) - for key, value in payload.get("fallback_person_columns", {}).items() - }, - copy_person_columns={ - str(key): str(value) - for key, value in payload.get("copy_person_columns", {}).items() - }, - constant_person_columns={ - str(key): value - for key, value in payload.get("constant_person_columns", {}).items() - }, - income_sum_columns=tuple(payload.get("income_sum_columns", ())), - int_person_columns=tuple(payload.get("int_person_columns", ())), - sex_from_boolean_source=payload.get("sex_from_boolean_source"), - sex_true_value=( - None if payload.get("sex_true_value") is None else int(payload["sex_true_value"]) - ), - sex_false_value=( - None if payload.get("sex_false_value") is None else int(payload["sex_false_value"]) - ), - generated_household_ids=bool(payload.get("generated_household_ids", False)), - person_id_from_household_id=bool(payload.get("person_id_from_household_id", False)), - ) - - -def _spec_from_payload(key: str, payload: dict[str, Any]) -> PESourceImputeBlockSpec: - return PESourceImputeBlockSpec( - key=key, - survey_name=str(payload["survey_name"]), - block_name=payload.get("block_name"), - default_year=int(payload["default_year"]), - archetype=_archetype_from_name(payload.get("archetype")), - dataset_loader=_dataset_loader_from_payload(payload.get("dataset_loader")), - raw_loader=_raw_loader_from_payload(payload.get("raw_loader")), - required_monthcode=( - None - if payload.get("required_monthcode") is None - else int(payload["required_monthcode"]) - ), - annualized_variables=tuple(payload.get("annualized_variables", ())), - household_count_variables=tuple(payload.get("household_count_variables", ())), - household_variables=tuple(payload["household_variables"]), - person_variables=tuple(payload["person_variables"]), - target_variables=tuple(payload["target_variables"]), - 
predictors=tuple(payload["predictors"]), - ) - - -@cache -def load_pe_source_impute_block_specs() -> dict[str, PESourceImputeBlockSpec]: - """Load the PE donor-block spec manifest.""" - with _manifest_path().open("r", encoding="utf-8") as handle: - payload = json.load(handle) - blocks = payload.get("blocks", {}) - return { - key: _spec_from_payload(key, value) - for key, value in blocks.items() - } - - -def get_pe_source_impute_block_spec(key: str) -> PESourceImputeBlockSpec: - """Return one named PE donor-block spec.""" - specs = load_pe_source_impute_block_specs() - try: - return specs[key] - except KeyError as error: - available = ", ".join(sorted(specs)) - raise KeyError(f"Unknown PE source-impute block '{key}'. Expected one of: {available}") from error - - -def resolve_sipp_source_impute_block_spec(block: str) -> PESourceImputeBlockSpec: - """Resolve one SIPP donor block by short block name.""" - return get_pe_source_impute_block_spec(f"sipp_{block}") - - -def resolve_pe_source_impute_block_key( - *, - donor_source_name: str | None, - donor_block: tuple[str, ...], -) -> str | None: - """Map a donor source name and target block to one manifest block key.""" - block_set = set(donor_block) - for key, spec in load_pe_source_impute_block_specs().items(): - if not spec.matches_source_name(donor_source_name): - continue - if block_set <= set(spec.target_variables): - return key - return None - - -_HOUSEHOLD_COUNT_AGE_THRESHOLDS = { - "count_under_18": 18, - "count_under_6": 6, -} - - -def apply_pe_source_impute_loader_postprocess( - frame: pd.DataFrame, - spec: PESourceImputeBlockSpec, - *, - month_column: str = "MONTHCODE", - household_key: str = "household_id", - age_column: str = "age", -) -> pd.DataFrame: - """Apply manifest-backed donor-row postprocessing for one PE block.""" - result = frame.copy() - if spec.required_monthcode is not None and month_column in result.columns: - monthcode = pd.to_numeric(result[month_column], errors="coerce") - result = 
result[monthcode.eq(spec.required_monthcode)].copy() - for variable in spec.annualized_variables: - if variable not in result.columns: - continue - result[variable] = pd.to_numeric(result[variable], errors="coerce").fillna(0.0) * 12.0 - if household_key not in result.columns or age_column not in result.columns: - return result - ages = pd.to_numeric(result[age_column], errors="coerce").fillna(0.0) - for variable in spec.household_count_variables: - threshold = _HOUSEHOLD_COUNT_AGE_THRESHOLDS.get(variable) - if threshold is None: - continue - result[variable] = ( - ages.lt(threshold) - .groupby(result[household_key], dropna=False) - .transform("sum") - .astype(float) - ) - return result - - -def prepare_pe_source_impute_condition_frame( - frame: pd.DataFrame, - spec: PESourceImputeBlockSpec, -) -> pd.DataFrame: - """Derive the manifest-backed PE condition surface for one donor block.""" - prepared = frame.copy() - zero = pd.Series(0.0, index=prepared.index, dtype=float) - required = set(spec.predictors) - - def first_present(*columns: str) -> pd.Series: - for column in columns: - if column in prepared.columns: - return ( - pd.to_numeric(prepared[column], errors="coerce") - .fillna(0.0) - .astype(float) - ) - return zero.copy() - - if "is_male" in required and "is_male" not in prepared.columns and "sex" in prepared.columns: - sex = pd.to_numeric(prepared["sex"], errors="coerce").fillna(0) - prepared["is_male"] = sex.eq(1).astype(float) - elif "is_male" in required and "is_male" in prepared.columns: - prepared["is_male"] = pd.to_numeric(prepared["is_male"], errors="coerce").fillna(0.0) - - if "is_female" in required and "is_female" not in prepared.columns and "sex" in prepared.columns: - sex = pd.to_numeric(prepared["sex"], errors="coerce").fillna(0) - prepared["is_female"] = sex.eq(2).astype(float) - elif "is_female" in required and "is_female" in prepared.columns: - prepared["is_female"] = pd.to_numeric(prepared["is_female"], errors="coerce").fillna(0.0) - - if 
"is_household_head" in required and "is_household_head" not in prepared.columns: - if "is_head" in prepared.columns: - prepared["is_household_head"] = ( - pd.to_numeric(prepared["is_head"], errors="coerce").fillna(0.0).astype(float) - ) - - if "tenure_type" in required and "tenure_type" not in prepared.columns and "tenure" in prepared.columns: - prepared["tenure_type"] = ( - pd.to_numeric(prepared["tenure"], errors="coerce").fillna(0.0).astype(float) - ) - - if "social_security" in required and "social_security" not in prepared.columns: - prepared["social_security"] = first_present("gross_social_security", "social_security") - - if "pension_income" in required and "pension_income" not in prepared.columns: - prepared["pension_income"] = first_present("taxable_pension_income", "pension_income") - - if "interest_dividend_income" in required and "interest_dividend_income" not in prepared.columns: - prepared["interest_dividend_income"] = ( - first_present("taxable_interest_income", "interest_income") - + first_present("ordinary_dividend_income", "dividend_income") - ) - - if ( - "social_security_pension_income" in required - and "social_security_pension_income" not in prepared.columns - ): - prepared["social_security_pension_income"] = ( - first_present("social_security", "gross_social_security") - + first_present("pension_income", "taxable_pension_income") - ) - - if "is_married" in required and "is_married" not in prepared.columns: - if "filing_status" in prepared.columns: - filing_status = prepared["filing_status"].astype(str) - prepared["is_married"] = filing_status.eq("JOINT").astype(float) - elif "marital_status" in prepared.columns: - marital_status = ( - pd.to_numeric(prepared["marital_status"], errors="coerce").fillna(0).astype(int) - ) - prepared["is_married"] = marital_status.isin({1, 2}).astype(float) - - household_key = next( - (candidate for candidate in ("household_id", "spm_unit_id", "family_id") if candidate in prepared.columns), - None, - ) - if 
household_key is not None: - household_groups = prepared.groupby(household_key, dropna=False) - if "household_size" in required and "household_size" not in prepared.columns: - prepared["household_size"] = ( - household_groups[household_key].transform("size").astype(float) - ) - if "age" in prepared.columns: - ages = pd.to_numeric(prepared["age"], errors="coerce").fillna(0.0) - for variable in required & set(_HOUSEHOLD_COUNT_AGE_THRESHOLDS): - if variable in prepared.columns: - continue - threshold = _HOUSEHOLD_COUNT_AGE_THRESHOLDS[variable] - prepared[variable] = ( - ages.lt(threshold) - .groupby(prepared[household_key], dropna=False) - .transform("sum") - .astype(float) - ) - - if ( - "own_children_in_household" in required - and "own_children_in_household" not in prepared.columns - and "count_under_18" in prepared.columns - ): - prepared["own_children_in_household"] = ( - pd.to_numeric(prepared["count_under_18"], errors="coerce") - .fillna(0.0) - .gt(0.0) - .astype(float) - ) - - return prepared diff --git a/src/microplex_us/pe_targets.py b/src/microplex_us/pe_targets.py deleted file mode 100644 index ce35109e..00000000 --- a/src/microplex_us/pe_targets.py +++ /dev/null @@ -1,234 +0,0 @@ -"""PolicyEngine-parity calibration targets.""" - -from datetime import date -from pathlib import Path - -import pandas as pd -import yaml - -# US States -STATES = [ - "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", - "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", - "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", - "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", - "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY" -] - -# State FIPS codes -STATE_FIPS = { - "AL": "01", "AK": "02", "AZ": "04", "AR": "05", "CA": "06", - "CO": "08", "CT": "09", "DE": "10", "DC": "11", "FL": "12", - "GA": "13", "HI": "15", "ID": "16", "IL": "17", "IN": "18", - "IA": "19", "KS": "20", "KY": "21", "LA": "22", "ME": "23", - "MD": "24", "MA": 
"25", "MI": "26", "MN": "27", "MS": "28", - "MO": "29", "MT": "30", "NE": "31", "NV": "32", "NH": "33", - "NJ": "34", "NM": "35", "NY": "36", "NC": "37", "ND": "38", - "OH": "39", "OK": "40", "OR": "41", "PA": "42", "RI": "44", - "SC": "45", "SD": "46", "TN": "47", "TX": "48", "UT": "49", - "VT": "50", "VA": "51", "WA": "53", "WV": "54", "WI": "55", "WY": "56" -} - - -class PETargets: - """PolicyEngine calibration targets.""" - - # IRS SOI income variables mapped to CPS columns - INCOME_MAP = { - 'employment_income': 'employment_income', - 'self_employment_income': 'self_employment_income', - 'social_security': 'social_security_income', - 'ssi': 'ssi_income', - 'unemployment_compensation': 'unemployment_income', - 'dividend_income': 'dividend_income', - 'interest_income': 'interest_income', - 'rental_income': 'rental_income', - 'pension_income': 'pension_income', - 'capital_gains': 'capital_gains', - } - - # Benefit programs - BENEFIT_MAP = { - 'snap_participation': 'snap_enrolled', - 'ssi_participation': 'ssi_enrolled', - 'social_security_participation': 'social_security_enrolled', - 'medicaid_enrollment': 'medicaid_enrolled', - } - - def __init__(self, pe_path: str | None = None): - """Initialize PE targets loader. - - Args: - pe_path: Path to PE-US calibration folder. If None, uses installed package. 
- """ - if pe_path is None: - # Default to installed package location - import sys - for path in sys.path: - test_path = Path(path) / "policyengine_us" / "parameters" / "calibration" - if test_path.exists(): - pe_path = test_path - break - - self.pe_path = Path(pe_path) if pe_path else None - self._targets = None - - def load_all(self) -> pd.DataFrame: - """Load all PE calibration targets.""" - if self._targets is not None: - return self._targets - - if self.pe_path is None: - raise ValueError("PE calibration path not found") - - targets = [] - - for yaml_file in self.pe_path.rglob("*.yaml"): - targets.extend(self._parse_yaml(yaml_file)) - - self._targets = pd.DataFrame(targets) - return self._targets - - def _parse_yaml(self, yaml_file: Path) -> list[dict]: - """Parse a PE calibration YAML file.""" - with open(yaml_file) as f: - data = yaml.safe_load(f) - - if not data: - return [] - - rel_path = yaml_file.relative_to(self.pe_path) - category = str(rel_path.parent).replace("/", ".") - name = yaml_file.stem - - metadata = data.get('metadata', {}) - unit = metadata.get('unit', 'unknown') - description = data.get('description', '') - - targets = [] - - if 'values' in data: - # National target - values = data['values'] - latest_date = max(values.keys()) - latest_value = values[latest_date] - - targets.append({ - 'name': name, - 'category': category, - 'value': latest_value, - 'year': latest_date.year if isinstance(latest_date, date) else int(str(latest_date)[:4]), - 'unit': unit, - 'geography': 'national', - 'state_code': None, - 'state_fips': None, - 'description': description - }) - else: - # State-level data - for key, val in data.items(): - if key in STATES and isinstance(val, dict): - latest_date = max(val.keys()) - latest_value = val[latest_date] - - targets.append({ - 'name': f"{name}_{key}", - 'category': category, - 'value': latest_value, - 'year': latest_date.year if isinstance(latest_date, date) else int(str(latest_date)[:4]), - 'unit': unit, - 'geography': 
'state', - 'state_code': key, - 'state_fips': STATE_FIPS.get(key), - 'description': description - }) - - return targets - - def get_national_targets(self) -> pd.DataFrame: - """Get national-level targets.""" - df = self.load_all() - return df[df['geography'] == 'national'] - - def get_state_targets(self, state: str | None = None) -> pd.DataFrame: - """Get state-level targets. - - Args: - state: State code (e.g., 'CA') or None for all states - """ - df = self.load_all() - state_df = df[df['geography'] == 'state'] - - if state: - state_df = state_df[state_df['state_code'] == state] - - return state_df - - def get_income_targets(self) -> pd.DataFrame: - """Get IRS SOI income targets.""" - df = self.load_all() - return df[df['category'].str.startswith('gov.irs.soi')] - - def get_benefit_targets(self) -> pd.DataFrame: - """Get benefit program targets (SNAP, SSI, SS, Medicaid, etc.).""" - df = self.load_all() - benefit_cats = ['gov.usda.snap', 'gov.ssa.ssi', 'gov.ssa.social_security', - 'gov.hhs.medicaid', 'gov.hhs.cms.chip', 'gov.aca'] - return df[df['category'].str.startswith(tuple(benefit_cats))] - - def summary(self) -> dict: - """Get summary of available targets.""" - df = self.load_all() - - return { - 'total': len(df), - 'national': len(df[df['geography'] == 'national']), - 'state': len(df[df['geography'] == 'state']), - 'by_category': df.groupby('category').size().to_dict(), - 'income_targets': len(self.get_income_targets()), - 'benefit_targets': len(self.get_benefit_targets()), - } - - -def get_pe_targets() -> PETargets: - """Get PolicyEngine targets instance.""" - return PETargets() - - -def create_calibration_targets( - synthetic_df: pd.DataFrame, - target_types: list[str] = None -) -> dict[str, float]: - """Create calibration target dict from PE targets. - - Args: - synthetic_df: Synthetic population DataFrame with income/benefit columns - target_types: List of target types to include. 
Options: - - 'income': IRS SOI income totals - - 'benefits': Benefit program participation - - 'population': Census population by state - - 'all': All targets - - Returns: - Dict mapping target name to target value - """ - pe = get_pe_targets() - - if target_types is None: - target_types = ['income', 'benefits'] - - targets = {} - - if 'income' in target_types or 'all' in target_types: - income_df = pe.get_income_targets() - for _, row in income_df.iterrows(): - targets[f"income_{row['name']}"] = row['value'] - - if 'benefits' in target_types or 'all' in target_types: - benefit_df = pe.get_benefit_targets() - # Only national for now - national_benefits = benefit_df[benefit_df['geography'] == 'national'] - for _, row in national_benefits.iterrows(): - targets[f"benefit_{row['name']}"] = row['value'] - - return targets diff --git a/src/microplex_us/pipelines/__init__.py b/src/microplex_us/pipelines/__init__.py deleted file mode 100644 index 6f1e19d5..00000000 --- a/src/microplex_us/pipelines/__init__.py +++ /dev/null @@ -1,425 +0,0 @@ -"""US production pipeline APIs. - -The package root intentionally resolves exports lazily so importing one pipeline -submodule does not require every optional core/data dependency used by all other -pipelines. 
-""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - - -def _exports(module: str, names: tuple[str, ...]) -> dict[str, str]: - return {name: module for name in names} - - -_EXPORT_MODULES: dict[str, str] = { - **_exports( - "microplex_us.pipelines.artifacts", - ( - "USMicroplexArtifactPaths", - "USMicroplexVersionedBuildArtifacts", - "build_and_save_versioned_us_microplex", - "build_and_save_versioned_us_microplex_from_data_dir", - "build_and_save_versioned_us_microplex_from_source_provider", - "build_and_save_versioned_us_microplex_from_source_providers", - "replay_and_save_versioned_us_microplex_policyengine_stage", - "replay_us_microplex_policyengine_stage_from_artifact", - "save_us_microplex_artifacts", - "save_versioned_us_microplex_artifacts", - "save_versioned_us_microplex_build_result", - ), - ), - **_exports( - "microplex_us.pipelines.backfill_pe_native_audit", - ( - "backfill_us_pe_native_audit_bundle", - "backfill_us_pe_native_audit_bundles", - "backfill_us_pe_native_audit_root", - ), - ), - **_exports( - "microplex_us.pipelines.backfill_pe_native_scores", - ( - "backfill_us_pe_native_scores_bundle", - "backfill_us_pe_native_scores_bundles", - "backfill_us_pe_native_scores_root", - "discover_us_candidate_artifact_dirs", - "rebuild_us_pe_native_run_registry", - ), - ), - **_exports( - "microplex_us.pipelines.calibration_stage_parity", - ( - "build_us_calibration_stage_parity_audit", - "write_us_calibration_stage_parity_audit", - ), - ), - **_exports( - "microplex_us.pipelines.experiments", - ( - "USMicroplexExperimentReport", - "USMicroplexExperimentResult", - "USMicroplexSourceExperimentSpec", - "build_us_n_synthetic_sweep_experiments", - "default_us_source_mix_experiments", - "run_us_microplex_n_synthetic_sweep", - "run_us_microplex_source_experiments", - ), - ), - **_exports( - "microplex_us.pipelines.ecps_replacement_comparison", - ( - "build_sound_ecps_replacement_comparison", - 
"write_sound_ecps_replacement_comparison", - ), - ), - **_exports( - "microplex_us.pipelines.index_db", - ( - "append_us_microplex_run_index_entry", - "compare_us_microplex_target_delta_rows", - "list_us_microplex_target_delta_rows", - "rebuild_us_microplex_run_index", - "resolve_us_microplex_run_index_path", - "select_us_microplex_frontier_index_row", - ), - ), - **_exports( - "microplex_us.pipelines.imputation_ablation", - ( - "ImputationAblationReport", - "ImputationAblationSliceSpec", - "ImputationAblationVariant", - "ImputationAblationVariantScore", - "ImputationSliceScore", - "ImputationTargetScore", - "default_imputation_ablation_variants", - "score_imputation_ablation_variants", - ), - ), - **_exports( - "microplex_us.pipelines.local_reweighting", - ( - "USHouseholdTargetReweightingResult", - "reweight_us_household_targets", - ), - ), - **_exports( - "microplex_us.pipelines.pe_native_optimization", - ( - "PolicyEngineUSNativeWeightOptimizationResult", - "optimize_pe_native_loss_weights", - "optimize_policyengine_us_native_loss_dataset", - "rewrite_policyengine_us_dataset_weights", - ), - ), - **_exports( - "microplex_us.pipelines.pe_native_scores", - ( - "PolicyEngineUSEnhancedCPSNativeScores", - "build_us_pe_native_target_diagnostics_payload", - "compare_us_pe_native_target_deltas", - "compute_batch_us_pe_native_scores", - "compute_policyengine_us_enhanced_cps_native_scores", - "compute_us_pe_native_scores", - "resolve_policyengine_us_data_python", - "resolve_policyengine_us_data_repo_root", - "score_policyengine_us_native_broad_loss", - "write_us_pe_native_scores", - ), - ), - **_exports( - "microplex_us.pipelines.pe_us_data_rebuild", - ( - "PEUSDataRebuildProgram", - "PEUSDataRebuildStage", - "PEUSDataRebuildStatus", - "build_policyengine_us_data_rebuild_markdown", - "build_policyengine_us_data_rebuild_pipeline", - "default_policyengine_us_data_rebuild_config", - "default_policyengine_us_data_rebuild_program", - 
"default_policyengine_us_data_rebuild_source_providers", - ), - ), - **_exports( - "microplex_us.pipelines.pe_us_data_rebuild_audit", - ( - "build_policyengine_us_data_rebuild_native_audit", - "write_policyengine_us_data_rebuild_native_audit", - ), - ), - **_exports( - "microplex_us.pipelines.pe_us_data_rebuild_checkpoint", - ( - "PEUSDataRebuildCheckpointEvidenceResult", - "PEUSDataRebuildCheckpointResult", - "attach_policyengine_us_data_rebuild_checkpoint_evidence", - "default_policyengine_us_data_rebuild_checkpoint_config", - "default_policyengine_us_data_rebuild_queries", - "run_policyengine_us_data_rebuild_checkpoint", - ), - ), - **_exports( - "microplex_us.pipelines.pe_us_data_rebuild_parity", - ( - "build_policyengine_us_data_rebuild_parity_artifact", - "write_policyengine_us_data_rebuild_parity_artifact", - ), - ), - **_exports( - "microplex_us.pipelines.performance", - ( - "USMicroplexPerformanceHarnessConfig", - "USMicroplexPerformanceHarnessRequest", - "USMicroplexPerformanceHarnessResult", - "USMicroplexPerformanceSession", - "run_us_microplex_performance_harness", - "warm_us_microplex_parity_cache", - ), - ), - **_exports( - "microplex_us.pipelines.pre_sim_parity", - ( - "DEFAULT_PRE_SIM_FOCUS_VARIABLES", - "PreSimParityVariableSpec", - "build_us_pre_sim_parity_audit", - "write_us_pre_sim_parity_audit", - ), - ), - **_exports( - "microplex_us.pipelines.reduced_benchmark", - ( - "DEFAULT_ATOMIC_AGE_BINS", - "DEFAULT_ATOMIC_AGE_LABELS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS", - "USMicroplexReducedBenchmarkHarnessConfig", - "USMicroplexReducedBenchmarkHarnessResult", - "USMicroplexReducedBenchmarkReport", - "USMicroplexReducedBenchmarkSpec", - "USMicroplexReducedCalibrationReport", - "USMicroplexReducedDimensionSpec", - "USMicroplexReducedMeasureSpec", - "USMicroplexReducedMultiCalibrationReport", - "calibrate_and_evaluate_us_reduced_benchmark_specs", - "calibrate_and_evaluate_us_reduced_benchmarks", - 
"default_us_atomic_rung0_benchmarks", - "default_us_atomic_rung1_benchmarks", - "default_us_atomic_rung2_calibration", - "default_us_atomic_rung3_calibration", - "default_us_atomic_rung4_calibration", - "default_us_atomic_rung5_calibration", - "evaluate_us_reduced_benchmark", - "reduced_benchmark_specs_to_calibration_targets", - "reduced_benchmark_to_calibration_targets", - "run_us_microplex_reduced_benchmark_harness", - ), - ), - **_exports( - "microplex_us.pipelines.registry", - ( - "FrontierMetric", - "USMicroplexRunRegistryEntry", - "append_us_microplex_run_registry_entry", - "build_us_microplex_run_registry_entry", - "load_us_microplex_run_registry", - "resolve_us_microplex_frontier_artifact_dir", - "select_us_microplex_frontier_entry", - ), - ), - **_exports( - "microplex_us.pipelines.seed_stage_parity", - ( - "DEFAULT_SEED_STAGE_BOOLEAN_LANDING_FEATURES", - "DEFAULT_SEED_STAGE_CANDIDATE_ONLY_LANDING_FEATURES", - "DEFAULT_SEED_STAGE_CATEGORICAL_LANDING_FEATURES", - "DEFAULT_SEED_STAGE_FOCUS_VARIABLES", - "SeedStageBooleanLandingFeatureSpec", - "SeedStageCategoricalLandingFeatureSpec", - "SeedStageFocusVariableSpec", - "build_us_seed_stage_parity_audit", - "build_us_seed_tax_unit_support_audit", - "write_us_seed_stage_parity_audit", - "write_us_seed_tax_unit_support_audit", - ), - ), - **_exports( - "microplex_us.pipelines.site_snapshot", - ( - "build_us_microplex_site_snapshot", - "write_us_microplex_site_snapshot", - ), - ), - **_exports( - "microplex_us.pipelines.stage_contracts", - ( - "USPipelineStageContract", - "USStageArtifactContract", - "USStageResourceContract", - "USStageValidationContract", - "config_keys_for_us_pipeline_stage", - "default_us_pipeline_stage_contracts", - "get_us_stage_artifact_contract", - "get_us_pipeline_stage_contract", - "resolve_us_stage_artifact_contract_path", - "serialize_us_pipeline_stage_contracts", - ), - ), - **_exports( - "microplex_us.pipelines.stage_artifacts", - ( - "USCalibratedStageArtifacts", - 
"USCandidateCalibrationReplayArtifacts", - "USCandidateStageArtifacts", - "USDatasetAssemblyArtifacts", - "USPolicyEngineEntityStageArtifacts", - "USSeedScaffoldStageArtifacts", - "USStageArtifactInventory", - "build_us_stage_artifact_inventory", - "load_us_calibrated_stage_artifacts", - "load_us_candidate_calibration_replay_artifacts", - "load_us_candidate_stage_artifacts", - "load_us_dataset_assembly_artifacts", - "load_us_policyengine_entity_stage_artifacts", - "load_us_seed_scaffold_stage_artifacts", - "load_us_stage_artifact_inventory", - "resolve_us_stage_artifact_path_checked", - "write_us_stage_artifact_inventory", - ), - ), - **_exports( - "microplex_us.pipelines.stage_manifest", - ( - "USDataFlowStageSummary", - "USStageArtifactRecord", - "USStageFailureRecord", - "USStageLifecycleStatus", - "USStageManifest", - "USStageMetric", - "USStageRecord", - "USStageRuntimeEventRecord", - "USStageStatus", - "USValidationEvidenceManifest", - "USValidationEvidenceRecord", - "build_us_validation_evidence_manifest", - "build_us_stage_manifest", - "load_us_policyengine_entity_stage_artifact", - "load_us_stage_manifest", - "resolve_us_stage_artifact_path", - "stage_summary_for_data_flow_snapshot", - "write_us_policyengine_entity_stage_artifact", - "write_us_stage_manifest", - "write_us_validation_evidence_manifest", - ), - ), - **_exports( - "microplex_us.pipelines.stage_readiness", - ( - "USConditionalReadinessReport", - "USConditionalReadinessStageRecord", - "build_us_conditional_readiness_report", - "build_us_stage_reuse_key", - "load_us_conditional_readiness_report", - "write_us_conditional_readiness_report", - ), - ), - **_exports( - "microplex_us.pipelines.stage_run", - ( - "USAuxiliaryArtifact", - "USArtifactRef", - "USCalibrationOutputs", - "USDatasetAssemblyOutputs", - "USDiagnosticOutput", - "USDonorSynthesisOutputs", - "USPolicyEngineEntityOutputs", - "USRunProfileOutputs", - "USSeedScaffoldOutputs", - "USSourceLoadingOutputs", - "USSourcePlanningOutputs", - 
"USStageInputOverride", - "USStageInputValidationSettings", - "USStageInputValidator", - "USStageOutputManifest", - "USStageRunWriter", - "USValidationBenchmarkingOutputs", - "build_us_stage_output_manifests_from_artifact_manifest", - "parse_us_stage_input_override", - "resolve_us_manifest_or_contract_artifact_path", - "write_us_stage_run_manifests_from_artifact_manifest", - ), - ), - **_exports( - "microplex_us.pipelines.stage_runtime", - ( - "RuntimeUpdateSection", - "USStageRuntimeWriter", - ), - ), - **_exports( - "microplex_us.pipelines.stage9_replay", - ( - "USStage9ReplayResult", - "replay_us_stage9_validation_benchmarking", - ), - ), - **_exports( - "microplex_us.pipelines.summarize_pe_native_family_drilldown", - ( - "classify_pe_native_target_family", - "summarize_us_pe_native_family_drilldown", - ), - ), - **_exports( - "microplex_us.pipelines.summarize_pe_native_regressions", - ("summarize_us_pe_native_regressions",), - ), - **_exports( - "microplex_us.pipelines.summarize_policyengine_oracle_regressions", - ("summarize_us_policyengine_oracle_regressions",), - ), - **_exports( - "microplex_us.pipelines.summarize_policyengine_oracle_target_drilldown", - ("summarize_us_policyengine_oracle_target_drilldown",), - ), - **_exports( - "microplex_us.pipelines.source_stage_parity", - ( - "DEFAULT_CPS_SOURCE_STAGE_FOCUS_VARIABLES", - "DEFAULT_PUF_SOURCE_STAGE_FOCUS_VARIABLES", - "SourceStageParityVariableSpec", - "build_us_cps_source_stage_parity_audit", - "build_us_puf_source_stage_parity_audit", - "build_us_source_stage_parity_audit", - "observation_frame_to_policyengine_entity_bundle", - "write_us_cps_source_stage_parity_audit", - "write_us_puf_source_stage_parity_audit", - "write_us_source_stage_parity_audit", - ), - ), - **_exports( - "microplex_us.pipelines.us", - ( - "USMicroplexBuildConfig", - "USMicroplexBuildResult", - "USMicroplexPipeline", - "USMicroplexTargets", - "build_us_microplex", - ), - ), -} - -__all__ = list(_EXPORT_MODULES) - - -def 
__getattr__(name: str) -> Any: - """Resolve pipeline convenience exports on first access.""" - module_name = _EXPORT_MODULES.get(name) - if module_name is None: - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - value = getattr(import_module(module_name), name) - globals()[name] = value - return value diff --git a/src/microplex_us/pipelines/artifact_dataset_assembly.py b/src/microplex_us/pipelines/artifact_dataset_assembly.py deleted file mode 100644 index c812f4ae..00000000 --- a/src/microplex_us/pipelines/artifact_dataset_assembly.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Dataset-assembly artifact helpers for saved US Microplex bundles.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any - -import pandas as pd - -from microplex_us.capital_gains_lots import ( - SyntheticCapitalGainsLotConfig, - generate_synthetic_capital_gains_lots, - synthetic_capital_gains_lot_metadata, - validate_capital_gains_lot_anchors, - write_capital_gains_lots_sqlite, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.us import USMicroplexBuildResult - - -def _maybe_write_capital_gains_lot_artifact( - result: USMicroplexBuildResult, - output_dir: Path, -) -> tuple[Path | None, dict[str, Any] | None]: - if ( - not result.config.capital_gains_lots_enabled - or result.policyengine_tables is None - ): - return None, None - persons = result.policyengine_tables.persons - gain_column = "long_term_capital_gains_before_response" - if gain_column not in persons.columns: - return None, { - "enabled": True, - "written": False, - "reason": f"missing {gain_column}", - } - - period = result.config.policyengine_dataset_year or 2024 - lot_config = SyntheticCapitalGainsLotConfig( - random_seed=( - result.config.capital_gains_lots_random_seed - if result.config.capital_gains_lots_random_seed is not None - else result.config.random_seed - ), - 
max_lots_per_person=int(result.config.capital_gains_lots_max_lots_per_person), - ) - lots = generate_synthetic_capital_gains_lots( - persons, - period=period, - config=lot_config, - gain_column=gain_column, - ) - validate_capital_gains_lot_anchors(persons, lots, gain_column=gain_column) - metadata = synthetic_capital_gains_lot_metadata( - lot_config, - period=period, - source_gain_column=gain_column, - ) - nonzero_people = int( - pd.to_numeric(persons[gain_column], errors="coerce").fillna(0.0).ne(0.0).sum() - ) - metadata.update( - { - "person_rows": int(len(persons)), - "nonzero_person_rows": nonzero_people, - "lot_rows": int(len(lots)), - } - ) - path = resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - "capital_gains_lots", - ) - write_capital_gains_lots_sqlite(lots, path, metadata=metadata) - return path, { - "enabled": True, - "written": True, - "path": path.name, - "person_rows": int(len(persons)), - "nonzero_person_rows": nonzero_people, - "lot_rows": int(len(lots)), - "source_gain_column": gain_column, - "max_lots_per_person": int(lot_config.max_lots_per_person), - } diff --git a/src/microplex_us/pipelines/artifact_io.py b/src/microplex_us/pipelines/artifact_io.py deleted file mode 100644 index 3f125435..00000000 --- a/src/microplex_us/pipelines/artifact_io.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Low-level filesystem helpers for saved US Microplex artifacts.""" - -from __future__ import annotations - -import json -from collections.abc import Mapping -from pathlib import Path -from typing import Any - -import pandas as pd - -from microplex_us.pipelines.stage_contracts import ( - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_run import USArtifactRef, USDiagnosticOutput - - -def _stage_artifact_ref( - artifact_root: str | Path, - stage_id: str, - artifact_key: str, - *, - assume_exists: bool = False, -) -> USArtifactRef: - contract = 
get_us_stage_artifact_contract(stage_id, artifact_key) - return USArtifactRef( - key=artifact_key, - path=resolve_us_stage_artifact_contract_path( - artifact_root, - stage_id, - artifact_key, - ), - format=contract.format, - required=contract.required, - resume_role=contract.resume_role, - assume_exists=assume_exists, - ) - - -def _stage_diagnostics( - stage_id: str, - summary: Mapping[str, Any], -) -> dict[str, USDiagnosticOutput]: - return { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description=f"Runtime diagnostic summary for {stage_id}.", - summary=dict(summary), - ) - } - - -def _write_parquet_unless_live_artifact_exists( - path: Path, - frame: pd.DataFrame, - *, - live_artifact: bool, -) -> None: - if live_artifact and path.exists(): - return - path.parent.mkdir(parents=True, exist_ok=True) - frame.to_parquet(path, index=False) - - -def _write_json_unless_live_artifact_exists( - path: Path, - payload: Mapping[str, Any], - *, - live_artifact: bool, -) -> None: - if live_artifact and path.exists(): - return - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - - -def _resolve_saved_artifact_file( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, -) -> Path: - artifacts = dict(manifest.get("artifacts", {})) - filename = artifacts.get(artifact_key) - if not filename: - filename = ( - "targets.json" if artifact_key == "targets" else f"{artifact_key}.parquet" - ) - path = Path(filename) - if not path.is_absolute(): - path = artifact_root / path - if not path.exists(): - raise FileNotFoundError(f"Saved artifact file not found: {path}") - return path - - -def _resolve_optional_saved_artifact_file( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, -) -> Path | None: - artifacts = dict(manifest.get("artifacts", {})) - filename = artifacts.get(artifact_key) - if not filename: - return None - path = Path(str(filename)) - if not 
path.is_absolute(): - path = artifact_root / path - if not path.exists(): - raise FileNotFoundError(f"Saved optional artifact file not found: {path}") - return path - - -def _write_json_atomically(path: Path, payload: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - temp_path = path.with_name(f".{path.name}.tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temp_path.replace(path) diff --git a/src/microplex_us/pipelines/artifact_replay.py b/src/microplex_us/pipelines/artifact_replay.py deleted file mode 100644 index 0e010e35..00000000 --- a/src/microplex_us/pipelines/artifact_replay.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Replay helpers for saved US Microplex artifact bundles.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -import h5py -import pandas as pd -from microplex.targets import TargetProvider - -from microplex_us.pipelines.artifact_io import ( - _resolve_optional_saved_artifact_file, - _resolve_saved_artifact_file, -) -from microplex_us.pipelines.artifact_types import USMicroplexVersionedBuildArtifacts -from microplex_us.pipelines.registry import FrontierMetric -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - USMicroplexTargets, -) -from microplex_us.pipelines.versioned_artifacts import ( - _finalize_versioned_build_artifacts, -) -from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, -) - - -def _facade_pipeline_cls() -> type[USMicroplexPipeline]: - from microplex_us.pipelines import artifacts - - return artifacts.USMicroplexPipeline - - -def _infer_baseline_household_weight_sum( - baseline_dataset: str | Path, - *, - target_period: int, -) -> float | None: - """Best-effort household-weight total inferred from a PE baseline H5.""" - - dataset_path = Path(baseline_dataset).expanduser() - if not dataset_path.exists(): - 
return None - try: - with h5py.File(dataset_path, "r") as handle: - weights = handle.get("household_weight") - if weights is None: - return None - period_key = str(int(target_period)) - if period_key not in weights: - return None - weight_sum = float(weights[period_key][...].sum()) - except (FileNotFoundError, OSError, ValueError): - return None - return weight_sum if weight_sum > 0.0 else None - - -def _refresh_baseline_derived_weight_targets( - config_payload: dict[str, Any], - *, - explicit_override_keys: set[str], -) -> None: - """Refresh derived total-weight knobs after a replay baseline override.""" - - if "policyengine_baseline_dataset" not in explicit_override_keys: - return - baseline_dataset = config_payload.get("policyengine_baseline_dataset") - if baseline_dataset in (None, ""): - return - target_period = int( - config_payload.get("policyengine_target_period") - or config_payload.get("policyengine_dataset_year") - or 2024 - ) - baseline_weight_sum = _infer_baseline_household_weight_sum( - baseline_dataset, - target_period=target_period, - ) - if baseline_weight_sum is None: - return - - if "policyengine_selection_target_total_weight" not in explicit_override_keys: - config_payload["policyengine_selection_target_total_weight"] = ( - baseline_weight_sum - ) - if config_payload.get("calibration_backend") == "none": - return - if config_payload.get("policyengine_calibration_rescale_to_input_weight_sum"): - if ( - "policyengine_calibration_rescale_to_target_total_weight" - not in explicit_override_keys - ): - config_payload["policyengine_calibration_rescale_to_target_total_weight"] = ( - False - ) - if "policyengine_calibration_target_total_weight" not in explicit_override_keys: - config_payload["policyengine_calibration_target_total_weight"] = None - return - if "policyengine_calibration_target_total_weight" not in explicit_override_keys: - config_payload["policyengine_calibration_target_total_weight"] = ( - baseline_weight_sum - ) - if ( - 
"policyengine_calibration_rescale_to_target_total_weight" - not in explicit_override_keys - ): - config_payload["policyengine_calibration_rescale_to_target_total_weight"] = True - - -def replay_us_microplex_policyengine_stage_from_artifact( - artifact_dir: str | Path, - *, - config_overrides: dict[str, Any] | None = None, - policyengine_baseline_dataset: str | Path | None = None, -) -> USMicroplexBuildResult: - """Replay calibration/export inputs from a saved artifact without raw ETL. - - This reloads saved seed and synthetic rows, applies optional runtime config - overrides, and reruns the downstream calibration stage from the saved - synthetic population. For PE-DB builds, this intentionally calls - ``calibrate_policyengine_tables`` even when ``calibration_backend="none"`` - so PE target materialization and export-only variables stay on the same - path as a full pipeline build. - """ - - artifact_root = Path(artifact_dir).expanduser().resolve() - manifest_path = artifact_root / "manifest.json" - if not manifest_path.exists(): - raise FileNotFoundError(f"Saved artifact manifest not found: {manifest_path}") - - manifest = json.loads(manifest_path.read_text()) - config_payload = dict(manifest.get("config", {})) - resolved_config_overrides = dict(config_overrides or {}) - if ( - policyengine_baseline_dataset is not None - and "policyengine_baseline_dataset" not in resolved_config_overrides - ): - resolved_config_overrides["policyengine_baseline_dataset"] = str( - policyengine_baseline_dataset - ) - explicit_override_keys = set(resolved_config_overrides) - config_payload.update(resolved_config_overrides) - _refresh_baseline_derived_weight_targets( - config_payload, - explicit_override_keys=explicit_override_keys, - ) - config = USMicroplexBuildConfig(**config_payload) - - seed_data = pd.read_parquet( - _resolve_saved_artifact_file(artifact_root, manifest, "seed_data") - ) - scaffold_seed_data_path = _resolve_optional_saved_artifact_file( - artifact_root, - manifest, - 
"scaffold_seed_data", - ) - scaffold_seed_data = ( - pd.read_parquet(scaffold_seed_data_path) - if scaffold_seed_data_path is not None - else None - ) - synthetic_data = pd.read_parquet( - _resolve_saved_artifact_file(artifact_root, manifest, "synthetic_data") - ) - targets_payload = json.loads( - _resolve_saved_artifact_file(artifact_root, manifest, "targets").read_text() - ) - targets = USMicroplexTargets( - marginal=dict(targets_payload.get("marginal", {})), - continuous=dict(targets_payload.get("continuous", {})), - ) - - pipeline = _facade_pipeline_cls()(config) - pre_calibration_policyengine_tables = pipeline.build_policyengine_entity_tables( - synthetic_data - ) - if config.policyengine_targets_db is not None: - policyengine_tables, calibrated_data, calibration_summary = ( - pipeline.calibrate_policyengine_tables(pre_calibration_policyengine_tables) - ) - else: - calibrated_data, calibration_summary = pipeline.calibrate( - synthetic_data, - targets, - ) - policyengine_tables = pipeline.build_policyengine_entity_tables(calibrated_data) - - synthesis_metadata = dict(manifest.get("synthesis", {})) - synthesis_metadata["policyengine_stage_replay"] = { - "source_artifact_dir": str(artifact_root), - "source_manifest": str(manifest_path), - "config_override_keys": sorted((config_overrides or {}).keys()), - } - - return USMicroplexBuildResult( - config=config, - seed_data=seed_data, - synthetic_data=synthetic_data, - calibrated_data=calibrated_data, - targets=targets, - calibration_summary=calibration_summary, - synthesis_metadata=synthesis_metadata, - policyengine_tables=policyengine_tables, - pre_calibration_policyengine_tables=pre_calibration_policyengine_tables, - scaffold_seed_data=scaffold_seed_data, - ) - - -def replay_and_save_versioned_us_microplex_policyengine_stage( - artifact_dir: str | Path, - output_root: str | Path | None = None, - *, - config_overrides: dict[str, Any] | None = None, - version_id: str | None = None, - frontier_metric: FrontierMetric = 
"candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = True, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, -) -> USMicroplexVersionedBuildArtifacts: - """Replay a saved artifact's policy stage and persist a new versioned bundle.""" - - artifact_root = Path(artifact_dir).expanduser().resolve() - build_result = replay_us_microplex_policyengine_stage_from_artifact( - artifact_root, - config_overrides=config_overrides, - policyengine_baseline_dataset=policyengine_baseline_dataset, - ) - resolved_output_root = ( - Path(output_root).expanduser().resolve() - if output_root is not None - else artifact_root.parent - ) - replay_metadata = { - "policyengine_stage_replay": True, - "source_artifact_dir": str(artifact_root), - **dict(run_registry_metadata or {}), - } - return _finalize_versioned_build_artifacts( - build_result, - output_root=resolved_output_root, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - 
policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=replay_metadata, - ) diff --git a/src/microplex_us/pipelines/artifact_source_diagnostics.py b/src/microplex_us/pipelines/artifact_source_diagnostics.py deleted file mode 100644 index 08039803..00000000 --- a/src/microplex_us/pipelines/artifact_source_diagnostics.py +++ /dev/null @@ -1,416 +0,0 @@ -"""Source-plan and source-weight diagnostics for saved artifact bundles.""" - -from __future__ import annotations - -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -import pandas as pd - -from microplex_us.data_sources.forbes import ForbesFixedSpineConfig -from microplex_us.pipelines.artifact_io import _write_json_atomically -from microplex_us.pipelines.us import USMicroplexBuildResult - - -def _write_us_source_plan_artifact( - result: USMicroplexBuildResult, - output_path: Path, -) -> None: - synthesis = dict(result.synthesis_metadata) - source_names = tuple( - dict.fromkeys( - value - for value in ( - *list(synthesis.get("source_names", ())), - synthesis.get("scaffold_source"), - ) - if isinstance(value, str) and value - ) - ) - payload = { - "formatVersion": 1, - "stageId": "03_source_planning", - "sourceNames": list(source_names), - "scaffoldSource": synthesis.get("scaffold_source"), - "donorIntegratedVariables": list( - synthesis.get("donor_integrated_variables", ()) - ), - "conditionVars": list(synthesis.get("condition_vars", ())), - "targetVars": 
list(synthesis.get("target_vars", ())), - "donorAuthoritativeOverrideVariables": list( - synthesis.get("donor_authoritative_override_variables", ()) - ), - "donorExcludedVariables": list(synthesis.get("donor_excluded_variables", ())), - } - if result.fusion_plan is not None: - payload["fusionPlan"] = { - "sourceNames": list(result.fusion_plan.source_names), - } - _write_json_atomically(output_path, payload) - - -def _build_source_weight_diagnostics( - result: USMicroplexBuildResult, -) -> dict[str, Any]: - """Summarize source-weight provenance without exporting diagnostics to H5.""" - - entity_summaries = _entity_weight_summaries(result) - household_summary = entity_summaries["households"] - total_household_weight = household_summary["weight_sum"] - source_names = _source_names_for_diagnostics(result) - scaffold_source = _scaffold_source_for_diagnostics(result) - donor_sources = [ - source_name - for source_name in source_names - if scaffold_source is None or source_name != scaffold_source - ] - sources: list[dict[str, Any]] = [] - - fixed_spine_entry = _fixed_spine_source_entry( - result, - total_entity_summaries=entity_summaries, - ) - fixed_entity_summaries = ( - { - entity: { - "count": fixed_spine_entry.get(f"{prefix}_count", 0), - "weight_sum": fixed_spine_entry.get(f"{prefix}_weight_sum", 0.0), - "available": fixed_spine_entry.get(f"{prefix}_weight_sum") is not None, - } - for entity, prefix in _SOURCE_DIAGNOSTIC_ENTITY_PREFIXES.items() - } - if fixed_spine_entry is not None - else {} - ) - ordinary_entity_summaries = _subtract_entity_summaries( - entity_summaries, - fixed_entity_summaries, - ) - - sources.append( - { - "source_name": scaffold_source or "microplex_synthetic_population", - "source_class": "synthetic_population", - "source_role": "scaffold", - "source_names": source_names, - **_source_entity_fields(ordinary_entity_summaries, entity_summaries), - } - ) - - donor_integrated_variables = list( - 
result.synthesis_metadata.get("donor_integrated_variables", ()) - ) - for source_name in donor_sources: - sources.append( - { - "source_name": source_name, - "source_class": "donor_imputation", - "source_role": "donor", - "integrated_variable_count": len(donor_integrated_variables), - "row_contribution": "variables_imputed_into_synthetic_rows", - **_source_entity_fields( - _zero_entity_summaries(), - entity_summaries, - ), - } - ) - - if fixed_spine_entry is not None: - sources.append(fixed_spine_entry) - - numeric_shares = [ - float(source["household_weight_share"]) - for source in sources - if isinstance(source.get("household_weight_share"), int | float) - ] - summary = { - "diagnostic_scope": "saved_artifact_entity_weight_by_source_rows", - "household_count": household_summary["count"], - "total_household_weight": total_household_weight, - "person_count": entity_summaries["persons"]["count"], - "total_person_weight": entity_summaries["persons"]["weight_sum"], - "tax_unit_count": entity_summaries["tax_units"]["count"], - "total_tax_unit_weight": entity_summaries["tax_units"]["weight_sum"], - "source_entry_count": len(sources), - "donor_source_count": len(donor_sources), - "donor_integrated_variable_count": len(donor_integrated_variables), - "support_rows_appended": False, - "donor_rows_appended": False, - "support_household_weight_sum": 0.0, - "support_household_weight_share": 0.0, - "puf_support_household_weight_sum": 0.0, - "puf_support_household_weight_share": 0.0, - "max_source_household_weight_share": ( - max(numeric_shares) if numeric_shares else None - ), - "fixed_spine_enabled": bool( - isinstance(result.calibration_summary.get("fixed_spine"), dict) - and result.calibration_summary.get("fixed_spine", {}).get("enabled") - ), - "h5_exported": False, - } - - return { - "formatVersion": 1, - "created_at": datetime.now(UTC).isoformat(), - "summary": summary, - "sources": sources, - "notes": [ - "Donor sources contribute imputed variables to synthetic rows; 
they are not appended as weighted source rows.", - "Source diagnostics are written as a sidecar and are intentionally not exported into PolicyEngine H5 variables.", - ], - } - - -_SOURCE_DIAGNOSTIC_ENTITY_PREFIXES = { - "households": "household", - "persons": "person", - "tax_units": "tax_unit", -} - - -def _entity_weight_summaries( - result: USMicroplexBuildResult, -) -> dict[str, dict[str, Any]]: - summaries = _zero_entity_summaries() - if result.policyengine_tables is not None: - for entity in _SOURCE_DIAGNOSTIC_ENTITY_PREFIXES: - frame, weights = _policyengine_entity_weights(result, entity) - if frame is None or weights is None: - continue - summaries[entity] = { - "count": int(len(frame)), - "weight_sum": float(weights.sum()), - "available": True, - } - return summaries - - frame = result.calibrated_data - if frame.empty: - return summaries - weight_column = ( - "household_weight" if "household_weight" in frame.columns else "weight" - ) - if weight_column not in frame.columns: - summaries["persons"] = { - "count": int(len(frame)), - "weight_sum": 0.0, - "available": False, - } - return summaries - - weights = pd.to_numeric(frame[weight_column], errors="coerce").fillna(0.0) - summaries["persons"] = { - "count": int(len(frame)), - "weight_sum": float(weights.sum()), - "available": True, - } - if "household_id" in frame.columns: - household_weights = weights.groupby(frame["household_id"], sort=False).first() - summaries["households"] = { - "count": int(len(household_weights)), - "weight_sum": float(household_weights.sum()), - "available": True, - } - return summaries - - -def _zero_entity_summaries() -> dict[str, dict[str, Any]]: - return { - entity: {"count": 0, "weight_sum": 0.0, "available": False} - for entity in _SOURCE_DIAGNOSTIC_ENTITY_PREFIXES - } - - -def _subtract_entity_summaries( - total: dict[str, dict[str, Any]], - subtract: dict[str, dict[str, Any]], -) -> dict[str, dict[str, Any]]: - result: dict[str, dict[str, Any]] = {} - for entity in 
_SOURCE_DIAGNOSTIC_ENTITY_PREFIXES: - total_summary = total.get(entity, {}) - subtract_summary = subtract.get(entity, {}) - total_count = int(total_summary.get("count", 0) or 0) - subtract_count = int(subtract_summary.get("count", 0) or 0) - total_weight = float(total_summary.get("weight_sum", 0.0) or 0.0) - subtract_weight = float(subtract_summary.get("weight_sum", 0.0) or 0.0) - result[entity] = { - "count": max(total_count - subtract_count, 0), - "weight_sum": max(total_weight - subtract_weight, 0.0), - "available": bool(total_summary.get("available", False)), - } - return result - - -def _source_entity_fields( - source: dict[str, dict[str, Any]], - total: dict[str, dict[str, Any]], -) -> dict[str, Any]: - fields: dict[str, Any] = {} - for entity, prefix in _SOURCE_DIAGNOSTIC_ENTITY_PREFIXES.items(): - source_summary = source.get(entity, {}) - total_summary = total.get(entity, {}) - source_weight = source_summary.get("weight_sum") - fields[f"{prefix}_count"] = int(source_summary.get("count", 0) or 0) - fields[f"{prefix}_weight_sum"] = ( - float(source_weight) if source_weight is not None else None - ) - fields[f"{prefix}_weight_share"] = _weight_share( - float(source_weight or 0.0), - float(total_summary.get("weight_sum", 0.0) or 0.0), - ) - return fields - - -def _policyengine_entity_weights( - result: USMicroplexBuildResult, - entity: str, -) -> tuple[pd.DataFrame | None, pd.Series | None]: - tables = result.policyengine_tables - if tables is None: - return None, None - households = tables.households - if households is None or "household_weight" not in households.columns: - household_weight_by_id = None - else: - household_weights = pd.to_numeric( - households["household_weight"], - errors="coerce", - ).fillna(0.0) - household_weight_by_id = pd.Series( - household_weights.to_numpy(dtype=float), - index=households["household_id"], - ) - if entity == "households": - if households is None or household_weight_by_id is None: - return None, None - return households, 
household_weights - if entity == "persons": - return _frame_and_entity_weights( - tables.persons, - direct_weight_columns=("weight", "person_weight", "household_weight"), - household_weight_by_id=household_weight_by_id, - ) - if entity == "tax_units": - return _frame_and_entity_weights( - tables.tax_units, - direct_weight_columns=("tax_unit_weight", "household_weight"), - household_weight_by_id=household_weight_by_id, - ) - return None, None - - -def _frame_and_entity_weights( - frame: pd.DataFrame | None, - *, - direct_weight_columns: tuple[str, ...], - household_weight_by_id: pd.Series | None, -) -> tuple[pd.DataFrame | None, pd.Series | None]: - if frame is None: - return None, None - for column in direct_weight_columns: - if column in frame.columns: - return ( - frame, - pd.to_numeric(frame[column], errors="coerce").fillna(0.0), - ) - if household_weight_by_id is not None and "household_id" in frame.columns: - return ( - frame, - frame["household_id"].map(household_weight_by_id).fillna(0.0), - ) - return frame, pd.Series(0.0, index=frame.index, dtype=float) - - -def _source_names_for_diagnostics(result: USMicroplexBuildResult) -> list[str]: - synthesis = dict(result.synthesis_metadata) - names: list[str] = [] - if result.fusion_plan is not None: - names.extend(str(name) for name in result.fusion_plan.source_names) - names.extend(str(name) for name in synthesis.get("source_names", ()) if name) - scaffold_source = synthesis.get("scaffold_source") - if scaffold_source: - names.append(str(scaffold_source)) - for frame in result.source_frames: - source = getattr(frame, "source", None) - source_name = getattr(source, "name", None) - if source_name: - names.append(str(source_name)) - return list(dict.fromkeys(names)) - - -def _scaffold_source_for_diagnostics(result: USMicroplexBuildResult) -> str | None: - scaffold_source = result.synthesis_metadata.get("scaffold_source") - if scaffold_source: - return str(scaffold_source) - source_names = 
_source_names_for_diagnostics(result) - return source_names[0] if source_names else None - - -def _fixed_spine_source_entry( - result: USMicroplexBuildResult, - *, - total_entity_summaries: dict[str, dict[str, Any]], -) -> dict[str, Any] | None: - fixed_spine = result.calibration_summary.get("fixed_spine") - if not isinstance(fixed_spine, dict) or not fixed_spine.get("enabled"): - return None - - source_metadata = dict(fixed_spine.get("source_metadata", {})) - entry: dict[str, Any] = { - "source_name": source_metadata.get("source", "forbes_fixed_spine"), - "source_class": "fixed_spine", - "source_role": "post_calibration_append", - "source_metadata": source_metadata, - } - fixed_spine_config = ForbesFixedSpineConfig() - fixed_entity_summaries = _fixed_spine_entity_summaries( - result, - fixed_spine_config=fixed_spine_config, - ) - entry.update( - { - **_source_entity_fields( - fixed_entity_summaries, - total_entity_summaries, - ), - "household_id_detection": { - "method": "forbes_default_household_id_floor", - "minimum_household_id": fixed_spine_config.household_id_start, - }, - } - ) - return entry - - -def _fixed_spine_entity_summaries( - result: USMicroplexBuildResult, - *, - fixed_spine_config: ForbesFixedSpineConfig, -) -> dict[str, dict[str, Any]]: - summaries = _zero_entity_summaries() - id_floors = { - "households": ("household_id", fixed_spine_config.household_id_start), - "persons": ("person_id", fixed_spine_config.person_id_start), - "tax_units": ("tax_unit_id", fixed_spine_config.tax_unit_id_start), - } - for entity, (id_column, id_floor) in id_floors.items(): - frame, weights = _policyengine_entity_weights(result, entity) - if frame is None or weights is None or id_column not in frame.columns: - continue - ids = pd.to_numeric(frame[id_column], errors="coerce") - fixed_mask = ids >= id_floor - fixed_weights = weights.loc[fixed_mask] - summaries[entity] = { - "count": int(fixed_mask.sum()), - "weight_sum": float(fixed_weights.sum()), - "available": True, 
- } - return summaries - - -def _weight_share(value: float, denominator: float) -> float | None: - if denominator <= 0: - return None - return float(value) / float(denominator) diff --git a/src/microplex_us/pipelines/artifact_types.py b/src/microplex_us/pipelines/artifact_types.py deleted file mode 100644 index d28fcf6d..00000000 --- a/src/microplex_us/pipelines/artifact_types.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Shared result types for saved US Microplex artifact bundles.""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.us import USMicroplexBuildResult - - -@dataclass(frozen=True) -class USMicroplexArtifactPaths: - """Filesystem locations for persisted pipeline artifacts.""" - - output_dir: Path - seed_data: Path - synthetic_data: Path - calibrated_data: Path - targets: Path - manifest: Path - version_id: str | None = None - scaffold_seed_data: Path | None = None - synthesizer: Path | None = None - policyengine_dataset: Path | None = None - data_flow_snapshot: Path | None = None - stage_manifest: Path | None = None - artifact_inventory: Path | None = None - conditional_readiness: Path | None = None - source_plan: Path | None = None - pre_calibration_policyengine_entity_tables: Path | None = None - policyengine_entity_tables: Path | None = None - calibration_summary: Path | None = None - validation_evidence: Path | None = None - policyengine_harness: Path | None = None - policyengine_native_scores: Path | None = None - policyengine_native_audit: Path | None = None - policyengine_native_target_diagnostics: Path | None = None - child_tax_unit_agi_drift: Path | None = None - capital_gains_lots: Path | None = None - source_weight_diagnostics: Path | None = None - run_registry: Path | None = None - run_index_db: Path | None = None - - -@dataclass(frozen=True) -class USMicroplexVersionedBuildArtifacts: - """End-to-end build, save, and frontier-tracking 
result.""" - - build_result: USMicroplexBuildResult - artifact_paths: USMicroplexArtifactPaths - current_entry: Any | None = None - frontier_entry: Any | None = None - frontier_delta: float | None = None diff --git a/src/microplex_us/pipelines/artifact_validation.py b/src/microplex_us/pipelines/artifact_validation.py deleted file mode 100644 index ced295d5..00000000 --- a/src/microplex_us/pipelines/artifact_validation.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Validation and benchmark artifact helpers for saved US Microplex bundles.""" - -from __future__ import annotations - -from collections.abc import Mapping -from importlib.metadata import PackageNotFoundError, version -from pathlib import Path -from typing import Any - -from microplex.targets import TargetProvider - -from microplex_us.pipelines.us import USMicroplexBuildResult -from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, - default_policyengine_us_db_all_target_slices, - default_policyengine_us_harness_slices, - filter_nonempty_policyengine_us_harness_slices, -) -from microplex_us.policyengine.us import PolicyEngineUSDBTargetProvider - - -def _stage9_benchmark_summary(manifest: Mapping[str, Any]) -> dict[str, Any]: - summary: dict[str, Any] = {} - for key in ( - "policyengine_harness", - "policyengine_native_scores", - "policyengine_native_audit", - "imputation_ablation", - ): - value = manifest.get(key) - if isinstance(value, Mapping): - summary[key] = dict(value) - diagnostics = manifest.get("diagnostics") - if isinstance(diagnostics, Mapping): - for key in ("child_tax_unit_agi_drift", "capital_gains_lots"): - value = diagnostics.get(key) - if isinstance(value, Mapping): - summary[key] = dict(value) - return summary - - -def _summarize_child_tax_unit_agi_drift_ratios( - payload: dict[str, Any], - *, - stage: str, - variables: tuple[str, ...], -) -> dict[str, Any]: - stages = dict(payload.get("stages", {})) - stage_payload = dict(stages.get(stage, 
{})) - subsets = dict(stage_payload.get("subsets", {})) - adults = dict(subsets.get("adults", {})) - dependents = dict(subsets.get("dependents_under_20", {})) - ratios: dict[str, float | None] = {} - for variable in variables: - adult_sum = adults.get(variable, {}).get("sum") - child_sum = dependents.get(variable, {}).get("sum") - if adult_sum in (None, 0): - ratios[variable] = None - else: - ratios[variable] = float(child_sum or 0.0) / float(adult_sum) - return { - "stage": stage, - "dependents_under_20_sum_share": ratios, - } - - -def _resolve_policyengine_harness_context( - result: USMicroplexBuildResult, - *, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None, - policyengine_target_provider: TargetProvider | None, - policyengine_baseline_dataset: str | Path | None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ), - policyengine_harness_metadata: dict[str, Any] | None, -) -> tuple[ - TargetProvider | None, - str | Path | None, - tuple[PolicyEngineUSHarnessSlice, ...], - dict[str, Any], -]: - resolved_target_provider = policyengine_target_provider - if ( - resolved_target_provider is None - and result.config.policyengine_targets_db is not None - ): - resolved_target_provider = PolicyEngineUSDBTargetProvider( - result.config.policyengine_targets_db - ) - - resolved_baseline_dataset = ( - policyengine_baseline_dataset or result.config.policyengine_baseline_dataset - ) - - harness_period = result.config.policyengine_dataset_year or 2024 - if policyengine_harness_slices is not None: - resolved_harness_slices = tuple(policyengine_harness_slices) - elif result.config.policyengine_targets_db is not None: - resolved_harness_slices = default_policyengine_us_db_all_target_slices( - period=harness_period, - reform_id=result.config.policyengine_target_reform_id, - ) - else: - resolved_harness_slices = default_policyengine_us_harness_slices( - period=harness_period - ) - if 
resolved_target_provider is not None and resolved_harness_slices: - resolved_harness_slices = filter_nonempty_policyengine_us_harness_slices( - resolved_target_provider, - resolved_harness_slices, - cache=policyengine_comparison_cache, - ) - - resolved_harness_metadata = { - "baseline_dataset": ( - Path(resolved_baseline_dataset).name - if resolved_baseline_dataset is not None - else None - ), - "targets_db": ( - Path(result.config.policyengine_targets_db).name - if result.config.policyengine_targets_db is not None - else None - ), - "target_period": result.config.policyengine_target_period, - "target_variables": list(result.config.policyengine_target_variables), - "target_domains": list(result.config.policyengine_target_domains), - "target_geo_levels": list(result.config.policyengine_target_geo_levels), - "target_profile": result.config.policyengine_target_profile, - "calibration_target_profile": ( - result.config.policyengine_calibration_target_profile - ), - "target_reform_id": result.config.policyengine_target_reform_id, - "harness_slice_names": [ - slice_spec.name for slice_spec in resolved_harness_slices - ], - "policyengine_us_runtime_version": _resolve_policyengine_us_runtime_version(), - "harness_suite": ( - "policyengine_us_all_targets" - if result.config.policyengine_targets_db is not None - and policyengine_harness_slices is None - else None - ), - **dict(policyengine_harness_metadata or {}), - } - return ( - resolved_target_provider, - resolved_baseline_dataset, - resolved_harness_slices, - resolved_harness_metadata, - ) - - -def _resolve_policyengine_us_runtime_version() -> str | None: - try: - return version("policyengine-us") - except PackageNotFoundError: - return None diff --git a/src/microplex_us/pipelines/artifacts.py b/src/microplex_us/pipelines/artifacts.py deleted file mode 100644 index 3cb464e7..00000000 --- a/src/microplex_us/pipelines/artifacts.py +++ /dev/null @@ -1,812 +0,0 @@ -"""Artifact persistence for production pipeline outputs.""" 
- -from __future__ import annotations - -import json -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -from microplex.targets import ( - TargetProvider, - assert_valid_benchmark_artifact_manifest, -) - -from microplex_us.pipelines.artifact_dataset_assembly import ( - _maybe_write_capital_gains_lot_artifact, -) -from microplex_us.pipelines.artifact_io import ( - _stage_artifact_ref, - _stage_diagnostics, - _write_json_atomically, - _write_json_unless_live_artifact_exists, - _write_parquet_unless_live_artifact_exists, -) -from microplex_us.pipelines.artifact_replay import ( - replay_and_save_versioned_us_microplex_policyengine_stage, - replay_us_microplex_policyengine_stage_from_artifact, -) -from microplex_us.pipelines.artifact_source_diagnostics import ( - _build_source_weight_diagnostics, - _write_us_source_plan_artifact, -) -from microplex_us.pipelines.artifact_types import ( - USMicroplexArtifactPaths, - USMicroplexVersionedBuildArtifacts, -) -from microplex_us.pipelines.artifact_validation import ( - _resolve_policyengine_harness_context, - _stage9_benchmark_summary, - _summarize_child_tax_unit_agi_drift_ratios, -) -from microplex_us.pipelines.index_db import ( - append_us_microplex_run_index_entry, -) -from microplex_us.pipelines.pe_native_scores import ( - compute_us_pe_native_scores, -) -from microplex_us.pipelines.registry import ( - append_us_microplex_run_registry_entry, - build_us_microplex_run_registry_entry, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest import ( - write_us_policyengine_entity_stage_artifact, - write_us_validation_evidence_manifest, -) -from microplex_us.pipelines.stage_run import ( - USDatasetAssemblyOutputs, - USStageInputOverride, - USValidationBenchmarkingOutputs, - write_us_stage_run_manifests_from_artifact_manifest, -) -from microplex_us.pipelines.stage_runtime import USStageRuntimeWriter 
-from microplex_us.pipelines.summarize_child_tax_unit_agi_drift import ( - DEFAULT_VARIABLES as DEFAULT_CHILD_TAX_UNIT_AGI_DRIFT_VARIABLES, -) -from microplex_us.pipelines.summarize_child_tax_unit_agi_drift import ( - summarize_child_tax_unit_agi_drift, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildResult, - USMicroplexPipeline, -) -from microplex_us.pipelines.versioned_artifacts import ( - _allocate_versioned_output_dir, - _allocate_versioned_output_dir_for_config, - _finalize_versioned_build_artifacts, - _initialize_versioned_stage_runtime_writer, - _json_ready, - _json_ready_query, - _provider_query_plan, - _registry_metric_value, - _short_config_hash, - build_and_save_versioned_us_microplex, - build_and_save_versioned_us_microplex_from_data_dir, - build_and_save_versioned_us_microplex_from_source_provider, - build_and_save_versioned_us_microplex_from_source_providers, - save_versioned_us_microplex_artifacts, - save_versioned_us_microplex_build_result, -) -from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, - evaluate_policyengine_us_harness, -) - -__all__ = [ - "USMicroplexArtifactPaths", - "USMicroplexVersionedBuildArtifacts", - "_allocate_versioned_output_dir", - "_allocate_versioned_output_dir_for_config", - "_finalize_versioned_build_artifacts", - "_initialize_versioned_stage_runtime_writer", - "_json_ready", - "_json_ready_query", - "_maybe_write_capital_gains_lot_artifact", - "_provider_query_plan", - "_registry_metric_value", - "_short_config_hash", - "build_and_save_versioned_us_microplex", - "build_and_save_versioned_us_microplex_from_data_dir", - "build_and_save_versioned_us_microplex_from_source_provider", - "build_and_save_versioned_us_microplex_from_source_providers", - "replay_and_save_versioned_us_microplex_policyengine_stage", - "replay_us_microplex_policyengine_stage_from_artifact", - "save_us_microplex_artifacts", - "save_versioned_us_microplex_artifacts", - 
"save_versioned_us_microplex_build_result", -] - - -def save_us_microplex_artifacts( - result: USMicroplexBuildResult, - output_dir: str | Path, - *, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), - stage_runtime_writer: USStageRuntimeWriter | None = None, -) -> USMicroplexArtifactPaths: - """Persist a build result as a reproducible artifact bundle.""" - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - scaffold_seed_data = ( - result.scaffold_seed_data - if result.scaffold_seed_data is not None - else result.seed_data - ) - pre_calibration_policyengine_tables = result.pre_calibration_policyengine_tables - - seed_data_path = resolve_us_stage_artifact_contract_path( - output_dir, - "05_donor_integration_synthesis", - "seed_data", - ) - synthetic_data_path = resolve_us_stage_artifact_contract_path( - output_dir, - "05_donor_integration_synthesis", - "synthetic_data", - ) - calibrated_data_path = resolve_us_stage_artifact_contract_path( - output_dir, - "07_calibration", - "calibrated_data", - ) - targets_path = resolve_us_stage_artifact_contract_path( - output_dir, - "07_calibration", - "targets", - ) - manifest_path = resolve_us_stage_artifact_contract_path( - output_dir, - "01_run_profile", - "manifest", - ) - source_weight_diagnostics_path = resolve_us_stage_artifact_contract_path( - output_dir, - "05_donor_integration_synthesis", - "source_weight_diagnostics", - ) - synthesizer_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "05_donor_integration_synthesis", - "synthesizer", - ) - if result.synthesizer - else None - ) - policyengine_dataset_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - "policyengine_dataset", - ) - if result.policyengine_tables is not None - else None - ) - data_flow_snapshot_path = resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - "data_flow_snapshot", - ) - stage_manifest_path = resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - "stage_manifest", - ) - artifact_inventory_path = resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - 
"artifact_inventory", - ) - conditional_readiness_path = resolve_us_stage_artifact_contract_path( - output_dir, - "08_dataset_assembly", - "conditional_readiness", - ) - source_plan_path = resolve_us_stage_artifact_contract_path( - output_dir, - "03_source_planning", - "source_plan", - ) - scaffold_seed_data_path = resolve_us_stage_artifact_contract_path( - output_dir, - "04_seed_scaffold", - "scaffold_seed_data", - ) - policyengine_entity_tables_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "07_calibration", - "policyengine_entity_tables", - ) - if result.policyengine_tables is not None - else None - ) - calibration_summary_path = resolve_us_stage_artifact_contract_path( - output_dir, - "07_calibration", - "calibration_summary", - ) - pre_calibration_policyengine_entity_tables_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - ) - if pre_calibration_policyengine_tables is not None - else None - ) - validation_evidence_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "validation_evidence", - ) - if result.policyengine_tables is not None - else None - ) - policyengine_harness_path = None - policyengine_native_scores_path = None - resolved_run_registry_path = None - resolved_run_index_path = None - harness_payload = None - live_artifacts = stage_runtime_writer is not None - - if stage_runtime_writer is not None: - stage_runtime_writer.start_stage("08_dataset_assembly") - - try: - _write_parquet_unless_live_artifact_exists( - scaffold_seed_data_path, - scaffold_seed_data, - live_artifact=live_artifacts, - ) - _write_parquet_unless_live_artifact_exists( - seed_data_path, - result.seed_data, - live_artifact=live_artifacts, - ) - _write_parquet_unless_live_artifact_exists( - synthetic_data_path, - result.synthetic_data, - live_artifact=live_artifacts, - ) - _write_parquet_unless_live_artifact_exists( - 
calibrated_data_path, - result.calibrated_data, - live_artifact=live_artifacts, - ) - _write_json_unless_live_artifact_exists( - targets_path, - { - "marginal": result.targets.marginal, - "continuous": result.targets.continuous, - }, - live_artifact=live_artifacts, - ) - - if result.synthesizer is not None and synthesizer_path is not None: - result.synthesizer.save(synthesizer_path) - - if not (live_artifacts and source_plan_path.exists()): - _write_us_source_plan_artifact(result, source_plan_path) - if not (live_artifacts and calibration_summary_path.exists()): - _write_json_atomically(calibration_summary_path, result.calibration_summary) - source_weight_diagnostics_payload = _build_source_weight_diagnostics(result) - _write_json_atomically( - source_weight_diagnostics_path, - source_weight_diagnostics_payload, - ) - - if ( - pre_calibration_policyengine_entity_tables_path is not None - and pre_calibration_policyengine_tables is not None - ): - if not ( - live_artifacts - and pre_calibration_policyengine_entity_tables_path.exists() - ): - write_us_policyengine_entity_stage_artifact( - pre_calibration_policyengine_tables, - output_dir, - stage_id="06_policyengine_entities", - artifact_key="pre_calibration_policyengine_entity_tables", - checkpoint_stage="post_microsim", - ) - if ( - policyengine_entity_tables_path is not None - and result.policyengine_tables is not None - ): - if not (live_artifacts and policyengine_entity_tables_path.exists()): - write_us_policyengine_entity_stage_artifact( - result.policyengine_tables, - output_dir, - stage_id="07_calibration", - artifact_key="policyengine_entity_tables", - checkpoint_stage="post_calibration", - ) - if ( - result.policyengine_tables is not None - and policyengine_dataset_path is not None - ): - period = result.config.policyengine_dataset_year or 2024 - USMicroplexPipeline(result.config).export_policyengine_dataset( - result, - policyengine_dataset_path, - period=period, - ) - capital_gains_lots_path, 
capital_gains_lots_summary = ( - _maybe_write_capital_gains_lot_artifact(result, output_dir) - ) - - if stage_runtime_writer is not None: - stage_runtime_writer.complete_stage( - USDatasetAssemblyOutputs( - policyengine_dataset=( - _stage_artifact_ref( - output_dir, - "08_dataset_assembly", - "policyengine_dataset", - ) - if policyengine_dataset_path is not None - else None - ), - stage_manifest=_stage_artifact_ref( - output_dir, - "08_dataset_assembly", - "stage_manifest", - assume_exists=True, - ), - data_flow_snapshot=_stage_artifact_ref( - output_dir, - "08_dataset_assembly", - "data_flow_snapshot", - assume_exists=True, - ), - artifact_inventory=_stage_artifact_ref( - output_dir, - "08_dataset_assembly", - "artifact_inventory", - assume_exists=True, - ), - conditional_readiness=_stage_artifact_ref( - output_dir, - "08_dataset_assembly", - "conditional_readiness", - assume_exists=True, - ), - diagnostics=_stage_diagnostics( - "08_dataset_assembly", - { - "policyengine_dataset": ( - str(policyengine_dataset_path.relative_to(output_dir)) - if policyengine_dataset_path is not None - else None - ), - "has_capital_gains_lots": ( - capital_gains_lots_path is not None - ), - }, - ), - ) - ) - except Exception as exc: - if stage_runtime_writer is not None: - stage_runtime_writer.fail_stage("08_dataset_assembly", exc) - raise - - try: - if stage_runtime_writer is not None: - stage_runtime_writer.start_stage("09_validation_benchmarking") - - ( - resolved_target_provider, - resolved_baseline_dataset, - resolved_harness_slices, - resolved_harness_metadata, - ) = _resolve_policyengine_harness_context( - result, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - ) - - harness_summary = None - native_scores_payload = ( - 
dict(precomputed_policyengine_native_scores) - if precomputed_policyengine_native_scores is not None - else None - ) - if precomputed_policyengine_harness_payload is not None: - harness_payload = dict(precomputed_policyengine_harness_payload) - policyengine_harness_path = resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "policyengine_harness", - ) - policyengine_harness_path.write_text( - json.dumps(harness_payload, indent=2, sort_keys=True) - ) - harness_summary = harness_payload.get("summary") - elif ( - not defer_policyengine_harness - and result.policyengine_tables is not None - and resolved_target_provider is not None - and resolved_baseline_dataset is not None - and resolved_harness_slices - ): - harness_period = result.config.policyengine_dataset_year or 2024 - harness_run = evaluate_policyengine_us_harness( - result.policyengine_tables, - resolved_target_provider, - resolved_harness_slices, - baseline_dataset=str(resolved_baseline_dataset), - dataset_year=harness_period, - simulation_cls=result.config.policyengine_simulation_cls, - candidate_label="microplex", - baseline_label="policyengine_us_data", - metadata=resolved_harness_metadata, - cache=policyengine_comparison_cache, - ) - policyengine_harness_path = resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "policyengine_harness", - ) - harness_run.save(policyengine_harness_path) - harness_payload = harness_run.to_dict() - harness_summary = harness_payload["summary"] - - if native_scores_payload is not None: - policyengine_native_scores_path = resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - policyengine_native_scores_path.write_text( - json.dumps(native_scores_payload, indent=2, sort_keys=True) - ) - elif ( - not defer_policyengine_native_score - and policyengine_dataset_path is not None - and resolved_baseline_dataset is not None - ): - try: - 
native_scores_payload = compute_us_pe_native_scores( - candidate_dataset_path=policyengine_dataset_path, - baseline_dataset_path=resolved_baseline_dataset, - period=result.config.policyengine_dataset_year or 2024, - policyengine_us_data_repo=policyengine_us_data_repo, - ) - policyengine_native_scores_path = ( - resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - ) - policyengine_native_scores_path.write_text( - json.dumps(native_scores_payload, indent=2, sort_keys=True) - ) - except Exception: - if require_policyengine_native_score: - raise - - child_tax_unit_agi_drift_path = None - child_tax_unit_agi_drift_summary: dict[str, Any] | None = None - if enable_child_tax_unit_agi_drift: - try: - drift_path = resolve_us_stage_artifact_contract_path( - output_dir, - "09_validation_benchmarking", - "child_tax_unit_agi_drift", - ) - variables = ( - child_tax_unit_agi_drift_variables - or DEFAULT_CHILD_TAX_UNIT_AGI_DRIFT_VARIABLES - ) - payload = summarize_child_tax_unit_agi_drift( - output_dir, - variables=variables, - ) - drift_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - child_tax_unit_agi_drift_path = drift_path - child_tax_unit_agi_drift_summary = ( - _summarize_child_tax_unit_agi_drift_ratios( - payload, - stage="calibrated", - variables=variables, - ) - ) - except Exception as exc: # pragma: no cover - diagnostic best-effort - child_tax_unit_agi_drift_summary = { - "error": f"{type(exc).__name__}: {exc}", - } - - manifest = { - "created_at": datetime.now(UTC).isoformat(), - "config": result.config.to_dict(), - "rows": { - "seed": int(len(result.seed_data)), - "synthetic": int(len(result.synthetic_data)), - "calibrated": int(len(result.calibrated_data)), - }, - "weights": { - "nonzero": result.n_nonzero_weights, - "total": result.total_weighted_population, - }, - "targets": { - "n_marginal_groups": len(result.targets.marginal), - "n_continuous": len(result.targets.continuous), 
- }, - "synthesis": result.synthesis_metadata, - "calibration": result.calibration_summary, - "artifacts": { - "seed_data": seed_data_path.name, - "scaffold_seed_data": str( - scaffold_seed_data_path.relative_to(output_dir) - ), - "synthetic_data": synthetic_data_path.name, - "calibrated_data": calibrated_data_path.name, - "targets": targets_path.name, - "synthesizer": synthesizer_path.name if synthesizer_path else None, - "source_plan": str(source_plan_path.relative_to(output_dir)), - "source_weight_diagnostics": source_weight_diagnostics_path.name, - "calibration_summary": str( - calibration_summary_path.relative_to(output_dir) - ), - "pre_calibration_policyengine_entity_tables": ( - str( - pre_calibration_policyengine_entity_tables_path.relative_to( - output_dir - ) - ) - if pre_calibration_policyengine_entity_tables_path is not None - and pre_calibration_policyengine_entity_tables_path.exists() - else None - ), - "policyengine_entity_tables": ( - str(policyengine_entity_tables_path.relative_to(output_dir)) - if policyengine_entity_tables_path is not None - else None - ), - "policyengine_dataset": ( - policyengine_dataset_path.name - if policyengine_dataset_path - else None - ), - "data_flow_snapshot": data_flow_snapshot_path.name, - "stage_manifest": stage_manifest_path.name, - "artifact_inventory": str( - artifact_inventory_path.relative_to(output_dir) - ), - "conditional_readiness": str( - conditional_readiness_path.relative_to(output_dir) - ), - "validation_evidence": ( - str(validation_evidence_path.relative_to(output_dir)) - if validation_evidence_path is not None - else None - ), - "policyengine_harness": ( - policyengine_harness_path.name - if policyengine_harness_path - else None - ), - "policyengine_native_scores": ( - policyengine_native_scores_path.name - if policyengine_native_scores_path is not None - else None - ), - "capital_gains_lots": ( - capital_gains_lots_path.name - if capital_gains_lots_path is not None - else None - ), - }, - } - if 
harness_summary is not None: - manifest["policyengine_harness"] = harness_summary - if native_scores_payload is not None: - manifest["policyengine_native_scores"] = dict( - native_scores_payload.get("summary", {}) - ) - if child_tax_unit_agi_drift_path is not None: - manifest["artifacts"]["child_tax_unit_agi_drift"] = ( - child_tax_unit_agi_drift_path.name - ) - if child_tax_unit_agi_drift_summary is not None: - manifest.setdefault("diagnostics", {})["child_tax_unit_agi_drift"] = ( - child_tax_unit_agi_drift_summary - ) - if capital_gains_lots_summary is not None: - manifest.setdefault("diagnostics", {})["capital_gains_lots"] = ( - capital_gains_lots_summary - ) - manifest.setdefault("diagnostics", {})["source_weight_diagnostics"] = dict( - source_weight_diagnostics_payload.get("summary", {}) - ) - if harness_summary is not None or native_scores_payload is not None: - resolved_run_registry_path = Path( - run_registry_path or output_dir.parent / "run_registry.jsonl" - ) - run_entry = build_us_microplex_run_registry_entry( - artifact_dir=output_dir, - manifest_path=manifest_path, - manifest=manifest, - policyengine_harness_path=policyengine_harness_path, - policyengine_harness_payload=harness_payload, - metadata=dict(run_registry_metadata or {}), - ) - recorded_entry = append_us_microplex_run_registry_entry( - resolved_run_registry_path, - run_entry, - ) - resolved_run_index_path = append_us_microplex_run_index_entry( - run_index_path or output_dir.parent, - recorded_entry, - policyengine_harness_payload=harness_payload, - ) - manifest["run_registry"] = { - "path": str(resolved_run_registry_path), - "artifact_id": recorded_entry.artifact_id, - "improved_candidate_frontier": recorded_entry.improved_candidate_frontier, - "improved_delta_frontier": recorded_entry.improved_delta_frontier, - "improved_composite_frontier": recorded_entry.improved_composite_frontier, - "improved_native_frontier": recorded_entry.improved_native_frontier, - "default_frontier_metric": ( - 
"enhanced_cps_native_loss_delta" - if native_scores_payload is not None - else "candidate_composite_parity_loss" - ), - } - manifest["run_index"] = { - "path": str(resolved_run_index_path), - "artifact_id": recorded_entry.artifact_id, - } - if stage_runtime_writer is not None: - stage_runtime_writer.manifest_payload = manifest - stage9_summary = _stage9_benchmark_summary(manifest) - if stage9_summary: - if validation_evidence_path is not None: - write_us_validation_evidence_manifest( - output_dir, - validation_evidence_path, - manifest_payload=manifest, - ) - stage_runtime_writer.complete_stage( - USValidationBenchmarkingOutputs( - validation_evidence=_stage_artifact_ref( - output_dir, - "09_validation_benchmarking", - "validation_evidence", - ), - benchmark_summary=stage9_summary, - policyengine_harness=( - _stage_artifact_ref( - output_dir, - "09_validation_benchmarking", - "policyengine_harness", - ) - if policyengine_harness_path is not None - else None - ), - policyengine_native_scores=( - _stage_artifact_ref( - output_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - if policyengine_native_scores_path is not None - else None - ), - diagnostics=_stage_diagnostics( - "09_validation_benchmarking", - stage9_summary, - ), - ) - ) - else: - stage_runtime_writer.defer_stage( - "09_validation_benchmarking", - "No validation or benchmark evidence was configured for this run.", - ) - manifest = stage_runtime_writer.finalize_from_artifact_manifest(manifest) - else: - manifest = write_us_stage_run_manifests_from_artifact_manifest( - output_dir, - manifest, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - except Exception as exc: - if stage_runtime_writer is not None: - stage_runtime_writer.fail_stage("09_validation_benchmarking", exc) - raise - assert_valid_benchmark_artifact_manifest( - manifest, - artifact_dir=output_dir, - manifest_path=manifest_path, - summary_section=( - 
"policyengine_harness" if harness_summary is not None else None - ), - required_artifact_keys=( - "scaffold_seed_data", - "seed_data", - "synthetic_data", - "calibrated_data", - "targets", - "source_weight_diagnostics", - *( - ("policyengine_native_scores",) - if native_scores_payload is not None - else () - ), - ), - required_summary_keys=( - ( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - ) - if harness_summary is not None - else () - ), - ) - - return USMicroplexArtifactPaths( - output_dir=output_dir, - version_id=output_dir.name, - seed_data=seed_data_path, - synthetic_data=synthetic_data_path, - calibrated_data=calibrated_data_path, - targets=targets_path, - manifest=manifest_path, - scaffold_seed_data=scaffold_seed_data_path, - synthesizer=synthesizer_path, - policyengine_dataset=policyengine_dataset_path, - data_flow_snapshot=data_flow_snapshot_path, - stage_manifest=stage_manifest_path, - artifact_inventory=artifact_inventory_path, - conditional_readiness=conditional_readiness_path, - source_plan=source_plan_path, - pre_calibration_policyengine_entity_tables=( - pre_calibration_policyengine_entity_tables_path - ), - policyengine_entity_tables=policyengine_entity_tables_path, - calibration_summary=calibration_summary_path, - validation_evidence=validation_evidence_path, - policyengine_harness=policyengine_harness_path, - policyengine_native_scores=policyengine_native_scores_path, - policyengine_native_audit=None, - policyengine_native_target_diagnostics=None, - child_tax_unit_agi_drift=child_tax_unit_agi_drift_path, - capital_gains_lots=capital_gains_lots_path, - source_weight_diagnostics=source_weight_diagnostics_path, - run_registry=resolved_run_registry_path, - run_index_db=resolved_run_index_path, - ) diff --git a/src/microplex_us/pipelines/backfill_pe_native_audit.py b/src/microplex_us/pipelines/backfill_pe_native_audit.py deleted file mode 100644 index d0673a9e..00000000 --- 
a/src/microplex_us/pipelines/backfill_pe_native_audit.py +++ /dev/null @@ -1,392 +0,0 @@ -"""Backfill PE rebuild native-audit sidecars for historical US artifact bundles.""" - -from __future__ import annotations - -import argparse -import json -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path - -from microplex.targets import assert_valid_benchmark_artifact_manifest - -from microplex_us.pipelines.backfill_pe_native_scores import ( - discover_us_candidate_artifact_dirs, -) -from microplex_us.pipelines.pe_native_scores import ( - build_us_pe_native_target_diagnostics_payload, - compute_batch_us_pe_native_support_audits, - compute_batch_us_pe_native_target_deltas, -) -from microplex_us.pipelines.pe_us_data_rebuild_audit import ( - build_policyengine_us_data_rebuild_native_audit, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( - _refresh_checkpoint_data_flow_snapshot, -) -from microplex_us.pipelines.stage_contracts import ( - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) - - -def backfill_us_pe_native_audit_bundle( - artifact_dir: str | Path, - *, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> Path: - """Backfill PE rebuild native-audit sidecar + manifest summary for one bundle.""" - - bundle_dir = Path(artifact_dir) - manifest_path = bundle_dir / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - if not dataset_name: - raise ValueError(f"{bundle_dir} does not declare a policyengine_dataset artifact") - - native_scores_path = _resolve_required_native_scores_path(bundle_dir, artifacts) - native_scores_payload = json.loads(native_scores_path.read_text()) - native_audit_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - 
"policyengine_native_audit", - ) - if native_audit_path.exists() and not force: - payload = json.loads(native_audit_path.read_text()) - else: - payload = build_policyengine_us_data_rebuild_native_audit( - bundle_dir, - manifest_payload=manifest, - native_scores_payload=native_scores_payload, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - return _write_native_audit_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - manifest=manifest, - payload=payload, - ) - - -def backfill_us_pe_native_audit_bundles( - artifact_dirs: list[str | Path] | tuple[str | Path, ...], - *, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> list[Path]: - """Backfill PE rebuild native audits for a batch of saved bundles.""" - - if not artifact_dirs: - return [] - - manifest_paths: list[Path] = [] - grouped_pending: dict[ - tuple[Path, int], - list[tuple[Path, Path, dict, dict, Path]], - ] = {} - - for artifact_dir in artifact_dirs: - bundle_dir = Path(artifact_dir) - manifest_path = bundle_dir / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - if not dataset_name: - raise ValueError( - f"{bundle_dir} does not declare a policyengine_dataset artifact" - ) - - native_scores_path = _resolve_optional_native_scores_path(bundle_dir, artifacts) - if native_scores_path is None: - continue - manifest_paths.append(manifest_path) - native_scores_payload = json.loads(native_scores_path.read_text()) - native_audit_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_audit", - ) - if native_audit_path.exists() and not force: - _write_native_audit_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - 
manifest=manifest, - payload=json.loads(native_audit_path.read_text()), - ) - continue - - period = int( - native_scores_payload.get("period") - or manifest.get("config", {}).get("policyengine_dataset_year", 2024) - ) - baseline_dataset = _resolve_baseline_dataset(manifest) - grouped_pending.setdefault((baseline_dataset, period), []).append( - ( - bundle_dir, - manifest_path, - manifest, - native_scores_payload, - bundle_dir / str(dataset_name), - ) - ) - - for (baseline_dataset, period), rows in grouped_pending.items(): - candidate_dataset_paths = [candidate_path for *_rest, candidate_path in rows] - with ThreadPoolExecutor(max_workers=2) as executor: - target_future = executor.submit( - compute_batch_us_pe_native_target_deltas, - candidate_dataset_paths=candidate_dataset_paths, - baseline_dataset_path=baseline_dataset, - period=period, - top_k=15, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - support_future = executor.submit( - compute_batch_us_pe_native_support_audits, - candidate_dataset_paths=candidate_dataset_paths, - baseline_dataset_path=baseline_dataset, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - target_delta_payloads = target_future.result() - support_audit_payloads = support_future.result() - - target_payload_by_candidate = { - str(Path(payload["to_dataset"]).expanduser().resolve()): payload - for payload in target_delta_payloads - } - support_payload_by_candidate = { - str(Path(payload["candidate_dataset"]).expanduser().resolve()): payload - for payload in support_audit_payloads - } - - if len(target_payload_by_candidate) != len(rows): - raise ValueError( - "PE-native batch target-delta backfill returned a different number " - "of payloads than bundles" - ) - if len(support_payload_by_candidate) != len(rows): - raise ValueError( - "PE-native batch support-audit backfill returned a 
different number " - "of payloads than bundles" - ) - - for ( - bundle_dir, - manifest_path, - manifest, - native_scores_payload, - candidate_dataset_path, - ) in rows: - candidate_key = str(candidate_dataset_path.expanduser().resolve()) - target_delta_payload = target_payload_by_candidate.get(candidate_key) - support_audit_payload = support_payload_by_candidate.get(candidate_key) - if target_delta_payload is None or support_audit_payload is None: - raise ValueError( - "PE-native batch audit backfill did not return payloads for " - f"{candidate_dataset_path}" - ) - payload = build_policyengine_us_data_rebuild_native_audit( - bundle_dir, - manifest_payload=manifest, - native_scores_payload=native_scores_payload, - target_delta_payload=target_delta_payload, - support_audit_payload=support_audit_payload, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - _write_native_audit_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - manifest=manifest, - payload=payload, - ) - - return manifest_paths - - -def backfill_us_pe_native_audit_root( - artifact_root: str | Path, - *, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> list[Path]: - """Backfill every eligible artifact bundle under one saved-output root.""" - - return backfill_us_pe_native_audit_bundles( - discover_us_candidate_artifact_dirs(artifact_root), - force=force, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - - -def _write_native_audit_payload_to_bundle( - *, - bundle_dir: Path, - manifest_path: Path, - manifest: dict, - payload: dict, -) -> Path: - native_audit_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_audit", - ) - native_audit_path.write_text(json.dumps(payload, indent=2, 
sort_keys=True)) - - artifacts = dict(manifest.get("artifacts", {})) - artifacts["policyengine_native_audit"] = str( - native_audit_path.relative_to(bundle_dir) - ) - extra_outputs = [native_audit_path.name] - target_delta_payload = payload.get("targetDelta") - if isinstance(target_delta_payload, dict): - target_diagnostics = build_us_pe_native_target_diagnostics_payload( - period=int(payload.get("period") or 2024), - from_label="policyengine-us-data", - to_label="microplex-us", - policyengine_targets_db_path=dict(manifest.get("config", {})).get( - "policyengine_targets_db" - ), - target_delta_payload=target_delta_payload, - artifact_id=str(payload.get("artifactId") or bundle_dir.name), - run_id=str(payload.get("artifactId") or bundle_dir.name), - ) - target_diagnostics_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_target_diagnostics", - ) - target_diagnostics_path.write_text( - json.dumps(target_diagnostics, indent=2, sort_keys=True) - ) - artifacts["policyengine_native_target_diagnostics"] = str( - target_diagnostics_path.relative_to(bundle_dir) - ) - extra_outputs.append(target_diagnostics_path.name) - manifest["artifacts"] = artifacts - manifest["policyengine_native_audit"] = dict(payload.get("verdictHints", {})) - - _refresh_checkpoint_data_flow_snapshot( - bundle_dir, - manifest, - extra_outputs=tuple(extra_outputs), - ) - assert_valid_benchmark_artifact_manifest( - manifest, - artifact_dir=bundle_dir, - manifest_path=manifest_path, - summary_section=( - "policyengine_harness" - if manifest.get("policyengine_harness") is not None - else None - ), - required_artifact_keys=( - "policyengine_dataset", - "policyengine_native_scores", - "policyengine_native_audit", - ), - required_summary_keys=( - ( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - ) - if manifest.get("policyengine_harness") is not None - else () - ), - ) - 
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - return manifest_path - - -def _resolve_required_native_scores_path( - bundle_dir: Path, - artifacts: dict, -) -> Path: - path = _resolve_optional_native_scores_path(bundle_dir, artifacts) - if path is not None: - return path - raise ValueError( - f"{bundle_dir} is missing policyengine_native_scores.json; backfill native scores first" - ) - - -def _resolve_optional_native_scores_path( - bundle_dir: Path, - artifacts: dict, -) -> Path | None: - artifact_name = ( - artifacts.get("policyengine_native_scores") - or get_us_stage_artifact_contract( - "09_validation_benchmarking", - "policyengine_native_scores", - ).path_hint - ) - if artifact_name is None: - return None - path = bundle_dir / str(artifact_name) - if path.exists(): - return path - return None - - -def _resolve_baseline_dataset(manifest: dict) -> Path: - config = dict(manifest.get("config", {})) - configured = config.get("policyengine_baseline_dataset") - if not configured: - raise ValueError("Manifest does not include policyengine_baseline_dataset") - return Path(str(configured)).expanduser().resolve() - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for historical PE rebuild native-audit backfill.""" - - parser = argparse.ArgumentParser( - description="Backfill PE rebuild native-audit sidecars for US artifact bundles.", - ) - parser.add_argument( - "artifact_root", - nargs="?", - default="artifacts", - help="Artifact output root to scan (defaults to ./artifacts).", - ) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument( - "--force", - action="store_true", - help="Recompute native audits even if a sidecar already exists.", - ) - args = parser.parse_args(argv) - - manifest_paths = backfill_us_pe_native_audit_root( - args.artifact_root, - force=args.force, - policyengine_us_data_repo=args.policyengine_us_data_repo, - 
policyengine_us_data_python=args.policyengine_us_data_python, - ) - print( - json.dumps( - { - "artifact_root": str(Path(args.artifact_root).resolve()), - "backfilled_count": len(manifest_paths), - "manifest_paths": [str(path) for path in manifest_paths], - }, - indent=2, - sort_keys=True, - ) - ) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/backfill_pe_native_scores.py b/src/microplex_us/pipelines/backfill_pe_native_scores.py deleted file mode 100644 index 0e93030b..00000000 --- a/src/microplex_us/pipelines/backfill_pe_native_scores.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Backfill PE-native broad-loss scores for historical US artifact bundles.""" - -from __future__ import annotations - -import argparse -import json -from pathlib import Path - -from microplex.targets import assert_valid_benchmark_artifact_manifest - -from microplex_us.pipelines.index_db import rebuild_us_microplex_run_index -from microplex_us.pipelines.pe_native_scores import ( - compute_batch_us_pe_native_scores, - compute_us_pe_native_scores, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( - _refresh_checkpoint_data_flow_snapshot, -) -from microplex_us.pipelines.registry import ( - append_us_microplex_run_registry_entry, - build_us_microplex_run_registry_entry, -) -from microplex_us.pipelines.stage_contracts import ( - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) - - -def discover_us_candidate_artifact_dirs(artifact_root: str | Path) -> tuple[Path, ...]: - """Return saved US artifact bundle directories with a PE dataset and manifest.""" - - root = Path(artifact_root) - dataset_hint = get_us_stage_artifact_contract( - "08_dataset_assembly", - "policyengine_dataset", - ).path_hint - if dataset_hint is None: - raise RuntimeError("Stage 8 policyengine_dataset artifact has no path hint") - return tuple( - sorted( - path.parent - for path in root.rglob(dataset_hint) - if (path.parent 
/ "manifest.json").exists() - ) - ) - - -def backfill_us_pe_native_scores_bundle( - artifact_dir: str | Path, - *, - baseline_dataset: str | Path | None = None, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> Path: - """Backfill PE-native broad-loss sidecar + manifest summary for one bundle.""" - - bundle_dir = Path(artifact_dir) - manifest_path = bundle_dir / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - if not dataset_name: - raise ValueError(f"{bundle_dir} does not declare a policyengine_dataset artifact") - - native_sidecar_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - if native_sidecar_path.exists() and not force: - payload = json.loads(native_sidecar_path.read_text()) - else: - resolved_baseline = _resolve_baseline_dataset( - manifest, - baseline_dataset=baseline_dataset, - ) - payload = compute_us_pe_native_scores( - candidate_dataset_path=bundle_dir / dataset_name, - baseline_dataset_path=resolved_baseline, - period=int(manifest.get("config", {}).get("policyengine_dataset_year", 2024)), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - return _write_native_scores_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - manifest=manifest, - payload=payload, - ) - - -def backfill_us_pe_native_scores_bundles( - artifact_dirs: list[str | Path] | tuple[str | Path, ...], - *, - baseline_dataset: str | Path | None = None, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - rebuild_registry: bool = True, -) -> list[Path]: - """Backfill PE-native scores for a batch of saved bundles with 
grouped batch scoring.""" - - if not artifact_dirs: - return [] - - manifest_paths: list[Path] = [] - grouped_pending: dict[Path, list[tuple[Path, Path, dict[str, object], Path]]] = {} - - for artifact_dir in artifact_dirs: - bundle_dir = Path(artifact_dir) - manifest_path = bundle_dir / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - manifest_paths.append(manifest_path) - - native_sidecar_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - if native_sidecar_path.exists() and not force: - _write_native_scores_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - manifest=manifest, - payload=json.loads(native_sidecar_path.read_text()), - ) - continue - - dataset_name = dict(manifest.get("artifacts", {})).get("policyengine_dataset") - if not dataset_name: - raise ValueError( - f"{bundle_dir} does not declare a policyengine_dataset artifact" - ) - resolved_baseline = _resolve_baseline_dataset( - manifest, - baseline_dataset=baseline_dataset, - ) - grouped_pending.setdefault(resolved_baseline, []).append( - (bundle_dir, manifest_path, manifest, bundle_dir / dataset_name) - ) - - for resolved_baseline, rows in grouped_pending.items(): - payloads = compute_batch_us_pe_native_scores( - candidate_dataset_paths=[candidate_path for *_rest, candidate_path in rows], - baseline_dataset_path=resolved_baseline, - period=int(rows[0][2].get("config", {}).get("policyengine_dataset_year", 2024)), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - if len(payloads) != len(rows): - raise ValueError( - "PE-native batch backfill returned a different number of payloads than bundles" - ) - for (bundle_dir, manifest_path, manifest, _candidate_path), payload in zip( - rows, - payloads, - strict=True, - ): - _write_native_scores_payload_to_bundle( - bundle_dir=bundle_dir, - manifest_path=manifest_path, - 
manifest=manifest, - payload=payload, - ) - - if rebuild_registry and manifest_paths: - manifest_groups: dict[Path, list[Path]] = {} - for manifest_path in manifest_paths: - manifest_groups.setdefault(manifest_path.parent.parent, []).append(manifest_path) - for artifact_root, root_manifest_paths in manifest_groups.items(): - rebuild_us_pe_native_run_registry( - artifact_root, - manifest_paths=root_manifest_paths, - ) - return manifest_paths - - -def backfill_us_pe_native_scores_root( - artifact_root: str | Path, - *, - baseline_dataset: str | Path | None = None, - force: bool = False, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - rebuild_registry: bool = True, -) -> list[Path]: - """Backfill every artifact bundle under one saved-output root.""" - - manifest_paths: list[Path] = [] - for artifact_dir in discover_us_candidate_artifact_dirs(artifact_root): - manifest_paths.append( - backfill_us_pe_native_scores_bundle( - artifact_dir, - baseline_dataset=baseline_dataset, - force=force, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - if rebuild_registry and manifest_paths: - rebuild_us_pe_native_run_registry(artifact_root, manifest_paths=manifest_paths) - return manifest_paths - - -def _write_native_scores_payload_to_bundle( - *, - bundle_dir: Path, - manifest_path: Path, - manifest: dict, - payload: dict, -) -> Path: - native_sidecar_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - native_sidecar_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - - artifacts = dict(manifest.get("artifacts", {})) - artifacts["policyengine_native_scores"] = str( - native_sidecar_path.relative_to(bundle_dir) - ) - manifest["artifacts"] = artifacts - manifest["policyengine_native_scores"] = dict(payload.get("summary", {})) - if "run_registry" in 
manifest: - manifest["run_registry"]["default_frontier_metric"] = ( - "enhanced_cps_native_loss_delta" - ) - - _refresh_checkpoint_data_flow_snapshot(bundle_dir, manifest) - assert_valid_benchmark_artifact_manifest( - manifest, - artifact_dir=bundle_dir, - manifest_path=manifest_path, - summary_section=( - "policyengine_harness" - if manifest.get("policyengine_harness") is not None - else None - ), - required_artifact_keys=("policyengine_dataset", "policyengine_native_scores"), - required_summary_keys=( - ( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - ) - if manifest.get("policyengine_harness") is not None - else () - ), - ) - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - return manifest_path - - -def rebuild_us_pe_native_run_registry( - artifact_root: str | Path, - *, - manifest_paths: list[Path] | tuple[Path, ...] | None = None, -) -> Path: - """Rebuild one run_registry.jsonl from saved manifests under one artifact root.""" - - root = Path(artifact_root) - registry_path = root / "run_registry.jsonl" - if registry_path.exists(): - registry_path.unlink() - - manifests = ( - list(manifest_paths) - if manifest_paths is not None - else [path for path in root.rglob("manifest.json")] - ) - manifest_rows: list[tuple[str, Path, dict]] = [] - for manifest_path in manifests: - manifest = json.loads(manifest_path.read_text()) - if ( - manifest.get("policyengine_harness") is None - and manifest.get("policyengine_native_scores") is None - ): - continue - manifest_rows.append( - ( - str(manifest.get("created_at", "")), - manifest_path, - manifest, - ) - ) - - recorded_entries = [] - for _, manifest_path, manifest in sorted(manifest_rows, key=lambda item: item[0]): - bundle_dir = manifest_path.parent - harness_path = _resolve_optional_artifact_path( - bundle_dir, - manifest.get("artifacts", {}).get("policyengine_harness"), - ) - harness_payload = ( - 
json.loads(harness_path.read_text()) if harness_path is not None else None - ) - recorded = append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=bundle_dir, - manifest_path=manifest_path, - manifest=manifest, - policyengine_harness_path=harness_path, - policyengine_harness_payload=harness_payload, - ), - ) - recorded_entries.append((manifest_path, manifest, recorded)) - - index_path = ( - rebuild_us_microplex_run_index(root, registry_path=registry_path) - if recorded_entries - else root / "run_index.duckdb" - ) - - for manifest_path, manifest, recorded in recorded_entries: - manifest["run_registry"] = { - "path": str(registry_path), - "artifact_id": recorded.artifact_id, - "improved_candidate_frontier": recorded.improved_candidate_frontier, - "improved_delta_frontier": recorded.improved_delta_frontier, - "improved_composite_frontier": recorded.improved_composite_frontier, - "improved_native_frontier": recorded.improved_native_frontier, - "default_frontier_metric": ( - "enhanced_cps_native_loss_delta" - if manifest.get("policyengine_native_scores") is not None - else "candidate_composite_parity_loss" - ), - } - manifest["run_index"] = { - "path": str(index_path), - "artifact_id": recorded.artifact_id, - } - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - return registry_path - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for historical PE-native score backfill.""" - - parser = argparse.ArgumentParser( - description="Backfill PE-native broad-loss scores for US artifact bundles." 
- ) - parser.add_argument( - "artifact_root", - nargs="?", - default="artifacts", - help="Artifact output root to scan (defaults to ./artifacts).", - ) - parser.add_argument("--baseline-dataset") - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument( - "--force", - action="store_true", - help="Recompute native scores even if a sidecar already exists.", - ) - parser.add_argument( - "--skip-registry-rebuild", - action="store_true", - help="Do not rebuild run_registry.jsonl / run_index.duckdb after backfill.", - ) - args = parser.parse_args(argv) - - manifest_paths = backfill_us_pe_native_scores_root( - args.artifact_root, - baseline_dataset=args.baseline_dataset, - force=args.force, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - rebuild_registry=not args.skip_registry_rebuild, - ) - print( - json.dumps( - { - "artifact_root": str(Path(args.artifact_root).resolve()), - "backfilled_count": len(manifest_paths), - "manifest_paths": [str(path) for path in manifest_paths], - }, - indent=2, - sort_keys=True, - ) - ) - return 0 - - -def _resolve_baseline_dataset( - manifest: dict, - *, - baseline_dataset: str | Path | None = None, -) -> Path: - if baseline_dataset is not None: - return Path(baseline_dataset).expanduser().resolve() - config = dict(manifest.get("config", {})) - configured = config.get("policyengine_baseline_dataset") - if not configured: - raise ValueError("Manifest does not include policyengine_baseline_dataset") - return Path(configured).expanduser().resolve() - - -def _resolve_optional_artifact_path( - bundle_dir: Path, - artifact_name: str | None, -) -> Path | None: - if not artifact_name: - return None - path = bundle_dir / artifact_name - if not path.exists(): - return None - return path - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git 
a/src/microplex_us/pipelines/calibration_stage_parity.py b/src/microplex_us/pipelines/calibration_stage_parity.py deleted file mode 100644 index 57d62cb2..00000000 --- a/src/microplex_us/pipelines/calibration_stage_parity.py +++ /dev/null @@ -1,284 +0,0 @@ -"""Audit synthetic vs calibrated stage outputs, with optional PE reference context.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd - -from microplex_us.pipelines.pre_sim_parity import ( - DEFAULT_PRE_SIM_FOCUS_VARIABLES, - PreSimParityVariableSpec, -) -from microplex_us.pipelines.source_stage_parity import ( - _compare_series, - _resolve_bundle_variable, - _summarize_series, -) -from microplex_us.policyengine.us import load_policyengine_us_entity_tables - - -def build_us_calibration_stage_parity_audit( - synthetic_data: str | Path, - calibrated_data: str | Path, - *, - reference_dataset: str | Path | None = None, - period: int = 2024, - focus_variables: tuple[PreSimParityVariableSpec | str, ...] 
- | list[PreSimParityVariableSpec | str] = DEFAULT_PRE_SIM_FOCUS_VARIABLES, -) -> dict[str, Any]: - """Compare synthetic vs calibrated stage rows, with optional PE reference.""" - - synthetic_path = Path(synthetic_data).resolve() - calibrated_path = Path(calibrated_data).resolve() - synthetic_rows = pd.read_parquet(synthetic_path) - calibrated_rows = pd.read_parquet(calibrated_path) - focus_specs = _normalize_focus_variable_specs(focus_variables) - reference_bundle = ( - load_policyengine_us_entity_tables(Path(reference_dataset).resolve(), period=period) - if reference_dataset is not None - else None - ) - - return { - "schemaVersion": 1, - "comparisonStage": "calibration", - "period": int(period), - "synthetic_data": str(synthetic_path), - "calibrated_data": str(calibrated_path), - "reference_dataset": str(Path(reference_dataset).resolve()) - if reference_dataset is not None - else None, - "rowStructure": { - "synthetic": _row_structure_summary(synthetic_rows), - "calibrated": _row_structure_summary(calibrated_rows), - }, - "weightDiagnostics": { - "synthetic": _household_weight_diagnostics(synthetic_rows), - "calibrated": _household_weight_diagnostics(calibrated_rows), - }, - "focusVariables": { - spec.label: _calibration_variable_comparison( - synthetic_rows=synthetic_rows, - calibrated_rows=calibrated_rows, - reference_bundle=reference_bundle, - spec=spec, - ) - for spec in focus_specs - }, - } - - -def write_us_calibration_stage_parity_audit( - synthetic_data: str | Path, - calibrated_data: str | Path, - output_path: str | Path, - *, - reference_dataset: str | Path | None = None, - period: int = 2024, - focus_variables: tuple[PreSimParityVariableSpec | str, ...] 
- | list[PreSimParityVariableSpec | str] = DEFAULT_PRE_SIM_FOCUS_VARIABLES, -) -> Path: - """Persist one calibration-stage parity audit as JSON.""" - - output = Path(output_path).resolve() - payload = build_us_calibration_stage_parity_audit( - synthetic_data, - calibrated_data, - reference_dataset=reference_dataset, - period=period, - focus_variables=focus_variables, - ) - output.parent.mkdir(parents=True, exist_ok=True) - output.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return output - - -def _calibration_variable_comparison( - *, - synthetic_rows: pd.DataFrame, - calibrated_rows: pd.DataFrame, - reference_bundle, - spec: PreSimParityVariableSpec, -) -> dict[str, Any]: - synthetic_weights = _stage_weight_series(synthetic_rows) - calibrated_weights = _stage_weight_series(calibrated_rows) - - result: dict[str, Any] = { - "candidate_variable": spec.candidate_variable, - "reference_variable": spec.resolved_reference_variable, - "synthetic_present": spec.candidate_variable in synthetic_rows.columns, - "calibrated_present": spec.candidate_variable in calibrated_rows.columns, - "reference_present": False, - } - - if spec.candidate_variable in synthetic_rows.columns: - result["synthetic"] = _summarize_series( - synthetic_rows[spec.candidate_variable], - weights=synthetic_weights, - value_kind=spec.value_kind, - ) - if spec.candidate_variable in calibrated_rows.columns: - result["calibrated"] = _summarize_series( - calibrated_rows[spec.candidate_variable], - weights=calibrated_weights, - value_kind=spec.value_kind, - ) - if ( - spec.candidate_variable in synthetic_rows.columns - and spec.candidate_variable in calibrated_rows.columns - ): - result["calibrated_vs_synthetic"] = _compare_series( - calibrated_rows[spec.candidate_variable], - synthetic_rows[spec.candidate_variable], - candidate_weights=calibrated_weights, - reference_weights=synthetic_weights, - value_kind=spec.value_kind, - ) - - if reference_bundle is not None: - reference_entry = 
_resolve_bundle_variable( - reference_bundle, - spec.resolved_reference_variable, - ) - if reference_entry is not None: - result["reference_present"] = True - result["reference_entity"] = reference_entry["entity"].value - result["reference"] = _summarize_series( - reference_entry["series"], - weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - if spec.candidate_variable in calibrated_rows.columns: - result["calibrated_vs_reference"] = _compare_series( - calibrated_rows[spec.candidate_variable], - reference_entry["series"], - candidate_weights=calibrated_weights, - reference_weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - if spec.candidate_variable in synthetic_rows.columns: - result["synthetic_vs_reference"] = _compare_series( - synthetic_rows[spec.candidate_variable], - reference_entry["series"], - candidate_weights=synthetic_weights, - reference_weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - - return result - - -def _row_structure_summary(rows: pd.DataFrame) -> dict[str, Any]: - summary: dict[str, Any] = {"row_count": int(len(rows))} - if "household_id" in rows.columns: - household_ids = pd.to_numeric(rows["household_id"], errors="coerce") - summary["household_count"] = int(household_ids.nunique(dropna=True)) - if summary["household_count"] > 0: - rows_per_household = rows.groupby("household_id", observed=True).size() - summary["mean_rows_per_household"] = float(rows_per_household.mean()) - return summary - - -def _household_weight_diagnostics(rows: pd.DataFrame) -> dict[str, Any]: - if "household_id" not in rows.columns: - weights = _stage_weight_series(rows) - return _weight_summary(weights) - households = ( - rows.loc[:, ["household_id", _stage_weight_column(rows)]] - .dropna(subset=["household_id"]) - .drop_duplicates(subset=["household_id"]) - ) - weights = pd.to_numeric( - households[_stage_weight_column(rows)], - errors="coerce", - ).fillna(0.0) - summary = _weight_summary(weights) - 
summary["household_count"] = int(len(households)) - return summary - - -def _weight_summary(weights: pd.Series) -> dict[str, Any]: - values = pd.to_numeric(weights, errors="coerce").fillna(0.0).astype(float) - if values.empty: - return { - "total_weight": 0.0, - "mean_weight": 0.0, - "p50_weight": 0.0, - "p90_weight": 0.0, - "p99_weight": 0.0, - "max_weight": 0.0, - "effective_sample_size": 0.0, - } - total_weight = float(values.sum()) - ess = 0.0 - denom = float(np.square(values.to_numpy(dtype=float)).sum()) - if denom > 0.0: - ess = float((total_weight**2) / denom) - return { - "total_weight": total_weight, - "mean_weight": float(values.mean()), - "p50_weight": float(values.quantile(0.5)), - "p90_weight": float(values.quantile(0.9)), - "p99_weight": float(values.quantile(0.99)), - "max_weight": float(values.max()), - "effective_sample_size": ess, - } - - -def _stage_weight_column(rows: pd.DataFrame) -> str: - for candidate in ("weight", "household_weight"): - if candidate in rows.columns: - return candidate - raise ValueError("Stage rows must contain either 'weight' or 'household_weight'") - - -def _stage_weight_series(rows: pd.DataFrame) -> pd.Series: - return pd.to_numeric(rows[_stage_weight_column(rows)], errors="coerce").fillna(0.0) - - -def _normalize_focus_variable_specs( - focus_variables: tuple[PreSimParityVariableSpec | str, ...] 
- | list[PreSimParityVariableSpec | str], -) -> tuple[PreSimParityVariableSpec, ...]: - specs: list[PreSimParityVariableSpec] = [] - seen_labels: set[str] = set() - for variable in focus_variables: - spec = ( - variable - if isinstance(variable, PreSimParityVariableSpec) - else PreSimParityVariableSpec(str(variable), str(variable)) - ) - if spec.label in seen_labels: - continue - seen_labels.add(spec.label) - specs.append(spec) - return tuple(specs) - - -def main() -> None: - import argparse - - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("synthetic_data") - parser.add_argument("calibrated_data") - parser.add_argument("output_path") - parser.add_argument("--reference-dataset") - parser.add_argument("--period", type=int, default=2024) - args = parser.parse_args() - - output = write_us_calibration_stage_parity_audit( - args.synthetic_data, - args.calibrated_data, - args.output_path, - reference_dataset=args.reference_dataset, - period=args.period, - ) - print(output) - - -if __name__ == "__main__": - main() diff --git a/src/microplex_us/pipelines/cd_age_reweighting.py b/src/microplex_us/pipelines/cd_age_reweighting.py deleted file mode 100644 index 45c9681b..00000000 --- a/src/microplex_us/pipelines/cd_age_reweighting.py +++ /dev/null @@ -1,569 +0,0 @@ -"""Reweight PE-US H5 datasets to congressional-district age targets.""" - -from __future__ import annotations - -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import h5py -import numpy as np -import pandas as pd -from scipy.optimize import minimize - -from microplex_us.pipelines.pe_native_optimization import ( - rewrite_policyengine_us_dataset_weights, -) -from microplex_us.policyengine import PolicyEngineUSDBTargetProvider -from microplex_us.policyengine.us import PolicyEngineUSConstraint - - -@dataclass(frozen=True) -class CDAgeTarget: - """One congressional-district person-count-by-age target.""" - - target_id: 
int - district_geoid: int - value: float - age_constraints: tuple[PolicyEngineUSConstraint, ...] - period: int - - @property - def age_key(self) -> tuple[tuple[str, str], ...]: - return tuple( - sorted((constraint.operation, str(constraint.value)) for constraint in self.age_constraints) - ) - - -def normalize_at_large_cd_geoids(values: np.ndarray) -> np.ndarray: - """Normalize statewide at-large districts from ``xx00`` to PE target ``xx01``.""" - result = np.asarray(values).copy() - finite = np.isfinite(result.astype(float, copy=False)) - as_int = result.astype(np.int64, copy=False) - at_large = finite & (as_int > 0) & (as_int % 100 == 0) - result[at_large] = as_int[at_large] + 1 - return result.astype(np.int64, copy=False) - - -def load_cd_age_targets( - target_db: str | Path, - *, - period: int = 2024, -) -> list[CDAgeTarget]: - """Load active district person-count-by-age targets from PE's target DB.""" - provider = PolicyEngineUSDBTargetProvider(target_db) - raw_targets = provider.load_targets( - period=period, - variables=["person_count"], - domain_variables=["age"], - geo_levels=["district"], - active_only=True, - ) - targets: list[CDAgeTarget] = [] - for target in raw_targets: - district_constraints = [ - constraint - for constraint in target.constraints - if constraint.variable == "congressional_district_geoid" - ] - age_constraints = tuple( - constraint for constraint in target.constraints if constraint.variable == "age" - ) - if len(district_constraints) != 1 or not age_constraints: - continue - targets.append( - CDAgeTarget( - target_id=int(target.target_id), - district_geoid=int(district_constraints[0].value), - value=float(target.value), - age_constraints=age_constraints, - period=int(target.period), - ) - ) - targets.sort(key=lambda target: (target.district_geoid, target.age_key, target.target_id)) - return targets - - -def reweight_h5_to_cd_age_targets( - *, - input_dataset: str | Path, - target_db: str | Path, - output_dataset: str | Path, - period: 
int = 2024, - max_iter: int = 300, - tol: float = 1e-9, - preserve_district_weight_sum: bool = True, - details_output: str | Path | None = None, -) -> dict[str, Any]: - """Apply independent per-CD entropy reweighting for age-distribution targets.""" - period_key = str(period) - targets = load_cd_age_targets(target_db, period=period) - if not targets: - raise ValueError("No district person_count-by-age targets were loaded") - - with h5py.File(input_dataset, "r") as handle: - household_ids = np.asarray(handle["household_id"][period_key]) - input_weights = np.asarray(handle["household_weight"][period_key], dtype=np.float64) - household_cd = normalize_at_large_cd_geoids( - np.asarray(handle["congressional_district_geoid"][period_key]) - ) - person_household_id = np.asarray(handle["person_household_id"][period_key]) - age = np.asarray(handle["age"][period_key], dtype=np.float64) - - person_household_index = _map_person_households_to_indices( - household_ids, - person_household_id, - ) - unique_age_keys = sorted({target.age_key for target in targets}) - household_age_counts = _build_household_age_count_matrix( - n_households=len(household_ids), - person_household_index=person_household_index, - age=age, - age_keys=unique_age_keys, - ) - age_key_to_col = {age_key: index for index, age_key in enumerate(unique_age_keys)} - - output_weights = input_weights.copy() - detail_rows: list[dict[str, Any]] = [] - district_failures: list[dict[str, Any]] = [] - targets_by_district: dict[int, list[CDAgeTarget]] = {} - for target in targets: - targets_by_district.setdefault(target.district_geoid, []).append(target) - - for district_geoid, district_targets in sorted(targets_by_district.items()): - household_mask = household_cd == district_geoid - household_indices = np.flatnonzero(household_mask) - if len(household_indices) == 0: - district_failures.append( - { - "district_geoid": district_geoid, - "reason": "no_households", - "target_count": len(district_targets), - } - ) - 
_append_detail_rows( - detail_rows, - targets=district_targets, - age_key_to_col=age_key_to_col, - household_indices=household_indices, - household_age_counts=household_age_counts, - input_weights=input_weights, - output_weights=output_weights, - status="no_households", - ) - continue - - row_cols = [age_key_to_col[target.age_key] for target in district_targets] - design = household_age_counts[np.ix_(household_indices, row_cols)].T.astype( - np.float64, - copy=False, - ) - target_values = np.asarray([target.value for target in district_targets], dtype=np.float64) - base_weights = input_weights[household_indices] - fit_design = design - fit_targets = target_values - if preserve_district_weight_sum: - fit_design = np.vstack( - [ - design, - np.ones((1, design.shape[1]), dtype=np.float64), - ] - ) - fit_targets = np.concatenate( - [target_values, np.asarray([base_weights.sum()], dtype=np.float64)] - ) - solution = _solve_entropy_weights( - design=fit_design, - base_weights=base_weights, - targets=fit_targets, - max_iter=max_iter, - tol=tol, - ) - output_weights[household_indices] = solution["weights"] - if not solution["success"]: - district_failures.append( - { - "district_geoid": district_geoid, - "reason": solution["message"], - "target_count": len(district_targets), - "max_abs_relative_error": solution["max_abs_relative_error"], - } - ) - _append_detail_rows( - detail_rows, - targets=district_targets, - age_key_to_col=age_key_to_col, - household_indices=household_indices, - household_age_counts=household_age_counts, - input_weights=input_weights, - output_weights=output_weights, - status="ok" if solution["success"] else "not_converged", - ) - - output_path = rewrite_policyengine_us_dataset_weights( - input_dataset_path=input_dataset, - output_dataset_path=output_dataset, - household_weights=output_weights, - period=period, - ) - _normalize_cd_geoids_in_h5(output_path, period=period) - - detail_frame = pd.DataFrame(detail_rows) - if details_output is not None: - 
detail_path = Path(details_output).expanduser().resolve() - detail_path.parent.mkdir(parents=True, exist_ok=True) - detail_frame.to_csv(detail_path, index=False) - - summary = _summarize_detail_frame( - detail_frame, - input_weight_sum=float(input_weights.sum()), - output_weight_sum=float(output_weights.sum()), - n_households=len(input_weights), - n_persons=len(age), - n_age_bins=len(unique_age_keys), - district_failures=district_failures, - ) - summary["preserve_district_weight_sum"] = bool(preserve_district_weight_sum) - summary["input_dataset"] = str(Path(input_dataset).expanduser().resolve()) - summary["output_dataset"] = str(Path(output_path).expanduser().resolve()) - summary["target_db"] = str(Path(target_db).expanduser().resolve()) - summary["period"] = int(period) - return summary - - -def build_cd_age_constraint_matrix( - *, - input_dataset: str | Path, - target_db: str | Path, - period: int = 2024, - target_weight: float = 1.0, -) -> dict[str, Any]: - """Build scaled sparse rows for CD person-count-by-age targets. - - The returned matrix has shape ``(targets, households)`` and uses the same - ``((estimate - target + 1) / (target + 1)) ** 2`` row scaling convention as - the PE-native broad matrix. 
- """ - if target_weight <= 0: - raise ValueError("target_weight must be positive") - period_key = str(period) - targets = load_cd_age_targets(target_db, period=period) - if not targets: - raise ValueError("No district person_count-by-age targets were loaded") - - with h5py.File(input_dataset, "r") as handle: - household_ids = np.asarray(handle["household_id"][period_key]) - household_cd = normalize_at_large_cd_geoids( - np.asarray(handle["congressional_district_geoid"][period_key]) - ) - person_household_id = np.asarray(handle["person_household_id"][period_key]) - age = np.asarray(handle["age"][period_key], dtype=np.float64) - - person_household_index = _map_person_households_to_indices( - household_ids, - person_household_id, - ) - unique_age_keys = sorted({target.age_key for target in targets}) - household_age_counts = _build_household_age_count_matrix( - n_households=len(household_ids), - person_household_index=person_household_index, - age=age, - age_keys=unique_age_keys, - ) - age_key_to_col = {age_key: index for index, age_key in enumerate(unique_age_keys)} - - rows: list[np.ndarray] = [] - cols: list[np.ndarray] = [] - vals: list[np.ndarray] = [] - target_values = np.asarray([target.value for target in targets], dtype=np.float64) - scaling = np.sqrt(float(target_weight) / float(len(targets))) / ( - target_values + 1.0 - ) - target_names: list[str] = [] - for row_index, target in enumerate(targets): - count_col = age_key_to_col[target.age_key] - household_indices = np.flatnonzero(household_cd == target.district_geoid) - counts = household_age_counts[household_indices, count_col] - nonzero = counts != 0 - if nonzero.any(): - rows.append(np.full(int(nonzero.sum()), row_index, dtype=np.int32)) - cols.append(household_indices[nonzero].astype(np.int32)) - vals.append((counts[nonzero] * scaling[row_index]).astype(np.float32)) - target_names.append( - "district/census/person_count_by_age/" - f"{target.district_geoid}/{json.dumps(target.age_key, separators=(',', 
':'))}" - ) - - if rows: - import scipy.sparse as sp - - matrix = sp.csr_matrix( - ( - np.concatenate(vals), - (np.concatenate(rows), np.concatenate(cols)), - ), - shape=(len(targets), len(household_ids)), - dtype=np.float32, - ) - else: - import scipy.sparse as sp - - matrix = sp.csr_matrix((len(targets), len(household_ids)), dtype=np.float32) - - scaled_target = ((target_values - 1.0) * scaling).astype(np.float32) - return { - "matrix": matrix, - "target": scaled_target, - "metadata": { - "target_names": target_names, - "n_targets_total": int(len(targets)), - "n_targets_kept": int(len(targets)), - "n_districts": int(len({target.district_geoid for target in targets})), - "n_age_bins": int(len(unique_age_keys)), - "target_weight": float(target_weight), - "target_db": str(Path(target_db).expanduser().resolve()), - "family": "district_age_distribution", - }, - } - - -def _map_person_households_to_indices( - household_ids: np.ndarray, - person_household_ids: np.ndarray, -) -> np.ndarray: - household_index = {int(household_id): index for index, household_id in enumerate(household_ids)} - try: - return np.asarray( - [household_index[int(household_id)] for household_id in person_household_ids], - dtype=np.int64, - ) - except KeyError as exc: - raise ValueError(f"person_household_id references missing household_id {exc}") from exc - - -def _build_household_age_count_matrix( - *, - n_households: int, - person_household_index: np.ndarray, - age: np.ndarray, - age_keys: list[tuple[tuple[str, str], ...]], -) -> np.ndarray: - counts = np.zeros((n_households, len(age_keys)), dtype=np.float32) - for col, age_key in enumerate(age_keys): - mask = _evaluate_age_key(age, age_key) - np.add.at(counts[:, col], person_household_index[mask], 1.0) - return counts - - -def _evaluate_age_key( - age: np.ndarray, - age_key: tuple[tuple[str, str], ...], -) -> np.ndarray: - mask = np.ones(len(age), dtype=bool) - for operation, raw_value in age_key: - value = float(raw_value) - if operation == 
"==": - mask &= age == value - elif operation == "!=": - mask &= age != value - elif operation == ">": - mask &= age > value - elif operation == ">=": - mask &= age >= value - elif operation == "<": - mask &= age < value - elif operation == "<=": - mask &= age <= value - else: - raise ValueError(f"Unsupported age target operation: {operation!r}") - return mask - - -def _solve_entropy_weights( - *, - design: np.ndarray, - base_weights: np.ndarray, - targets: np.ndarray, - max_iter: int, - tol: float, -) -> dict[str, Any]: - support = design.sum(axis=1) > 0 - unsupported = (~support) & (np.abs(targets) > tol) - if unsupported.any(): - estimates = design @ base_weights - return { - "weights": base_weights.copy(), - "success": False, - "message": "unsupported_positive_targets", - "max_abs_relative_error": float( - _abs_relative_error(estimates, targets).max(initial=0.0) - ), - } - - def objective(lam: np.ndarray) -> tuple[float, np.ndarray]: - linear_predictor = np.clip(lam @ design, -50.0, 50.0) - weights = base_weights * np.exp(linear_predictor) - value = float(weights.sum() - np.dot(targets, lam)) - gradient = design @ weights - targets - return value, gradient - - result = minimize( - fun=lambda lam: objective(lam)[0], - x0=np.zeros(design.shape[0], dtype=np.float64), - jac=lambda lam: objective(lam)[1], - method="L-BFGS-B", - options={"maxiter": int(max_iter), "ftol": tol, "gtol": tol}, - ) - linear_predictor = np.clip(result.x @ design, -50.0, 50.0) - weights = base_weights * np.exp(linear_predictor) - estimates = design @ weights - max_error = float(_abs_relative_error(estimates, targets).max(initial=0.0)) - success = bool(result.success) or max_error <= max(1e-4, tol * 100) - return { - "weights": weights, - "success": success, - "message": str(result.message), - "max_abs_relative_error": max_error, - } - - -def _append_detail_rows( - rows: list[dict[str, Any]], - *, - targets: list[CDAgeTarget], - age_key_to_col: dict[tuple[tuple[str, str], ...], int], - 
household_indices: np.ndarray, - household_age_counts: np.ndarray, - input_weights: np.ndarray, - output_weights: np.ndarray, - status: str, -) -> None: - for target in targets: - col = age_key_to_col[target.age_key] - counts = household_age_counts[household_indices, col] - before = float(np.dot(counts, input_weights[household_indices])) - after = float(np.dot(counts, output_weights[household_indices])) - rows.append( - { - "target_id": target.target_id, - "district_geoid": target.district_geoid, - "age_key": json.dumps(target.age_key), - "target": target.value, - "estimate_before": before, - "estimate_after": after, - "relative_error_before": _relative_error(before, target.value), - "relative_error_after": _relative_error(after, target.value), - "abs_relative_error_before": abs(_relative_error(before, target.value)), - "abs_relative_error_after": abs(_relative_error(after, target.value)), - "period": target.period, - "status": status, - } - ) - - -def _relative_error(estimate: float, target: float) -> float: - if abs(target) <= 1e-12: - return 0.0 if abs(estimate) <= 1e-12 else float("inf") - return float((estimate - target) / abs(target)) - - -def _abs_relative_error(estimate: np.ndarray, target: np.ndarray) -> np.ndarray: - denominator = np.where(np.abs(target) <= 1e-12, 1.0, np.abs(target)) - return np.abs((estimate - target) / denominator) - - -def _summarize_detail_frame( - detail_frame: pd.DataFrame, - *, - input_weight_sum: float, - output_weight_sum: float, - n_households: int, - n_persons: int, - n_age_bins: int, - district_failures: list[dict[str, Any]], -) -> dict[str, Any]: - before = detail_frame["abs_relative_error_before"].to_numpy(dtype=np.float64) - after = detail_frame["abs_relative_error_after"].to_numpy(dtype=np.float64) - return { - "n_targets": int(len(detail_frame)), - "n_districts": int(detail_frame["district_geoid"].nunique()), - "n_households": int(n_households), - "n_persons": int(n_persons), - "n_age_bins": int(n_age_bins), - 
"input_weight_sum": float(input_weight_sum), - "output_weight_sum": float(output_weight_sum), - "weight_sum_relative_change": float( - (output_weight_sum - input_weight_sum) / input_weight_sum - ), - "mean_abs_relative_error_before": float(before.mean()), - "mean_abs_relative_error_after": float(after.mean()), - "median_abs_relative_error_before": float(np.median(before)), - "median_abs_relative_error_after": float(np.median(after)), - "p90_abs_relative_error_before": float(np.quantile(before, 0.9)), - "p90_abs_relative_error_after": float(np.quantile(after, 0.9)), - "p99_abs_relative_error_before": float(np.quantile(before, 0.99)), - "p99_abs_relative_error_after": float(np.quantile(after, 0.99)), - "max_abs_relative_error_before": float(before.max(initial=0.0)), - "max_abs_relative_error_after": float(after.max(initial=0.0)), - "failed_district_count": int(len(district_failures)), - "district_failures": district_failures, - } - - -def _normalize_cd_geoids_in_h5(path: str | Path, *, period: int) -> None: - period_key = str(period) - with h5py.File(path, "r+") as handle: - if "congressional_district_geoid" not in handle: - return - group = handle["congressional_district_geoid"] - if period_key not in group: - return - values = np.asarray(group[period_key]) - group[period_key][...] = normalize_at_large_cd_geoids(values).astype(values.dtype) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--input-dataset", required=True) - parser.add_argument("--target-db", required=True) - parser.add_argument("--output-dataset", required=True) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--max-iter", type=int, default=300) - parser.add_argument("--tol", type=float, default=1e-9) - parser.add_argument( - "--no-preserve-district-weight-sum", - dest="preserve_district_weight_sum", - action="store_false", - help=( - "Do not append a per-district household-weight preservation row. 
" - "The default preserves district household totals while fitting CD-age targets." - ), - ) - parser.set_defaults(preserve_district_weight_sum=True) - parser.add_argument("--summary-output") - parser.add_argument("--details-output") - args = parser.parse_args(argv) - - summary = reweight_h5_to_cd_age_targets( - input_dataset=args.input_dataset, - target_db=args.target_db, - output_dataset=args.output_dataset, - period=args.period, - max_iter=args.max_iter, - tol=args.tol, - preserve_district_weight_sum=args.preserve_district_weight_sum, - details_output=args.details_output, - ) - payload = json.dumps(summary, indent=2, sort_keys=True, allow_nan=False) - if args.summary_output: - summary_path = Path(args.summary_output).expanduser().resolve() - summary_path.parent.mkdir(parents=True, exist_ok=True) - summary_path.write_text(payload + "\n") - print(payload) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/check_export_columns.py b/src/microplex_us/pipelines/check_export_columns.py deleted file mode 100644 index 4e5e7859..00000000 --- a/src/microplex_us/pipelines/check_export_columns.py +++ /dev/null @@ -1,845 +0,0 @@ -"""Fast eCPS column-parity check for exported datasets. - -This is the cheap, millisecond gate that should pass *before* the -expensive MP-300k build. It compares the column set of a candidate export -against a frozen contract describing what the enhanced CPS (eCPS) baseline -exports, so column drift is catchable locally and in CI without producing -any data. - -The required/forbidden column diff here mirrors the one inside -``_column_contract_gate`` in ``mp300k_artifact_gates`` (``required -- present`` and ``forbidden & present``) -- but that gate only runs deep -in the slow artifact path. This module surfaces the same check as a -one-line local command and the first, cheap CI job. 
- -The contract (``ecps_export_contract.json``) defines three categories: - -- ``required`` -- columns MP must export to be a drop-in eCPS replacement. -- ``ecps_internal_optional`` -- eCPS clone-bookkeeping columns MP need not - export (neither required nor forbidden). -- ``forbidden`` -- transient takeup-input columns eCPS drops and MP must - not export. - -Heavy imports (``h5py``) are deferred so importing this module and running -the ``--columns-json`` path stay cheap. - -Usage:: - - python -m microplex_us.pipelines.check_export_columns export.h5 - python -m microplex_us.pipelines.check_export_columns \\ - --columns-json columns.json - python -m microplex_us.pipelines.check_export_columns \\ - --entity-tables checkpoints/post-imputation - python -m microplex_us.pipelines.check_export_columns export.h5 \\ - --contract custom_contract.json - -Exits 1 if any required column is missing or any forbidden column is -present; exits 0 otherwise. -""" - -from __future__ import annotations - -import argparse -import json -import re -import sys -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any - -# Path to the committed contract shipped alongside this module. 
-DEFAULT_CONTRACT_PATH = Path(__file__).with_name("ecps_export_contract.json") -DEFAULT_SPEC_PATH = Path(__file__).resolve().parents[1] / "specs" / "us-2024.yaml" - -SIGNED_NUMERIC_SUPPORT_COLUMNS = frozenset( - { - "farm_income", - "farm_operations_income", - "partnership_s_corp_income", - "rental_income", - "self_employment_income_before_lsr", - } -) - - -@dataclass -class ColumnDiff: - """Result of comparing a present column set against a contract.""" - - missing_required: list[str] - forbidden_present: list[str] - extra_unknown: list[str] - - @property - def ok(self) -> bool: - """True when no required column is missing and none forbidden.""" - return not self.missing_required and not self.forbidden_present - - -@dataclass -class ColumnSupportStats: - """Compact support/variation summary for one exported H5 column.""" - - column: str - kind: str - row_count: int - nonzero_count: int | None - positive_count: int | None - negative_count: int | None - unique_count: int - - -@dataclass -class ColumnSupportIssue: - """One eCPS-populated column missing equivalent MP support.""" - - column: str - requirement: str - baseline: ColumnSupportStats - candidate: ColumnSupportStats | None - - -@dataclass -class SupportDiff: - """Result of comparing candidate support against eCPS support.""" - - issues: list[ColumnSupportIssue] - checked_columns: list[str] - baseline_populated_columns: list[str] - baseline_filler_columns: list[str] - exempt_columns: list[str] - - @property - def ok(self) -> bool: - """True when every eCPS-populated column has candidate support.""" - return not self.issues - - -@dataclass -class SpecVariableManifestDiff: - """Result of checking ``spec.variables`` against the frozen contract.""" - - spec_path: str - required_contract_count: int - declared_imputation_count: int - variable_manifest_count: int - missing_required: list[str] - missing_declared_imputation: list[str] - extra_variables: list[str] - - @property - def ok(self) -> bool: - """True when the 
manifest exactly covers required and declared vars.""" - return not ( - self.missing_required - or self.missing_declared_imputation - or self.extra_variables - ) - - -def compute_column_diff( - present: set[str], - *, - required: set[str], - forbidden: set[str], - optional: frozenset[str] | set[str] = frozenset(), - excluded: frozenset[str] | set[str] = frozenset(), -) -> ColumnDiff: - """Compare a present column set against contract categories. - - Mirrors the required/forbidden diff in ``_column_contract_gate`` in - ``mp300k_artifact_gates`` (``required - present`` and ``forbidden & - present``). ``optional`` (clone-bookkeeping flags) and ``excluded`` - (formula-owned columns MP need not export) are recognized categories, so - they never appear in ``extra_unknown``. ``extra_unknown`` is informational - only: columns present that are in no known category. - """ - missing_required = required - present - forbidden_present = forbidden & present - known = required | forbidden | set(optional) | set(excluded) - extra_unknown = present - known - return ColumnDiff( - missing_required=sorted(missing_required), - forbidden_present=sorted(forbidden_present), - extra_unknown=sorted(extra_unknown), - ) - - -def compute_support_diff( - candidate_h5: Path, - *, - baseline_h5: Path, - period: int, - required_columns: set[str], - exempt_columns: frozenset[str] | set[str] = frozenset(), -) -> SupportDiff: - """Compare candidate support against eCPS support for required columns. - - Presence is not enough for release parity. If the pinned eCPS baseline - *populates* a required exported column, MP must populate it too: - - - numeric columns: eCPS has at least one nonzero value, so MP must also - have at least one nonzero value. Declared signed-income exports must - also preserve positive/negative support when eCPS has it; - - boolean/string/categorical columns: eCPS has more than one unique value, - so MP must also vary. 
- - Columns where eCPS itself is all-zero/single-valued are treated as fillers - and do not require MP support. Explicit exemptions are reserved for known - rare, computed-downstream, or intentionally absent variables. - """ - period_key = str(int(period)) - exempt = {str(column) for column in exempt_columns} - checked_columns: list[str] = [] - baseline_populated_columns: list[str] = [] - baseline_filler_columns: list[str] = [] - issues: list[ColumnSupportIssue] = [] - - import h5py - - with ( - h5py.File(candidate_h5, "r") as candidate, - h5py.File(baseline_h5, "r") as baseline, - ): - for column in sorted(required_columns): - if column in exempt: - continue - baseline_values = _h5_column_values( - baseline, - column, - period_key=period_key, - ) - if baseline_values is None: - continue - checked_columns.append(column) - baseline_stats = _support_stats(column, baseline_values) - requirement = _support_requirement( - baseline_stats, - require_signed_numeric=column in SIGNED_NUMERIC_SUPPORT_COLUMNS, - ) - if requirement is None: - baseline_filler_columns.append(column) - continue - baseline_populated_columns.append(column) - candidate_values = _h5_column_values( - candidate, - column, - period_key=period_key, - ) - candidate_stats = ( - None - if candidate_values is None - else _support_stats(column, candidate_values) - ) - if not _satisfies_support_requirement( - candidate_stats, - requirement=requirement, - ): - issues.append( - ColumnSupportIssue( - column=column, - requirement=requirement, - baseline=baseline_stats, - candidate=candidate_stats, - ) - ) - - return SupportDiff( - issues=issues, - checked_columns=checked_columns, - baseline_populated_columns=baseline_populated_columns, - baseline_filler_columns=baseline_filler_columns, - exempt_columns=sorted(exempt & set(required_columns)), - ) - - -def compute_spec_variable_manifest_diff( - *, - contract: dict, - spec_path: Path = DEFAULT_SPEC_PATH, -) -> SpecVariableManifestDiff: - """Compare ``spec.variables`` 
with required exports and declared imputations.""" - text = spec_path.read_text(encoding="utf-8") - variables = _parse_top_level_mapping_keys(text, "variables") - if not variables: - raise ValueError(f"Spec {spec_path} is missing a variables mapping.") - - required = {str(column) for column in contract["required"]} - declared_imputation = _parse_imputation_vars(text) - expected = required | declared_imputation - return SpecVariableManifestDiff( - spec_path=str(spec_path), - required_contract_count=len(required), - declared_imputation_count=len(declared_imputation), - variable_manifest_count=len(variables), - missing_required=sorted(required - variables), - missing_declared_imputation=sorted(declared_imputation - variables), - extra_variables=sorted(variables - expected), - ) - - -def _top_level_section_lines(text: str, section: str) -> list[str]: - """Return lines in a simple top-level YAML section. - - This module is intentionally importable with only the column-parity - job's minimal dependencies, so the fast manifest gate avoids PyYAML. - The parser only needs the committed spec's shape: top-level sections, - mapping keys under ``variables:``, and imputation ``vars`` lists. 
- """ - section_header = f"{section}:" - lines = text.splitlines() - for index, line in enumerate(lines): - if line.strip() == section_header and not line.startswith((" ", "\t")): - body: list[str] = [] - for candidate in lines[index + 1 :]: - stripped = candidate.strip() - if ( - stripped - and not candidate.startswith((" ", "\t")) - and re.match(r"^[A-Za-z_][A-Za-z0-9_-]*:", stripped) - ): - break - body.append(candidate) - return body - return [] - - -def _parse_top_level_mapping_keys(text: str, section: str) -> set[str]: - """Parse direct mapping keys from a top-level section.""" - keys: set[str] = set() - for line in _top_level_section_lines(text, section): - match = re.match(r"^ ([A-Za-z_][A-Za-z0-9_]*):(?:\s|$)", line) - if match: - keys.add(match.group(1)) - return keys - - -def _parse_inline_list(raw: str) -> list[str]: - """Parse the simple YAML inline list form used in tests.""" - stripped = raw.strip() - if not stripped.startswith("[") or not stripped.endswith("]"): - return [] - body = stripped[1:-1].strip() - if not body: - return [] - return [ - parsed - for item in body.split(",") - if (parsed := _parse_simple_yaml_scalar(item)) is not None - ] - - -def _parse_simple_yaml_scalar(raw: str) -> str | None: - """Parse a simple YAML scalar variable name with optional inline comment.""" - value = raw.strip() - quote: str | None = None - unquoted = [] - for index, char in enumerate(value): - if char in {"'", '"'}: - if quote is None: - quote = char - elif quote == char: - quote = None - if char == "#" and quote is None and (index == 0 or value[index - 1].isspace()): - break - unquoted.append(char) - value = "".join(unquoted).strip() - if ( - len(value) >= 2 - and value[0] == value[-1] - and value[0] in {"'", '"'} - ): - value = value[1:-1].strip() - if re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", value): - return value - return None - - -def _parse_imputation_vars(text: str) -> set[str]: - """Parse variable names from imputation step ``vars`` lists.""" - 
variables: set[str] = set() - in_vars_block = False - for line in _top_level_section_lines(text, "imputation"): - if re.match(r"^ -\s", line): - in_vars_block = False - - inline_match = re.match(r"^ vars:\s*(\[.*\])(?:\s+#.*)?$", line) - if inline_match: - variables.update(_parse_inline_list(inline_match.group(1))) - in_vars_block = False - continue - - if re.match(r"^ vars:\s*$", line): - in_vars_block = True - continue - - if in_vars_block: - item_match = re.match(r"^ -\s+(.+?)\s*$", line) - if item_match: - if parsed := _parse_simple_yaml_scalar(item_match.group(1)): - variables.add(parsed) - continue - if re.match(r"^ [A-Za-z_][A-Za-z0-9_-]*:", line): - in_vars_block = False - - return variables - - -def _h5_column_values( - handle: Any, - column: str, - *, - period_key: str, -): - """Return one H5 column's values, supporting grouped and flat layouts.""" - if column not in handle: - return None - item = handle[column] - import h5py - import numpy as np - - if isinstance(item, h5py.Group): - if period_key not in item: - return None - item = item[period_key] - if not isinstance(item, h5py.Dataset): - return None - return np.asarray(item) - - -def _support_stats(column: str, values) -> ColumnSupportStats: - """Summarize nonzero support and uniqueness for an exported column.""" - import numpy as np - - array = np.asarray(values) - flattened = array.reshape(-1) - unique_count = int(len(np.unique(flattened))) if flattened.size else 0 - kind = _support_kind(flattened) - nonzero_count: int | None = None - positive_count: int | None = None - negative_count: int | None = None - if kind == "numeric": - numeric = flattened - if np.issubdtype(numeric.dtype, np.floating): - numeric = numeric[np.isfinite(numeric)] - nonzero_count = int(np.count_nonzero(numeric)) - positive_count = int(np.count_nonzero(numeric > 0)) - negative_count = int(np.count_nonzero(numeric < 0)) - return ColumnSupportStats( - column=column, - kind=kind, - row_count=int(flattened.size), - 
nonzero_count=nonzero_count, - positive_count=positive_count, - negative_count=negative_count, - unique_count=unique_count, - ) - - -def _support_kind(values) -> str: - """Classify a NumPy array for support checking.""" - import numpy as np - - dtype = np.asarray(values).dtype - if np.issubdtype(dtype, np.bool_): - return "categorical" - if np.issubdtype(dtype, np.number): - return "numeric" - return "categorical" - - -def _support_requirement( - stats: ColumnSupportStats, - *, - require_signed_numeric: bool = True, -) -> str | None: - """Return the support MP must match for an eCPS column, if any.""" - if stats.kind == "numeric": - if (stats.nonzero_count or 0) <= 0: - return None - has_positive = (stats.positive_count or 0) > 0 - has_negative = (stats.negative_count or 0) > 0 - if require_signed_numeric and has_positive and has_negative: - return "numeric_signed" - if has_positive: - return "numeric_positive" - if has_negative: - return "numeric_negative" - return "numeric_nonzero" - return "categorical_variation" if stats.unique_count > 1 else None - - -def _satisfies_support_requirement( - stats: ColumnSupportStats | None, - *, - requirement: str, -) -> bool: - """Return whether candidate stats meet an eCPS-derived requirement.""" - if stats is None: - return False - if requirement in { - "numeric_nonzero", - "numeric_positive", - "numeric_negative", - "numeric_signed", - }: - if stats.kind != "numeric": - return stats.unique_count > 1 - if requirement == "numeric_nonzero": - return (stats.nonzero_count or 0) > 0 - if requirement == "numeric_positive": - return (stats.positive_count or 0) > 0 - if requirement == "numeric_negative": - return (stats.negative_count or 0) > 0 - return (stats.positive_count or 0) > 0 and (stats.negative_count or 0) > 0 - if requirement == "categorical_variation": - return stats.unique_count > 1 - raise ValueError(f"Unknown support requirement: {requirement}") - - -def load_contract(path: Path) -> dict: - """Load and validate the 
column-parity contract JSON.""" - with open(path) as f: - contract = json.load(f) - for key in ("required", "forbidden"): - if key not in contract: - raise ValueError(f"Contract {path} is missing required key '{key}'.") - contract.setdefault("ecps_internal_optional", []) - contract.setdefault("formula_owned_excluded", []) - return contract - - -def _columns_from_h5(h5_path: Path) -> set[str]: - """Return top-level base column names from an exported H5. - - Columns may be datasets named ```` or groups ``/``; - both collapse to the base name. This intentionally duplicates the tiny - parser used by the artifact gate so the fast column CI can run without - importing the full Microplex stack. - """ - import h5py - - with h5py.File(h5_path, "r") as handle: - return {name.split("/")[0] for name in handle.keys()} - - -def _columns_from_json(json_path: Path) -> set[str]: - """Return base column names from a JSON list (no data file needed).""" - with open(json_path) as f: - names = json.load(f) - if not isinstance(names, list): - raise ValueError( - f"--columns-json {json_path} must contain a JSON list of column names." - ) - return {str(name).split("/")[0] for name in names} - - -def _columns_from_entity_tables( - entity_tables_path: Path, - *, - direct_override_variables: tuple[str, ...] = (), -) -> set[str]: - """Return export column names from a saved PE entity-table checkpoint. - - This is the pre-calibration path: post-imputation entity tables already - determine the final H5 schema, while calibration only changes weights. - Imports stay deferred so the JSON/H5 fast paths do not import Microplex. 
- """ - from microplex_us.policyengine.us import ( - build_policyengine_us_export_column_names, - load_us_pipeline_checkpoint, - ) - - tables, _metadata = load_us_pipeline_checkpoint(entity_tables_path) - return build_policyengine_us_export_column_names( - tables, - direct_override_variables=direct_override_variables, - ) - - -def _bullet_lines(items: list[str]) -> list[str]: - """Render a list as indented bullets, or a placeholder if empty.""" - if not items: - return [" (none)"] - return [f" - {item}" for item in items] - - -def _format_report( - diff: ColumnDiff, - *, - source: str, - n_present: int, - n_required: int, - n_forbidden: int, - support_diff: SupportDiff | None = None, - spec_diff: SpecVariableManifestDiff | None = None, -) -> str: - """Build a human-readable report for the diff.""" - lines = [ - "eCPS column-parity check", - f" source: {source}", - f" columns present: {n_present}", - f" required (contract): {n_required}", - f" forbidden (contract): {n_forbidden}", - "", - f" missing_required ({len(diff.missing_required)}):", - *_bullet_lines(diff.missing_required), - f" forbidden_present ({len(diff.forbidden_present)}):", - *_bullet_lines(diff.forbidden_present), - f" extra_unknown (informational, {len(diff.extra_unknown)}):", - *_bullet_lines(diff.extra_unknown), - ] - if support_diff is not None: - lines.extend( - [ - "", - " eCPS support parity:", - f" checked_columns: {len(support_diff.checked_columns)}", - f" eCPS-populated columns: {len(support_diff.baseline_populated_columns)}", - f" eCPS filler columns: {len(support_diff.baseline_filler_columns)}", - f" explicit support exemptions: {len(support_diff.exempt_columns)}", - f" unsupported_populated ({len(support_diff.issues)}):", - *_bullet_lines( - [ - f"{issue.column} ({issue.requirement}; " - f"eCPS={_compact_stats(issue.baseline)}, " - f"candidate={_compact_stats(issue.candidate)})" - for issue in support_diff.issues - ] - ), - ] - ) - if spec_diff is not None: - lines.extend( - [ - "", - " 
spec variable manifest:", - f" spec: {spec_diff.spec_path}", - f" required contract variables: {spec_diff.required_contract_count}", - f" declared imputation variables: {spec_diff.declared_imputation_count}", - f" spec.variables count: {spec_diff.variable_manifest_count}", - f" missing_required ({len(spec_diff.missing_required)}):", - *_bullet_lines(spec_diff.missing_required), - " missing_declared_imputation " - f"({len(spec_diff.missing_declared_imputation)}):", - *_bullet_lines(spec_diff.missing_declared_imputation), - f" extra_variables ({len(spec_diff.extra_variables)}):", - *_bullet_lines(spec_diff.extra_variables), - ] - ) - ok = ( - diff.ok - and (support_diff is None or support_diff.ok) - and (spec_diff is None or spec_diff.ok) - ) - lines.extend(["", " RESULT: " + ("PASS" if ok else "FAIL")]) - return "\n".join(lines) - - -def _compact_stats(stats: ColumnSupportStats | None) -> str: - """Render support stats compactly for CLI output.""" - if stats is None: - return "missing" - if stats.kind == "numeric": - return ( - f"nonzero {stats.nonzero_count}/{stats.row_count}; " - f"+{stats.positive_count}, -{stats.negative_count}" - ) - return f"unique {stats.unique_count}/{stats.row_count}" - - -def support_diff_to_dict(diff: SupportDiff) -> dict[str, Any]: - """Return a JSON-serializable support parity payload.""" - payload = asdict(diff) - return payload - - -def main(argv: list[str] | None = None) -> int: - """Run the column-parity check; return the process exit code.""" - parser = argparse.ArgumentParser( - prog="check_export_columns", - description=( - "Fast eCPS column-parity check: compare a candidate export's " - "columns to the frozen eCPS contract. Produces no data." - ), - ) - parser.add_argument( - "h5path", - nargs="?", - help="Path to an exported H5 whose columns are checked.", - ) - parser.add_argument( - "--columns-json", - metavar="FILE", - help=( - "Path to a JSON list of column names to check instead of an " - "H5 (the no-data CI path). 
Mutually exclusive with h5path." - ), - ) - parser.add_argument( - "--entity-tables", - metavar="DIR", - help=( - "Path to a saved PolicyEngine entity-table checkpoint/stage " - "directory (for example checkpoints/post-imputation). Checks " - "the export schema before microsimulation/calibration/H5." - ), - ) - parser.add_argument( - "--direct-override-variable", - action="append", - default=[], - metavar="VARIABLE", - help=( - "PolicyEngine formula variable intentionally exported from source " - "data. Repeat for each override used by the build." - ), - ) - parser.add_argument( - "--contract", - metavar="FILE", - default=str(DEFAULT_CONTRACT_PATH), - help="Override the contract JSON (default: committed contract).", - ) - parser.add_argument( - "--spec", - metavar="FILE", - help=( - "Spec YAML whose variables block must cover the contract and " - "declared imputation vars. Defaults to the committed US spec when " - "using the committed contract." - ), - ) - parser.add_argument( - "--skip-spec-variable-manifest", - action="store_true", - help="Skip the spec.variables manifest coverage check.", - ) - parser.add_argument( - "--support-baseline", - metavar="H5", - help=( - "Pinned eCPS baseline H5. When supplied with an H5 candidate, " - "also fail if eCPS has nonzero/variant support for a required " - "exported column and the candidate is all-zero/constant." - ), - ) - parser.add_argument( - "--period", - type=int, - default=2024, - help="Tax year period to inspect for H5 support parity (default: 2024).", - ) - parser.add_argument( - "--support-exempt-column", - action="append", - default=[], - metavar="COLUMN", - help=( - "Required export column exempt from support parity because it is " - "declared rare, computed downstream, or intentionally absent. " - "Repeat for each explicit exception." 
- ), - ) - parser.add_argument( - "--support-diagnostics-json", - metavar="FILE", - help="Optional path to write support-parity diagnostics JSON.", - ) - args = parser.parse_args(argv) - - selected_inputs = [ - bool(args.h5path), - bool(args.columns_json), - bool(args.entity_tables), - ] - if sum(selected_inputs) != 1: - parser.error( - "provide exactly one of an H5 path, --columns-json, or --entity-tables." - ) - if args.support_baseline and not args.h5path: - parser.error("--support-baseline requires an H5 candidate path.") - - contract = load_contract(Path(args.contract)) - required = set(contract["required"]) - forbidden = set(contract["forbidden"]) - optional = set(contract["ecps_internal_optional"]) - excluded = set(contract.get("formula_owned_excluded", [])) - - if args.columns_json: - source = args.columns_json - present = _columns_from_json(Path(args.columns_json)) - elif args.entity_tables: - source = args.entity_tables - present = _columns_from_entity_tables( - Path(args.entity_tables), - direct_override_variables=tuple(args.direct_override_variable), - ) - else: - source = args.h5path - present = _columns_from_h5(Path(args.h5path)) - - diff = compute_column_diff( - present, - required=required, - forbidden=forbidden, - optional=optional, - excluded=excluded, - ) - contract_path = Path(args.contract).resolve() - spec_path = None - if not args.skip_spec_variable_manifest: - if args.spec: - spec_path = Path(args.spec) - elif contract_path == DEFAULT_CONTRACT_PATH.resolve(): - spec_path = DEFAULT_SPEC_PATH - spec_diff = ( - None - if spec_path is None - else compute_spec_variable_manifest_diff( - contract=contract, - spec_path=Path(spec_path), - ) - ) - support_diff = None - if args.support_baseline: - support_exempt = set(contract.get("support_exemptions", [])) | set( - args.support_exempt_column - ) - support_diff = compute_support_diff( - Path(args.h5path), - baseline_h5=Path(args.support_baseline), - period=int(args.period), - required_columns=required, 
- exempt_columns=support_exempt, - ) - if args.support_diagnostics_json: - output_path = Path(args.support_diagnostics_json) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text( - json.dumps(support_diff_to_dict(support_diff), indent=2) + "\n" - ) - print( - _format_report( - diff, - source=source, - n_present=len(present), - n_required=len(required), - n_forbidden=len(forbidden), - support_diff=support_diff, - spec_diff=spec_diff, - ) - ) - return ( - 0 - if ( - diff.ok - and (support_diff is None or support_diff.ok) - and (spec_diff is None or spec_diff.ok) - ) - else 1 - ) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/microplex_us/pipelines/check_site_snapshot.py b/src/microplex_us/pipelines/check_site_snapshot.py deleted file mode 100644 index 03fada8b..00000000 --- a/src/microplex_us/pipelines/check_site_snapshot.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Validate that the committed US site snapshot matches its source artifact.""" - -from __future__ import annotations - -import argparse -import difflib -import json -from pathlib import Path - -from microplex_us.pipelines.data_flow_snapshot import ( - build_us_microplex_data_flow_snapshot, -) -from microplex_us.pipelines.site_snapshot import build_us_microplex_site_snapshot -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) - - -def check_us_microplex_site_snapshot( - snapshot_path: str | Path = "artifacts/site_snapshot_us.json", -) -> Path: - """Raise if the saved US site snapshot is stale or inconsistent.""" - snapshot_file = Path(snapshot_path) - snapshot = json.loads(snapshot_file.read_text()) - artifact_dir = _resolve_artifact_dir(snapshot_file, snapshot["sourceArtifact"]) - _check_data_flow_snapshot_current(artifact_dir) - regenerated = build_us_microplex_site_snapshot( - artifact_dir, - snapshot_path=snapshot_file, - ) - if snapshot != regenerated: - raise SystemExit(_snapshot_diff(snapshot, regenerated, 
snapshot_file)) - return snapshot_file - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Check that the canonical US site snapshot is up to date." - ) - parser.add_argument( - "snapshot_path", - nargs="?", - default="artifacts/site_snapshot_us.json", - help="Path to the saved US site snapshot JSON.", - ) - args = parser.parse_args(argv) - checked = check_us_microplex_site_snapshot(args.snapshot_path) - print(checked) - return 0 - - -def _snapshot_diff( - saved: dict, - regenerated: dict, - snapshot_file: Path, -) -> str: - saved_text = json.dumps(saved, indent=2, sort_keys=True).splitlines() - regenerated_text = json.dumps(regenerated, indent=2, sort_keys=True).splitlines() - diff = "\n".join( - difflib.unified_diff( - saved_text, - regenerated_text, - fromfile=str(snapshot_file), - tofile=f"{snapshot_file} (regenerated)", - lineterm="", - ) - ) - return f"US site snapshot is stale or inconsistent:\n{diff}" - - -def _resolve_artifact_dir(snapshot_file: Path, source_artifact: dict) -> Path: - artifact_path = source_artifact.get("artifactPath") - if isinstance(artifact_path, str) and artifact_path: - artifact_dir = (snapshot_file.parent / artifact_path).resolve() - if (artifact_dir / "manifest.json").exists(): - return artifact_dir - - artifact_ref = source_artifact.get("artifactRef") - if not isinstance(artifact_ref, str) or not artifact_ref: - raise SystemExit("US site snapshot is missing sourceArtifact.artifactRef") - artifact_dir = (snapshot_file.parent / artifact_ref).resolve() - if not (artifact_dir / "manifest.json").exists(): - raise SystemExit( - f"US site snapshot artifactRef does not resolve to a manifest: {artifact_ref}" - ) - return artifact_dir - - -def _check_data_flow_snapshot_current(artifact_dir: Path) -> None: - snapshot_path = resolve_us_stage_artifact_contract_path( - artifact_dir, - "08_dataset_assembly", - "data_flow_snapshot", - ) - if not snapshot_path.exists(): - raise SystemExit("US 
data-flow snapshot is missing from the artifact bundle.") - frozen_snapshot = json.loads(snapshot_path.read_text()) - fresh_snapshot = build_us_microplex_data_flow_snapshot( - artifact_dir, - prefer_saved=False, - ) - if frozen_snapshot != fresh_snapshot: - raise SystemExit( - "US data-flow snapshot is stale or inconsistent with current code." - ) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/compact_policyengine_dataset.py b/src/microplex_us/pipelines/compact_policyengine_dataset.py deleted file mode 100644 index e2d00b18..00000000 --- a/src/microplex_us/pipelines/compact_policyengine_dataset.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Compact PolicyEngine time-period H5 datasets by household weight.""" - -from __future__ import annotations - -import argparse -import json -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -import h5py -import numpy as np - -ENTITY_ID_VARIABLES = { - "household": "household_id", - "person": "person_id", - "tax_unit": "tax_unit_id", - "spm_unit": "spm_unit_id", - "family": "family_id", - "marital_unit": "marital_unit_id", -} - -PERSON_ENTITY_LINK_VARIABLES = { - "household": "person_household_id", - "tax_unit": "person_tax_unit_id", - "spm_unit": "person_spm_unit_id", - "family": "person_family_id", - "marital_unit": "person_marital_unit_id", -} - -STRUCTURAL_VARIABLE_ENTITIES = { - "household_id": "household", - "household_weight": "household", - "person_id": "person", - "person_household_id": "person", - "person_tax_unit_id": "person", - "person_spm_unit_id": "person", - "person_family_id": "person", - "person_marital_unit_id": "person", - "person_weight": "person", - "tax_unit_id": "tax_unit", - "spm_unit_id": "spm_unit", - "family_id": "family", - "marital_unit_id": "marital_unit", -} - - -def compact_policyengine_dataset_by_household_weight( - *, - input_dataset_path: str | Path, - output_dataset_path: str | Path, - households: int, - period: 
int = 2024, - weights_path: str | Path | None = None, - rescale_to_total: bool = True, - target_total_weight: float | None = None, -) -> dict[str, Any]: - """Write a household-subset PE H5, keeping the largest household weights.""" - - input_path = Path(input_dataset_path).expanduser() - output_path = Path(output_dataset_path).expanduser() - if households <= 0: - raise ValueError("households must be positive") - - period_key = str(period) - with h5py.File(input_path, "r") as source: - household_ids = _period_array(source, "household_id", period_key) - source_household_weights = np.asarray( - _period_array(source, "household_weight", period_key), - dtype=np.float64, - ) - if household_ids.shape[0] != source_household_weights.shape[0]: - raise ValueError("household_id and household_weight lengths differ") - - selection_weights = ( - np.load(Path(weights_path).expanduser()).astype(np.float64) - if weights_path is not None - else source_household_weights - ) - if selection_weights.ndim != 1: - raise ValueError("selection weights must be a one-dimensional array") - if selection_weights.shape[0] != household_ids.shape[0]: - raise ValueError( - "selection weights length does not match household_id length: " - f"{selection_weights.shape[0]} vs {household_ids.shape[0]}" - ) - if households > household_ids.shape[0]: - raise ValueError( - "households cannot exceed source household count: " - f"{households} > {household_ids.shape[0]}" - ) - - selected_by_weight = np.argsort(-selection_weights, kind="stable")[:households] - selected_source_order = np.sort(selected_by_weight) - selected_household_ids = household_ids[selected_source_order] - selected_weights = source_household_weights[selected_source_order].astype( - np.float64, - copy=True, - ) - original_selected_weight_sum = float(selected_weights.sum()) - resolved_target_total = ( - float(target_total_weight) - if target_total_weight is not None - else float(source_household_weights.sum()) - ) - if rescale_to_total: - if 
original_selected_weight_sum <= 0: - raise ValueError("selected household weights sum to zero") - selected_weights *= resolved_target_total / original_selected_weight_sum - - metadata = _build_metadata(source, period_key) - masks = _build_entity_masks(metadata, selected_household_ids) - _write_compacted_dataset( - source, - output_path, - period_key=period_key, - metadata=metadata, - masks=masks, - ) - - with h5py.File(output_path, "r+") as output: - weight_dataset = output["household_weight"][period_key] - weight_dataset[...] = selected_weights.astype(weight_dataset.dtype) - entity_counts = { - entity: int(len(output[variable][period_key])) - for entity, variable in ENTITY_ID_VARIABLES.items() - if variable in output and period_key in output[variable] - } - output_weight_sum = float( - np.asarray(output["household_weight"][period_key], dtype=np.float64).sum() - ) - - summary = { - "schema_version": 1, - "created_at": datetime.now(UTC).isoformat(), - "input_dataset": str(input_path.resolve()), - "output_dataset": str(output_path.resolve()), - "period": int(period), - "selection_method": "largest_household_weight", - "source_households": int(household_ids.shape[0]), - "selected_households": int(households), - "source_weight_sum": float(source_household_weights.sum()), - "selected_weight_sum_before_rescale": original_selected_weight_sum, - "output_weight_sum": output_weight_sum, - "target_total_weight": resolved_target_total if rescale_to_total else None, - "rescale_to_total": bool(rescale_to_total), - "selection_weight_min_kept": float(selection_weights[selected_by_weight[-1]]), - "selection_weight_max_kept": float(selection_weights[selected_by_weight[0]]), - "entity_counts": entity_counts, - "source_size_bytes": int(input_path.stat().st_size), - "output_size_bytes": int(output_path.stat().st_size), - "source_size_ratio": float( - output_path.stat().st_size / input_path.stat().st_size - ), - } - return summary - - -def _period_array(source: h5py.File, variable: str, 
period_key: str) -> np.ndarray: - if variable not in source or period_key not in source[variable]: - raise ValueError(f"{source.filename} is missing {variable}/{period_key}") - return np.asarray(source[variable][period_key]) - - -def _copy_attrs( - source: h5py.Group | h5py.Dataset, destination: h5py.Group | h5py.Dataset -) -> None: - for key, value in source.attrs.items(): - destination.attrs[key] = value - - -def _build_metadata(source: h5py.File, period_key: str) -> dict[str, Any]: - entity_ids = { - entity: _period_array(source, variable, period_key) - for entity, variable in ENTITY_ID_VARIABLES.items() - if variable in source and period_key in source[variable] - } - person_links = { - entity: _period_array(source, variable, period_key) - for entity, variable in PERSON_ENTITY_LINK_VARIABLES.items() - if variable in source and period_key in source[variable] - } - if "household" not in entity_ids or "person" not in entity_ids: - raise ValueError("input dataset must include household_id and person_id") - if "household" not in person_links: - raise ValueError("input dataset must include person_household_id") - - entity_lengths = {entity: int(len(values)) for entity, values in entity_ids.items()} - length_entities: dict[int, list[str]] = {} - for entity, length in entity_lengths.items(): - length_entities.setdefault(length, []).append(entity) - - policyengine_variable_entities = _load_policyengine_variable_entities() - variable_entities: dict[str, str] = {} - for variable in source.keys(): - if period_key not in source[variable]: - continue - dataset = source[variable][period_key] - entity = _infer_variable_entity( - variable, - int(len(dataset)) if dataset.shape else 0, - entity_lengths=entity_lengths, - length_entities=length_entities, - policyengine_variable_entities=policyengine_variable_entities, - ) - variable_entities[variable] = entity - - return { - "entity_ids": entity_ids, - "person_links": person_links, - "variable_entities": variable_entities, - } - - 
-def _infer_variable_entity( - variable: str, - array_length: int, - *, - entity_lengths: dict[str, int], - length_entities: dict[int, list[str]], - policyengine_variable_entities: dict[str, str], -) -> str: - structural_entity = STRUCTURAL_VARIABLE_ENTITIES.get(variable) - if structural_entity is not None: - return structural_entity - - policyengine_entity = policyengine_variable_entities.get(variable) - if policyengine_entity in entity_lengths: - return policyengine_entity - - matching_entities = length_entities.get(array_length, []) - if len(matching_entities) == 1: - return matching_entities[0] - - raise ValueError( - f"Could not infer entity for {variable!r} with length {array_length}; " - f"matches={matching_entities}" - ) - - -def _load_policyengine_variable_entities() -> dict[str, str]: - try: - from policyengine_us import Microsimulation # noqa: PLC0415 - except Exception: - return {} - try: - variables = Microsimulation().tax_benefit_system.variables - except Exception: - return {} - return {name: str(definition.entity.key) for name, definition in variables.items()} - - -def _build_entity_masks( - metadata: dict[str, Any], - selected_household_ids: np.ndarray, -) -> dict[str, np.ndarray]: - household_mask = np.isin( - metadata["entity_ids"]["household"], - selected_household_ids, - ) - person_mask = np.isin( - metadata["person_links"]["household"], - selected_household_ids, - ) - masks = {"household": household_mask, "person": person_mask} - for entity in ("tax_unit", "spm_unit", "family", "marital_unit"): - if entity not in metadata["entity_ids"]: - continue - if entity not in metadata["person_links"]: - raise ValueError( - f"input dataset includes {ENTITY_ID_VARIABLES[entity]} but lacks " - f"{PERSON_ENTITY_LINK_VARIABLES[entity]}" - ) - selected_entity_ids = np.unique(metadata["person_links"][entity][person_mask]) - masks[entity] = np.isin(metadata["entity_ids"][entity], selected_entity_ids) - return masks - - -def _write_compacted_dataset( - source: 
h5py.File, - output_path: Path, - *, - period_key: str, - metadata: dict[str, Any], - masks: dict[str, np.ndarray], -) -> None: - output_path.parent.mkdir(parents=True, exist_ok=True) - with h5py.File(output_path, "w") as output: - _copy_attrs(source, output) - for variable in source.keys(): - if period_key not in source[variable]: - continue - entity = metadata["variable_entities"][variable] - group = output.create_group(variable) - _copy_attrs(source[variable], group) - for source_period_key in source[variable].keys(): - dataset = source[variable][source_period_key] - values = np.asarray(dataset) - if values.shape: - values = values[masks[entity]] - output_dataset = group.create_dataset(source_period_key, data=values) - _copy_attrs(dataset, output_dataset) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Compact a PolicyEngine US H5 by keeping top household weights." - ) - parser.add_argument("--input-dataset", required=True) - parser.add_argument("--output-dataset", required=True) - parser.add_argument("--households", type=int, required=True) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--weights-npy") - parser.add_argument("--target-total-weight", type=float) - parser.add_argument("--no-rescale", action="store_true") - parser.add_argument("--summary-json") - args = parser.parse_args(argv) - - summary = compact_policyengine_dataset_by_household_weight( - input_dataset_path=args.input_dataset, - output_dataset_path=args.output_dataset, - households=args.households, - period=args.period, - weights_path=args.weights_npy, - rescale_to_total=not args.no_rescale, - target_total_weight=args.target_total_weight, - ) - if args.summary_json: - summary_path = Path(args.summary_json).expanduser() - summary_path.parent.mkdir(parents=True, exist_ok=True) - summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True)) - print(summary_path) - else: - print(json.dumps(summary, 
indent=2, sort_keys=True)) - return 0 diff --git a/src/microplex_us/pipelines/dashboard.py b/src/microplex_us/pipelines/dashboard.py deleted file mode 100644 index 49482728..00000000 --- a/src/microplex_us/pipelines/dashboard.py +++ /dev/null @@ -1,2162 +0,0 @@ -"""Build the living Microplex diagnostic dashboard payload.""" - -from __future__ import annotations - -import argparse -import csv -import json -import re -import subprocess -from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - canonicalize_us_pipeline_stage_id, - get_us_stage_artifact_contract, -) - -_ROOT = Path(__file__).resolve().parents[3] -_DEFAULT_ARTIFACT_ROOT = _ROOT / "artifacts" -_DEFAULT_OUTPUT_PATH = _DEFAULT_ARTIFACT_ROOT / "microplex_dashboard_current.json" -_DEFAULT_TARGET_DIAGNOSTICS_PATH = ( - _DEFAULT_ARTIFACT_ROOT / "pe_native_target_diagnostics_current.json" -) -_DEFAULT_POLICYENGINE_US_DATA_REPO = Path( - "/Users/maxghenis/PolicyEngine/policyengine-us-data" -) - -_PE_MODEL_SLOTS = ( - { - "id": "policyengine_legacy_ecps", - "label": "PE legacy enhanced CPS", - "status": "available_as_baseline", - "notes": ( - "The incumbent enhanced CPS is represented as the baseline side of " - "PE-native score artifacts." - ), - }, - { - "id": "policyengine_small_l0", - "label": "PE small-L0 local model", - "status": "missing_weight_package", - "notes": ( - "Mapped to policyengine-us-data local_net_worth_100 when present. " - "The weight package is not itself a scored H5 dataset." - ), - }, - { - "id": "policyengine_big_l0", - "label": "PE big-L0 local model", - "status": "missing_weight_package", - "notes": ( - "Mapped to policyengine-us-data local_net_worth_100_e300 when " - "present. The weight package is not itself a scored H5 dataset." 
- ), - }, -) - -_PE_L0_MODEL_SPECS = ( - { - "id": "policyengine_small_l0", - "label": "PE small-L0 local model", - "relative_dir": "policyengine_us_data/storage/calibration/local_net_worth_100", - }, - { - "id": "policyengine_big_l0", - "label": "PE big-L0 local model", - "relative_dir": ( - "policyengine_us_data/storage/calibration/local_net_worth_100_e300" - ), - }, -) - - -@dataclass(frozen=True) -class DashboardPaths: - """Filesystem inputs for the dashboard payload.""" - - artifact_root: Path = _DEFAULT_ARTIFACT_ROOT - target_diagnostics_path: Path = _DEFAULT_TARGET_DIAGNOSTICS_PATH - output_path: Path = _DEFAULT_OUTPUT_PATH - - -def build_dashboard_payload( - *, - artifact_root: str | Path = _DEFAULT_ARTIFACT_ROOT, - target_diagnostics_path: str | Path = _DEFAULT_TARGET_DIAGNOSTICS_PATH, - policyengine_us_data_repo: str | Path | None = _DEFAULT_POLICYENGINE_US_DATA_REPO, - include_tmux: bool = True, -) -> dict[str, Any]: - """Collect scores, local screens, active logs, and target diagnostics.""" - - artifact_root = Path(artifact_root) - target_diagnostics_path = Path(target_diagnostics_path) - score_runs = collect_score_runs(artifact_root) - local_screens = collect_local_target_screens(artifact_root) - pe_l0_models = collect_policyengine_l0_models(policyengine_us_data_repo) - actual_l0_runs = collect_actual_l0_objective_runs(artifact_root) - materialized_l0_scores = collect_materialized_policyengine_l0_scores(artifact_root) - artifact_gate_reports = collect_mp300k_artifact_gate_reports(artifact_root) - release_gate_reports = [ - *artifact_gate_reports, - *_release_gate_reports_from_score_runs( - score_runs, - artifact_gate_reports, - ), - ] - run_contracts = collect_run_contracts(artifact_root) - active_logs = collect_recent_log_summaries(artifact_root) - tmux_sessions = collect_tmux_sessions() if include_tmux else [] - target_diagnostics = _read_json(target_diagnostics_path) - generated_at = datetime.now(timezone.utc).isoformat() - - return { - 
"dashboard_schema_version": 1, - "generated_at": generated_at, - "artifact_root": str(artifact_root), - "target_diagnostics_path": ( - str(target_diagnostics_path) if target_diagnostics is not None else None - ), - "target_diagnostics": target_diagnostics, - "run_board": { - "generated_at": generated_at, - "score_runs": score_runs, - "local_target_screens": local_screens, - "policyengine_l0_models": pe_l0_models, - "actual_l0_objective_runs": actual_l0_runs, - "materialized_policyengine_l0_scores": materialized_l0_scores, - "mp300k_artifact_gate_reports": artifact_gate_reports, - "release_readiness": build_release_readiness(release_gate_reports), - "run_contracts": run_contracts, - "active_logs": active_logs, - "tmux_sessions": tmux_sessions, - "comparison_matrix": build_comparison_matrix( - score_runs, - local_screens, - pe_l0_models, - materialized_l0_scores, - ), - "apples_to_apples": build_apples_to_apples_groups( - score_runs, - local_screens, - pe_l0_models, - materialized_l0_scores, - ), - "assertions": build_dashboard_assertions( - score_runs, - local_screens, - pe_l0_models, - materialized_l0_scores, - ), - }, - } - - -def collect_score_runs(artifact_root: str | Path) -> list[dict[str, Any]]: - """Read completed PE-native score artifacts under ``artifact_root``.""" - - artifact_root = Path(artifact_root) - runs: list[dict[str, Any]] = [] - for path in sorted(_iter_score_paths(artifact_root)): - payload = _read_json(path) - if payload is None: - continue - runs.extend(_score_entries_from_payload(path, payload)) - return sorted( - runs, - key=lambda row: ( - row.get("candidate_loss") is None, - row.get("candidate_loss") or float("inf"), - row.get("artifact_path") or "", - ), - ) - - -def collect_run_contracts(artifact_root: str | Path) -> list[dict[str, Any]]: - """Read machine-readable run contract summaries under ``artifact_root``.""" - - artifact_root = Path(artifact_root) - contracts: list[dict[str, Any]] = [] - for path in 
sorted(artifact_root.rglob("run_summary.json")): - summary = _read_json(path) - if not isinstance(summary, dict): - continue - manifest = _read_json(path.parent / "run_manifest.json") or {} - completed_stages = _canonicalize_run_contract_stage_list( - summary.get("completed_stages") or [] - ) - legacy_completed_stages = [ - stage - for stage in summary.get("completed_stages") or [] - if canonicalize_us_pipeline_stage_id(str(stage)) != stage - ] - contracts.append( - { - "artifact_dir": str(path.parent), - "summary_path": str(path), - "manifest_path": str(path.parent / "run_manifest.json"), - "events_path": str(path.parent / "run_events.jsonl"), - "status_source": "contract", - "run_id": summary.get("run_id") or manifest.get("run_id"), - "attempt_id": summary.get("attempt_id") or manifest.get("attempt_id"), - "status": summary.get("status"), - "active": _canonicalize_run_contract_stage_ref( - summary.get("active") - ), - "started_at": summary.get("started_at"), - "updated_at": summary.get("updated_at"), - "failed_at": summary.get("failed_at"), - "completed_at": summary.get("completed_at"), - "failed_event_id": summary.get("failed_event_id"), - "failure": _canonicalize_run_contract_stage_ref( - summary.get("failure") - ), - "restart": _canonicalize_run_contract_stage_ref( - summary.get("restart") - ), - "completed_stages": completed_stages, - "legacy_completed_stages": legacy_completed_stages, - } - ) - return sorted( - contracts, - key=lambda row: str(row.get("updated_at") or ""), - reverse=True, - ) - - -def _canonicalize_run_contract_stage_list(value: Any) -> list[str]: - canonical_stages: list[str] = [] - seen: set[str] = set() - if not isinstance(value, list | tuple): - return canonical_stages - for item in value: - canonical = canonicalize_us_pipeline_stage_id(str(item)) - if canonical in seen: - continue - seen.add(canonical) - canonical_stages.append(canonical) - return canonical_stages - - -def _canonicalize_run_contract_stage_ref(value: Any) -> Any: - if 
not isinstance(value, dict): - return value - ref = dict(value) - stage_id = ref.get("stage_id") - if stage_id is None: - return ref - canonical = canonicalize_us_pipeline_stage_id(str(stage_id)) - if canonical != stage_id and "legacy_stage_id" not in ref: - ref["legacy_stage_id"] = stage_id - ref["stage_id"] = canonical - return ref - - -def collect_mp300k_artifact_gate_reports( - artifact_root: str | Path, -) -> list[dict[str, Any]]: - """Read persisted mp-300k release-gate reports under ``artifact_root``.""" - - artifact_root = Path(artifact_root) - reports: list[dict[str, Any]] = [] - for path in sorted(artifact_root.rglob("mp300k_artifact_gates.json")): - payload = _read_json(path) - if not isinstance(payload, dict): - continue - summary = payload.get("summary") - gates = payload.get("gates") - candidate_dataset = payload.get("candidate_dataset") - if not isinstance(summary, dict) or not isinstance(gates, dict): - continue - compatibility = _gate_report_gate(gates, "compatibility") - artifact_size = _gate_report_gate(gates, "artifact_size") - runtime = _gate_report_gate(gates, "runtime") - ecps = _gate_report_gate(gates, "ecps_comparison") - compatibility_metrics = compatibility.get("metrics", {}) - candidate_loss = _gate_metric( - ecps, - "candidate_enhanced_cps_native_loss", - ) - baseline_loss = _gate_metric( - ecps, - "baseline_enhanced_cps_native_loss", - ) - n_targets_kept = _gate_metric(ecps, "n_targets_kept") - reports.append( - { - "artifact_path": str(path), - "artifact_dir": str(path.parent), - "artifact_id": payload.get("artifact_id") or path.parent.name, - "product": payload.get("product"), - "period": payload.get("period"), - "status": summary.get("status"), - "passing_required_gate_count": summary.get( - "passing_required_gate_count" - ), - "failed_required_gate_count": summary.get("failed_required_gate_count"), - "unmeasured_required_gate_count": summary.get( - "unmeasured_required_gate_count" - ), - "failed_required_gates": 
summary.get("failed_required_gates") or [], - "unmeasured_required_gates": summary.get("unmeasured_required_gates") - or [], - "candidate_dataset_path": ( - candidate_dataset.get("path") - if isinstance(candidate_dataset, dict) - else None - ), - "candidate_size_bytes": ( - candidate_dataset.get("size_bytes") - if isinstance(candidate_dataset, dict) - else None - ), - "candidate_households": compatibility_metrics.get("household_count"), - "candidate_persons": compatibility_metrics.get("person_count"), - "compatibility_status": compatibility.get("status"), - "artifact_size_status": artifact_size.get("status"), - "artifact_size_ratio": _gate_metric( - artifact_size, - "artifact_size_ratio", - ), - "runtime_status": runtime.get("status"), - "runtime_ratio": _gate_metric(runtime, "runtime_ratio"), - "ecps_comparison_status": ecps.get("status"), - "candidate_loss": candidate_loss, - "baseline_loss": baseline_loss, - "loss_delta": _gate_metric(ecps, "enhanced_cps_native_loss_delta"), - "n_targets_kept": n_targets_kept, - "metric_runtime": _infer_metric_runtime( - path, - { - "baseline_enhanced_cps_native_loss": baseline_loss, - "n_targets_kept": n_targets_kept, - }, - ), - } - ) - return sorted( - reports, - key=lambda row: ( - row.get("status") != "passed", - row.get("candidate_loss") is None, - row.get("candidate_loss") or float("inf"), - row.get("artifact_path") or "", - ), - ) - - -def _release_gate_reports_from_score_runs( - score_runs: list[dict[str, Any]], - artifact_gate_reports: list[dict[str, Any]], -) -> list[dict[str, Any]]: - """Build release-readiness rows from scored artifacts with smoke metadata. - - Full gate reports are preferred when present. This fallback keeps the living - dashboard useful for older candidate artifacts that persisted PE-native - scores and loader-smoke results before the full gate sidecar existed. 
- """ - - gate_report_dirs = { - str(row.get("artifact_dir")) - for row in artifact_gate_reports - if row.get("artifact_dir") - } - reports: list[dict[str, Any]] = [] - for score in score_runs: - artifact_dir = str(score.get("artifact_dir") or "") - if not artifact_dir or artifact_dir in gate_report_dirs: - continue - release_smoke = score.get("release_smoke") - if not isinstance(release_smoke, dict): - continue - product = score.get("record_count_tier") - if not product: - continue - - file_size_passes = release_smoke.get("passes_file_size_ratio_2x") - runtime_passes = release_smoke.get("passes_runtime_ratio_1_25x") - candidate_beats_baseline = score.get("candidate_beats_baseline") - failed_required_gates = [] - unmeasured_required_gates = ["full_gate_report"] - for gate_name, gate_value in ( - ("artifact_size", file_size_passes), - ("runtime", runtime_passes), - ("ecps_comparison", candidate_beats_baseline), - ): - if gate_value is True: - continue - if gate_value is False: - failed_required_gates.append(gate_name) - else: - unmeasured_required_gates.append(gate_name) - - reports.append( - { - "artifact_path": release_smoke.get("artifact_path") - or score.get("artifact_path"), - "artifact_dir": artifact_dir, - "artifact_id": Path(artifact_dir).name, - "product": product, - "period": score.get("period"), - "status": _release_smoke_gate_status( - failed_required_gates, - unmeasured_required_gates, - ), - "passing_required_gate_count": 4 - - len(failed_required_gates) - - len(unmeasured_required_gates), - "failed_required_gate_count": len(failed_required_gates), - "unmeasured_required_gate_count": len(unmeasured_required_gates), - "failed_required_gates": failed_required_gates, - "unmeasured_required_gates": unmeasured_required_gates, - "candidate_dataset_path": score.get("candidate_dataset"), - "candidate_size_bytes": release_smoke.get( - "candidate_file_size_bytes" - ), - "candidate_households": release_smoke.get("candidate_households"), - "candidate_persons": 
None, - "compatibility_status": "smoke_only", - "artifact_size_status": _gate_bool_status(file_size_passes), - "artifact_size_ratio": release_smoke.get("file_size_ratio"), - "runtime_status": _gate_bool_status(runtime_passes), - "runtime_ratio": release_smoke.get("median_runtime_ratio"), - "ecps_comparison_status": _gate_bool_status( - candidate_beats_baseline - ), - "candidate_loss": score.get("candidate_loss"), - "baseline_loss": score.get("baseline_loss"), - "loss_delta": score.get("loss_delta"), - "n_targets_kept": score.get("n_targets_kept"), - "metric_runtime": score.get("metric_runtime"), - "source_kind": "score_release_smoke", - } - ) - return reports - - -def _release_smoke_gate_status( - failed_required_gates: list[str], - unmeasured_required_gates: list[str], -) -> str: - if failed_required_gates: - return "failed" - if unmeasured_required_gates: - return "incomplete" - return "passed" - - -def _gate_bool_status(value: Any) -> str | None: - if value is True: - return "pass" - if value is False: - return "fail" - return None - - -def build_release_readiness( - artifact_gate_reports: list[dict[str, Any]], -) -> list[dict[str, Any]]: - """Summarize release readiness by product and target surface.""" - - grouped: dict[tuple[str, str], list[dict[str, Any]]] = {} - for report in artifact_gate_reports: - product = str(report.get("product") or "unknown") - metric_runtime = str(report.get("metric_runtime") or "unknown") - grouped.setdefault((product, metric_runtime), []).append(report) - - readiness: list[dict[str, Any]] = [] - for (product, metric_runtime), reports in grouped.items(): - passing = [row for row in reports if row.get("status") == "passed"] - failed = [row for row in reports if row.get("status") == "failed"] - incomplete = [row for row in reports if row.get("status") == "incomplete"] - best_passing = min(passing, key=_release_artifact_sort_key) if passing else None - best_fit = min(reports, key=_release_artifact_sort_key) - readiness.append( - { - 
"product": product, - "metric_runtime": metric_runtime, - "target_surface": _target_surface_label(metric_runtime), - "status": _release_status(passing, failed, incomplete), - "artifact_count": len(reports), - "passed_artifact_count": len(passing), - "failed_artifact_count": len(failed), - "incomplete_artifact_count": len(incomplete), - "best_passing_artifact": ( - _release_artifact_summary(best_passing) - if best_passing is not None - else None - ), - "best_fit_artifact": _release_artifact_summary(best_fit), - "best_fit_is_release_ready": best_fit.get("status") == "passed", - "best_fit_release_blockers": _release_blockers(best_fit), - "release_blockers": _group_release_blockers(reports) - if best_passing is None - else [], - "fit_loss_gap_to_best_passing": _fit_loss_gap( - best_passing, - best_fit, - ), - } - ) - return sorted( - readiness, - key=lambda row: ( - row.get("status") != "release_ready", - row.get("product") or "", - row.get("metric_runtime") or "", - ), - ) - - -def _release_artifact_sort_key(row: dict[str, Any]) -> tuple[bool, float, str]: - return ( - row.get("candidate_loss") is None, - row.get("candidate_loss") or float("inf"), - row.get("artifact_path") or "", - ) - - -def _release_status( - passing: list[dict[str, Any]], - failed: list[dict[str, Any]], - incomplete: list[dict[str, Any]], -) -> str: - if passing: - return "release_ready" - if failed: - return "blocked" - if incomplete: - return "incomplete" - return "unmeasured" - - -def _target_surface_label(metric_runtime: str) -> str: - if metric_runtime == "latest_policyengine_us": - return "latest-us-data targets" - if metric_runtime == "legacy_or_patched_runtime": - return "legacy/patched targets" - return "unknown targets" - - -def _release_artifact_summary(row: dict[str, Any]) -> dict[str, Any]: - return { - "artifact_id": row.get("artifact_id"), - "artifact_path": row.get("artifact_path"), - "artifact_dir": row.get("artifact_dir"), - "status": row.get("status"), - 
"candidate_dataset_path": row.get("candidate_dataset_path"), - "candidate_loss": row.get("candidate_loss"), - "baseline_loss": row.get("baseline_loss"), - "loss_delta": row.get("loss_delta"), - "n_targets_kept": row.get("n_targets_kept"), - "candidate_households": row.get("candidate_households"), - "candidate_persons": row.get("candidate_persons"), - "compatibility_status": row.get("compatibility_status"), - "artifact_size_status": row.get("artifact_size_status"), - "artifact_size_ratio": row.get("artifact_size_ratio"), - "runtime_status": row.get("runtime_status"), - "runtime_ratio": row.get("runtime_ratio"), - "ecps_comparison_status": row.get("ecps_comparison_status"), - "failed_required_gates": row.get("failed_required_gates") or [], - "unmeasured_required_gates": row.get("unmeasured_required_gates") or [], - } - - -def _release_blockers(row: dict[str, Any]) -> list[str]: - return sorted( - { - str(name) - for name in (row.get("failed_required_gates") or []) - + (row.get("unmeasured_required_gates") or []) - } - ) - - -def _group_release_blockers(reports: list[dict[str, Any]]) -> list[str]: - blockers: set[str] = set() - for report in reports: - blockers.update(_release_blockers(report)) - return sorted(blockers) - - -def _fit_loss_gap( - best_passing: dict[str, Any] | None, - best_fit: dict[str, Any], -) -> float | None: - if best_passing is None: - return None - passing_loss = _number_or_none(best_passing.get("candidate_loss")) - fit_loss = _number_or_none(best_fit.get("candidate_loss")) - if passing_loss is None or fit_loss is None: - return None - return passing_loss - fit_loss - - -def _gate_report_gate(gates: dict[str, Any], name: str) -> dict[str, Any]: - gate = gates.get(name) - return gate if isinstance(gate, dict) else {} - - -def _gate_metric(gate: dict[str, Any], metric: str) -> Any: - metrics = gate.get("metrics") - if not isinstance(metrics, dict): - return None - return metrics.get(metric) - - -def collect_local_target_screens(artifact_root: str 
| Path) -> list[dict[str, Any]]: - """Read cheap matrix-side local target screen summaries.""" - - artifact_root = Path(artifact_root) - screens = [] - for path in sorted(artifact_root.rglob("split_loss_summary.json")): - payload = _read_json(path) - if not isinstance(payload, dict): - continue - score_summary = _local_screen_score_summary(path.parent / "scores.json") - screens.append( - { - "label": payload.get("candidate") or path.parent.name, - "artifact_path": str(path), - "artifact_dir": str(path.parent), - "metric": "latest_pe_matrix_plus_cd_age_screen", - "status": ( - "screen_scored_latest_pe" - if score_summary is not None - else "screen_only" - ), - "broad_loss": _number_or_none( - payload.get("broad_objective_on_latest_pe_matrix_rows") - ), - "pe_native_score_path": ( - str(path.parent / "scores.json") - if score_summary is not None - else None - ), - "pe_native_broad_loss": ( - score_summary.get("candidate_loss") - if score_summary is not None - else None - ), - "pe_native_baseline_loss": ( - score_summary.get("baseline_loss") - if score_summary is not None - else None - ), - "pe_native_loss_delta": ( - score_summary.get("loss_delta") - if score_summary is not None - else None - ), - "pe_native_candidate_beats_baseline": ( - score_summary.get("candidate_beats_baseline") - if score_summary is not None - else None - ), - "latest_pe_baseline_broad_loss": _number_or_none( - payload.get("latest_pe_baseline_broad_loss") - ), - "latest_winner_broad_objective": _number_or_none( - payload.get("latest_winner_broad_objective") - ), - "cd_age_target_weight": _number_or_none( - payload.get("cd_age_target_weight") - ), - "cd_age_mean_abs_relative_error": _number_or_none( - payload.get("cd_age_mean_abs_relative_error") - ), - "cd_age_p90_abs_relative_error": _number_or_none( - payload.get("cd_age_p90_abs_relative_error") - ), - "cd_age_p99_abs_relative_error": _number_or_none( - payload.get("cd_age_p99_abs_relative_error") - ), - "cd_age_max_abs_relative_error": 
_number_or_none( - payload.get("cd_age_max_abs_relative_error") - ), - "weight_sum": _number_or_none(payload.get("weight_sum")), - "weights_path": payload.get("weights_path"), - } - ) - return sorted( - screens, - key=lambda row: ( - row.get("cd_age_mean_abs_relative_error") is None, - row.get("cd_age_mean_abs_relative_error") or float("inf"), - row.get("broad_loss") or float("inf"), - ), - ) - - -def _local_screen_score_summary(path: Path) -> dict[str, Any] | None: - """Return the latest-PE score summary colocated with a local target screen.""" - - payload = _read_json(path) - if isinstance(payload, list): - payload = payload[0] if payload else None - if not isinstance(payload, dict): - return None - summary = payload.get("summary") - if not isinstance(summary, dict): - summary = payload - candidate_loss = _number_or_none(summary.get("candidate_enhanced_cps_native_loss")) - baseline_loss = _number_or_none(summary.get("baseline_enhanced_cps_native_loss")) - if candidate_loss is None: - return None - return { - "candidate_loss": candidate_loss, - "baseline_loss": baseline_loss, - "loss_delta": _number_or_none(summary.get("enhanced_cps_native_loss_delta")), - "candidate_beats_baseline": summary.get("candidate_beats_baseline"), - } - - -def collect_policyengine_l0_models( - policyengine_us_data_repo: str | Path | None, -) -> list[dict[str, Any]]: - """Collect PE local-L0 weight-package diagnostics.""" - - if policyengine_us_data_repo is None: - return [] - repo = Path(policyengine_us_data_repo) - models = [] - for spec in _PE_L0_MODEL_SPECS: - model_dir = repo / spec["relative_dir"] - config = _read_json(model_dir / "unified_run_config.json") - diagnostics = _summarize_unified_diagnostics( - model_dir / "unified_diagnostics.csv" - ) - weights_path = model_dir / "calibration_weights.npy" - present = isinstance(config, dict) and diagnostics is not None - models.append( - { - "id": spec["id"], - "label": spec["label"], - "status": ( - "available_weight_package" if 
present else "missing_weight_package" - ), - "artifact_dir": str(model_dir), - "weights_path": str(weights_path) if weights_path.exists() else None, - "config_path": ( - str(model_dir / "unified_run_config.json") - if isinstance(config, dict) - else None - ), - "diagnostics_path": ( - str(model_dir / "unified_diagnostics.csv") - if diagnostics is not None - else None - ), - "dataset": config.get("dataset") if isinstance(config, dict) else None, - "db_path": config.get("db_path") if isinstance(config, dict) else None, - "n_clones": ( - _number_or_none(config.get("n_clones")) - if isinstance(config, dict) - else None - ), - "epochs": ( - _number_or_none(config.get("epochs")) - if isinstance(config, dict) - else None - ), - "n_targets": ( - _number_or_none(config.get("n_targets")) - if isinstance(config, dict) - else None - ), - "n_records": ( - _number_or_none(config.get("n_records")) - if isinstance(config, dict) - else None - ), - "weight_sum": ( - _number_or_none(config.get("weight_sum")) - if isinstance(config, dict) - else None - ), - "weight_nonzero": ( - _number_or_none(config.get("weight_nonzero")) - if isinstance(config, dict) - else None - ), - "mean_error_pct": ( - _number_or_none(config.get("mean_error_pct")) - if isinstance(config, dict) - else None - ), - "elapsed_seconds": ( - _number_or_none(config.get("elapsed_seconds")) - if isinstance(config, dict) - else None - ), - "diagnostics": diagnostics, - "same_harness_materialization": _inspect_l0_materialization( - model_dir=model_dir, - config=config, - weights_path=weights_path, - ), - "notes": ( - "PE local-L0 fit metrics come from unified_diagnostics.csv. " - "Same-harness broad/latest score remains missing until this " - "weight package is materialized as a scored H5." 
- ), - } - ) - return models - - -def collect_actual_l0_objective_runs( - artifact_root: str | Path, -) -> list[dict[str, Any]]: - """Collect local unified-calibration runs scored on the actual L0 objective.""" - - artifact_root = Path(artifact_root) - runs: list[dict[str, Any]] = [] - for diagnostics_path in sorted(artifact_root.rglob("unified_diagnostics.csv")): - diagnostics = _summarize_unified_diagnostics(diagnostics_path) - if diagnostics is None: - continue - run_dir = diagnostics_path.parent - weights_path = run_dir / "calibration_weights.npy" - config = _read_json(run_dir / "unified_run_config.json") - weight_summary = _weight_file_summary(weights_path) - runs.append( - { - "label": run_dir.name, - "artifact_dir": str(run_dir), - "diagnostics_path": str(diagnostics_path), - "config_path": ( - str(run_dir / "unified_run_config.json") - if isinstance(config, dict) - else None - ), - "weights_path": str(weights_path) if weights_path.exists() else None, - "status": "complete", - "model_id": _infer_actual_l0_model_id(run_dir), - "actual_l0_data_loss": diagnostics.get("actual_l0_data_loss"), - "actual_l0_mean_abs_relative_error_pct": diagnostics.get( - "actual_l0_mean_abs_relative_error_pct" - ), - "n_targets": diagnostics.get("n_targets"), - "n_achievable": diagnostics.get("n_achievable"), - "n_clones": ( - _number_or_none(config.get("n_clones")) - if isinstance(config, dict) - else None - ), - "epochs": ( - _number_or_none(config.get("epochs")) - if isinstance(config, dict) - else None - ), - "weights": weight_summary, - "diagnostics": diagnostics, - } - ) - return sorted( - runs, - key=lambda row: ( - row.get("actual_l0_data_loss") is None, - row.get("actual_l0_data_loss") or float("inf"), - row.get("artifact_dir") or "", - ), - ) - - -def collect_materialized_policyengine_l0_scores( - artifact_root: str | Path, -) -> list[dict[str, Any]]: - """Read PE local-area L0 materializations scored through broad diagnostics.""" - - artifact_root = Path(artifact_root) 
- scores: list[dict[str, Any]] = [] - for path in sorted( - artifact_root.rglob("pe_local_area_l0_state_stack_vs_legacy_ecps.json") - ): - payload = _read_json(path) - if not isinstance(payload, dict): - continue - summary = payload.get("summary") - if not isinstance(summary, dict): - continue - candidate_loss = _number_or_none(summary.get("to_loss")) - baseline_loss = _number_or_none(summary.get("from_loss")) - if candidate_loss is None or baseline_loss is None: - continue - scores.append( - { - "id": "policyengine_local_area_l0_state_stack", - "label": "PE local-area L0 state stack", - "status": "same_harness_scored_experimental", - "artifact_path": str(path), - "artifact_dir": str(path.parent), - "metric": payload.get("metric") - or "enhanced_cps_native_loss_target_delta", - "metric_runtime": "legacy_or_patched_runtime", - "candidate_loss": candidate_loss, - "baseline_loss": baseline_loss, - "candidate_beats_baseline": candidate_loss < baseline_loss, - "loss_delta": _number_or_none(summary.get("loss_delta")), - "n_targets": _number_or_none(summary.get("n_targets")), - "state_score_count": _number_or_none(payload.get("state_score_count")), - "state_weight_sum": _number_or_none(payload.get("state_weight_sum")), - "notes": ( - "This is an experimental materialized state-stack score. " - "It is a broad same-harness artifact, but it is not the " - "small-L0 or big-L0 weight package unless the source path " - "says so." 
- ), - } - ) - return sorted( - scores, - key=lambda row: ( - row.get("candidate_loss") is None, - row.get("candidate_loss") or float("inf"), - row.get("artifact_path") or "", - ), - ) - - -def collect_recent_log_summaries( - artifact_root: str | Path, *, limit: int = 12 -) -> list[dict[str, Any]]: - """Summarize recent logs with row-batch progress lines.""" - - artifact_root = Path(artifact_root) - paths = sorted( - (path for path in artifact_root.rglob("*.log") if path.is_file()), - key=lambda path: path.stat().st_mtime, - reverse=True, - )[:limit] - summaries = [] - for path in paths: - tail = _tail_text(path) - progress = _parse_row_batch_progress(tail) - summaries.append( - { - "path": str(path), - "modified_at": datetime.fromtimestamp( - path.stat().st_mtime, timezone.utc - ).isoformat(), - "progress": progress, - "last_lines": tail.splitlines()[-5:], - } - ) - return summaries - - -def collect_tmux_sessions() -> list[dict[str, Any]]: - """Return current tmux sessions when tmux is available.""" - - try: - completed = subprocess.run( - ["tmux", "ls"], - check=False, - capture_output=True, - text=True, - timeout=5, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - if completed.returncode != 0: - return [] - sessions = [] - for line in completed.stdout.splitlines(): - if not line.strip(): - continue - name = line.split(":", 1)[0] - if not _is_relevant_tmux_session(name): - continue - sessions.append({"name": name, "raw": line}) - return sorted( - sessions, key=lambda row: (not row["name"].startswith("mp_"), row["name"]) - ) - - -def build_comparison_matrix( - score_runs: list[dict[str, Any]], - local_screens: list[dict[str, Any]], - pe_l0_models: list[dict[str, Any]], - materialized_l0_scores: list[dict[str, Any]] | None = None, -) -> list[dict[str, Any]]: - """Build a compact answer matrix for the current PE comparison question.""" - - materialized_l0_scores = materialized_l0_scores or [] - best_latest = _best_score( - score_runs, - 
predicate=lambda row: ( - row.get("metric_runtime") == "latest_policyengine_us" - and row.get("model_id") == "microplex_current_best" - ), - ) - best_legacy = _best_score( - score_runs, - predicate=lambda row: ( - row.get("metric_runtime") == "legacy_or_patched_runtime" - and row.get("model_id") == "microplex_current_best" - ), - ) - best_local = local_screens[0] if local_screens else None - pe_l0_by_id = {row.get("id"): row for row in pe_l0_models} - - rows: list[dict[str, Any]] = [] - for slot in _PE_MODEL_SLOTS: - row = dict(slot) - if slot["id"] == "policyengine_legacy_ecps" and best_latest is not None: - row.update( - { - "latest_pe_broad_loss": best_latest.get("baseline_loss"), - "latest_pe_status": "available", - "legacy_metric_loss": ( - best_legacy.get("baseline_loss") - if best_legacy is not None - else None - ), - "legacy_metric_status": ( - "available" if best_legacy is not None else "missing" - ), - } - ) - elif slot["id"] in pe_l0_by_id: - model = pe_l0_by_id[slot["id"]] - diagnostics = model.get("diagnostics") or {} - latest_score = _best_model_metric_score( - score_runs, - model_id=str(slot["id"]), - metric_runtime="latest_policyengine_us", - ) - legacy_score = _best_model_metric_score( - score_runs, - model_id=str(slot["id"]), - metric_runtime="legacy_or_patched_runtime", - ) - row.update( - { - "status": ( - "same_harness_scored" - if latest_score is not None or legacy_score is not None - else model.get("status") - ), - "artifact_dir": model.get("artifact_dir"), - "latest_pe_broad_loss": ( - latest_score.get("candidate_loss") - if latest_score is not None - else None - ), - "latest_pe_status": ( - "scored" if latest_score is not None else "missing_h5_score" - ), - "legacy_metric_loss": ( - legacy_score.get("candidate_loss") - if legacy_score is not None - else None - ), - "legacy_metric_status": ( - "scored" if legacy_score is not None else "missing_h5_score" - ), - "pe_local_l0_mean_abs_error_pct": diagnostics.get( - "mean_abs_relative_error_pct" 
- ) - or model.get("mean_error_pct"), - "pe_local_l0_median_abs_error_pct": diagnostics.get( - "median_abs_relative_error_pct" - ), - "pe_local_l0_p90_abs_error_pct": diagnostics.get( - "p90_abs_relative_error_pct" - ), - "pe_local_l0_targets": diagnostics.get("n_targets") - or model.get("n_targets"), - "pe_local_l0_epochs": model.get("epochs"), - "pe_local_l0_weight_nonzero": model.get("weight_nonzero"), - "notes": ( - "Same-harness H5 score is available." - if latest_score is not None or legacy_score is not None - else model.get("notes") or row.get("notes") - ), - } - ) - else: - row.update( - { - "latest_pe_broad_loss": None, - "latest_pe_status": "missing", - "legacy_metric_loss": None, - "legacy_metric_status": "missing", - } - ) - rows.append(row) - - best_materialized_l0 = _best_materialized_l0_score(materialized_l0_scores) - if best_materialized_l0 is not None: - rows.append( - { - "id": best_materialized_l0.get("id"), - "label": best_materialized_l0.get("label"), - "status": best_materialized_l0.get("status"), - "latest_pe_broad_loss": None, - "latest_pe_status": None, - "legacy_metric_loss": best_materialized_l0.get("candidate_loss"), - "legacy_metric_baseline_loss": best_materialized_l0.get( - "baseline_loss" - ), - "legacy_metric_status": ( - "beats_legacy_pe_baseline" - if best_materialized_l0.get("candidate_beats_baseline") - else "worse_than_legacy_pe_baseline" - ), - "artifact_path": best_materialized_l0.get("artifact_path"), - "notes": best_materialized_l0.get("notes"), - } - ) - - rows.append( - { - "id": "microplex_current_best", - "label": "Microplex current best", - "status": "available" if best_latest is not None else "missing", - "latest_pe_broad_loss": ( - best_latest.get("candidate_loss") if best_latest is not None else None - ), - "latest_pe_baseline_loss": ( - best_latest.get("baseline_loss") if best_latest is not None else None - ), - "latest_pe_status": ( - "beats_legacy_pe_baseline" - if best_latest is not None - and 
best_latest.get("candidate_beats_baseline") - else "missing" - ), - "legacy_metric_loss": ( - best_legacy.get("candidate_loss") if best_legacy is not None else None - ), - "legacy_metric_baseline_loss": ( - best_legacy.get("baseline_loss") if best_legacy is not None else None - ), - "legacy_metric_status": ( - "beats_legacy_pe_baseline" - if best_legacy is not None - and best_legacy.get("candidate_beats_baseline") - else "missing" - ), - "local_cd_age_screen_loss": ( - best_local.get("broad_loss") if best_local is not None else None - ), - "local_cd_age_mare": ( - best_local.get("cd_age_mean_abs_relative_error") - if best_local is not None - else None - ), - "artifact_path": ( - best_latest.get("artifact_path") if best_latest is not None else None - ), - "record_count_tier": ( - best_latest.get("record_count_tier") - if best_latest is not None - else None - ), - "release_smoke": ( - best_latest.get("release_smoke") if best_latest is not None else None - ), - "notes": ( - "This is the best completed Microplex score found locally. " - "The CD-age row is a matrix screen until the latest-PE row-batch " - "score finishes." 
- ), - } - ) - return rows - - -def build_apples_to_apples_groups( - score_runs: list[dict[str, Any]], - local_screens: list[dict[str, Any]], - pe_l0_models: list[dict[str, Any]], - materialized_l0_scores: list[dict[str, Any]], -) -> list[dict[str, Any]]: - """Group comparisons that share an actual metric and target universe.""" - - best_latest = _best_score( - score_runs, - predicate=lambda row: ( - row.get("metric_runtime") == "latest_policyengine_us" - and row.get("model_id") == "microplex_current_best" - ), - ) - best_legacy = _best_score( - score_runs, - predicate=lambda row: ( - row.get("metric_runtime") == "legacy_or_patched_runtime" - and row.get("model_id") == "microplex_current_best" - ), - ) - best_local = local_screens[0] if local_screens else None - best_materialized_l0 = _best_materialized_l0_score(materialized_l0_scores) - pe_l0_by_id = {row.get("id"): row for row in pe_l0_models} - - latest_small = _best_model_metric_score( - score_runs, - model_id="policyengine_small_l0", - metric_runtime="latest_policyengine_us", - ) - latest_big = _best_model_metric_score( - score_runs, - model_id="policyengine_big_l0", - metric_runtime="latest_policyengine_us", - ) - legacy_small = _best_model_metric_score( - score_runs, - model_id="policyengine_small_l0", - metric_runtime="legacy_or_patched_runtime", - ) - legacy_big = _best_model_metric_score( - score_runs, - model_id="policyengine_big_l0", - metric_runtime="legacy_or_patched_runtime", - ) - - groups = [ - { - "id": "latest_pe_broad", - "label": "Latest PolicyEngine broad target loss", - "metric_scope": "same_harness_latest_pe_broad", - "status": ( - "complete" - if best_latest and latest_small and latest_big - else "partial" - if best_latest - else "missing" - ), - "rows": [ - _comparison_row( - model_id="policyengine_legacy_ecps", - label="PE legacy enhanced CPS", - score=( - best_latest.get("baseline_loss") - if best_latest is not None - else None - ), - status="scored_baseline" if best_latest else 
"missing", - ), - _comparison_row( - model_id="microplex_current_best", - label="Microplex current best", - score=( - best_latest.get("candidate_loss") - if best_latest is not None - else None - ), - status=( - "scored_candidate_beats_baseline" - if best_latest and best_latest.get("candidate_beats_baseline") - else "missing" - ), - artifact_path=( - best_latest.get("artifact_path") - if best_latest is not None - else None - ), - ), - _scored_or_missing_l0_row( - pe_l0_by_id, - "policyengine_small_l0", - latest_small, - ), - _scored_or_missing_l0_row( - pe_l0_by_id, - "policyengine_big_l0", - latest_big, - ), - ], - }, - { - "id": "legacy_broad", - "label": "Legacy broad target loss", - "metric_scope": "same_harness_legacy_broad", - "status": ( - "complete" - if best_legacy and legacy_small and legacy_big - else "partial" - if best_legacy - else "missing" - ), - "rows": [ - _comparison_row( - model_id="policyengine_legacy_ecps", - label="PE legacy enhanced CPS", - score=( - best_legacy.get("baseline_loss") - if best_legacy is not None - else None - ), - status="scored_baseline" if best_legacy else "missing", - ), - _comparison_row( - model_id="microplex_current_best", - label="Microplex current best", - score=( - best_legacy.get("candidate_loss") - if best_legacy is not None - else None - ), - status=( - "scored_candidate_beats_baseline" - if best_legacy and best_legacy.get("candidate_beats_baseline") - else "missing" - ), - artifact_path=( - best_legacy.get("artifact_path") - if best_legacy is not None - else None - ), - ), - _comparison_row( - model_id="policyengine_local_area_l0_state_stack", - label="PE local-area L0 state stack", - score=( - best_materialized_l0.get("candidate_loss") - if best_materialized_l0 is not None - else None - ), - status=( - best_materialized_l0.get("status") - if best_materialized_l0 is not None - else "missing" - ), - artifact_path=( - best_materialized_l0.get("artifact_path") - if best_materialized_l0 is not None - else None - ), - 
detail=( - "Experimental materialization" - if best_materialized_l0 is not None - else None - ), - ), - _scored_or_missing_l0_row( - pe_l0_by_id, - "policyengine_small_l0", - legacy_small, - ), - _scored_or_missing_l0_row( - pe_l0_by_id, - "policyengine_big_l0", - legacy_big, - ), - ], - }, - { - "id": "pe_local_l0_native", - "label": "PE local-L0 native target diagnostics", - "metric_scope": "pe_native_local_l0_diagnostics", - "status": "native_only", - "rows": [ - _native_pe_l0_row(pe_l0_by_id, "policyengine_small_l0"), - _native_pe_l0_row(pe_l0_by_id, "policyengine_big_l0"), - _comparison_row( - model_id="microplex_cd_age_screen", - label="Microplex CD-age screen", - score=( - 100 * best_local.get("cd_age_mean_abs_relative_error") - if best_local is not None - and best_local.get("cd_age_mean_abs_relative_error") is not None - else None - ), - status=( - "different_target_set_screen_only" - if best_local is not None - else "missing" - ), - artifact_path=( - best_local.get("artifact_path") - if best_local is not None - else None - ), - detail=( - "Displayed for tracking only; not used as a PE local-L0 " - "native comparison." 
- ), - ), - ], - }, - ] - return groups - - -def build_dashboard_assertions( - score_runs: list[dict[str, Any]], - local_screens: list[dict[str, Any]], - pe_l0_models: list[dict[str, Any]], - materialized_l0_scores: list[dict[str, Any]] | None = None, -) -> dict[str, Any]: - """State which comparison claims are supported by completed artifacts.""" - - materialized_l0_scores = materialized_l0_scores or [] - best_latest = _best_score( - score_runs, - predicate=lambda row: ( - row.get("metric_runtime") == "latest_policyengine_us" - and row.get("model_id") == "microplex_current_best" - ), - ) - best_legacy = _best_score( - score_runs, - predicate=lambda row: ( - row.get("metric_runtime") == "legacy_or_patched_runtime" - and row.get("model_id") == "microplex_current_best" - ), - ) - pe_l0_by_id = {row.get("id"): row for row in pe_l0_models} - small_l0_present = ( - pe_l0_by_id.get("policyengine_small_l0", {}).get("status") - == "available_weight_package" - ) - big_l0_present = ( - pe_l0_by_id.get("policyengine_big_l0", {}).get("status") - == "available_weight_package" - ) - best_materialized_l0 = _best_materialized_l0_score(materialized_l0_scores) - small_latest = _best_model_metric_score( - score_runs, - model_id="policyengine_small_l0", - metric_runtime="latest_policyengine_us", - ) - small_legacy = _best_model_metric_score( - score_runs, - model_id="policyengine_small_l0", - metric_runtime="legacy_or_patched_runtime", - ) - big_latest = _best_model_metric_score( - score_runs, - model_id="policyengine_big_l0", - metric_runtime="latest_policyengine_us", - ) - big_legacy = _best_model_metric_score( - score_runs, - model_id="policyengine_big_l0", - metric_runtime="legacy_or_patched_runtime", - ) - small_complete = bool(small_latest and small_legacy) - big_complete = bool(big_latest and big_legacy) - all_models_complete = bool( - best_latest and best_legacy and small_complete and big_complete - ) - best_latest_release_smoke = ( - best_latest.get("release_smoke") if 
isinstance(best_latest, dict) else None - ) - return { - "microplex_beats_legacy_ecps_latest_pe_broad": bool( - best_latest and best_latest.get("candidate_beats_baseline") - ), - "microplex_beats_legacy_ecps_legacy_metric": bool( - best_legacy and best_legacy.get("candidate_beats_baseline") - ), - "microplex_current_best_has_release_smoke": bool(best_latest_release_smoke), - "microplex_current_best_release_smoke_passes": bool( - isinstance(best_latest_release_smoke, dict) - and best_latest_release_smoke.get("passes_file_size_ratio_2x") - and best_latest_release_smoke.get("passes_runtime_ratio_1_25x") - ), - "microplex_vs_small_l0_complete": small_complete, - "microplex_vs_big_l0_complete": big_complete, - "microplex_vs_all_three_pe_models_on_both_metrics": all_models_complete, - "policyengine_small_l0_weight_package_available": small_l0_present, - "policyengine_big_l0_weight_package_available": big_l0_present, - "policyengine_materialized_l0_same_harness_available": bool( - best_materialized_l0 - ), - "local_cd_age_screen_available": bool(local_screens), - "apples_to_apples_groups_available": True, - "caveat": ( - "Small-L0 and big-L0 PE weight packages are wired into the run " - "board when available. The all-three-PE-model claim is supported " - "only when both materialized PE L0 packages have legacy and latest " - "same-harness scores." 
- ), - } - - -def _comparison_row( - *, - model_id: str, - label: str, - score: float | None, - status: str, - artifact_path: str | None = None, - detail: str | None = None, -) -> dict[str, Any]: - return { - "model_id": model_id, - "label": label, - "score": _number_or_none(score), - "status": status, - "artifact_path": artifact_path, - "detail": detail, - } - - -def _missing_h5_row( - pe_l0_by_id: dict[str, dict[str, Any]], model_id: str -) -> dict[str, Any]: - model = pe_l0_by_id.get(model_id) or {} - materialization = model.get("same_harness_materialization") - blocker = None - if isinstance(materialization, dict): - blocker = materialization.get("status") - return _comparison_row( - model_id=model_id, - label=str(model.get("label") or model_id), - score=None, - status="missing_same_harness_h5_score", - artifact_path=model.get("artifact_dir"), - detail=blocker, - ) - - -def _scored_or_missing_l0_row( - pe_l0_by_id: dict[str, dict[str, Any]], - model_id: str, - score: dict[str, Any] | None, -) -> dict[str, Any]: - if score is None: - return _missing_h5_row(pe_l0_by_id, model_id) - model = pe_l0_by_id.get(model_id) or {} - return _comparison_row( - model_id=model_id, - label=str(model.get("label") or model_id), - score=score.get("candidate_loss"), - status=( - "scored_candidate_beats_legacy_ecps" - if score.get("candidate_beats_baseline") - else "scored_candidate_worse_than_legacy_ecps" - ), - artifact_path=score.get("artifact_path"), - detail=( - f"{int(score['n_targets_kept']):,} targets" - if _number_or_none(score.get("n_targets_kept")) is not None - else None - ), - ) - - -def _native_pe_l0_row( - pe_l0_by_id: dict[str, dict[str, Any]], model_id: str -) -> dict[str, Any]: - model = pe_l0_by_id.get(model_id) or {} - diagnostics = model.get("diagnostics") or {} - score = diagnostics.get("mean_abs_relative_error_pct") or model.get( - "mean_error_pct" - ) - targets = diagnostics.get("n_targets") or model.get("n_targets") - return _comparison_row( - 
model_id=model_id, - label=str(model.get("label") or model_id), - score=_number_or_none(score), - status=( - "native_diagnostics_available" - if _number_or_none(score) is not None - else "missing_native_diagnostics" - ), - artifact_path=model.get("diagnostics_path") or model.get("artifact_dir"), - detail=( - f"{format(int(targets), ',')} PE-local targets" - if _number_or_none(targets) is not None - else None - ), - ) - - -def _best_materialized_l0_score( - rows: list[dict[str, Any]], -) -> dict[str, Any] | None: - candidates = [ - row for row in rows if _number_or_none(row.get("candidate_loss")) is not None - ] - if not candidates: - return None - return min(candidates, key=lambda row: row["candidate_loss"]) - - -def _weight_file_summary(path: Path) -> dict[str, Any] | None: - if not path.exists(): - return None - try: - import numpy as np - - weights = np.asarray(np.load(path), dtype=float) - except Exception: # pragma: no cover - defensive artifact read - return {"status": "unreadable", "path": str(path)} - return { - "status": "ok", - "path": str(path), - "records": int(weights.size), - "nonzero": int((weights > 0.0).sum()), - "greater_than_1": int((weights > 1.0).sum()), - "greater_than_100": int((weights > 100.0).sum()), - "sum": float(weights.sum()), - } - - -def _infer_actual_l0_model_id(run_dir: Path) -> str: - text = str(run_dir).lower() - if "microplex" in text or "mp_" in text: - return "microplex_actual_l0" - if "local_net_worth_100_e300" in text: - return "policyengine_big_l0" - if "local_net_worth_100" in text: - return "policyengine_small_l0" - return "unknown_actual_l0" - - -def _inspect_l0_materialization( - *, - model_dir: Path, - config: Any, - weights_path: Path, -) -> dict[str, Any]: - """Return a cheap compatibility check for materializing a PE-L0 package.""" - - result: dict[str, Any] = {"status": "unknown"} - if not weights_path.exists(): - result["status"] = "missing_weights" - return result - - try: - import numpy as np - - weights = 
np.load(weights_path, mmap_mode="r") - weight_count = int(weights.shape[0]) - result["weight_count"] = weight_count - except Exception as error: # pragma: no cover - defensive artifact read - result["status"] = "weights_unreadable" - result["error"] = str(error) - return result - - geography_path = model_dir / "geography.npz" - if geography_path.exists(): - try: - import numpy as np - - with np.load(geography_path, allow_pickle=True) as geography: - if "block_geoid" in geography: - result["geography_row_count"] = int( - geography["block_geoid"].shape[0] - ) - if "n_records" in geography: - result["geography_n_records"] = int(geography["n_records"][0]) - if "n_clones" in geography: - result["geography_n_clones"] = int(geography["n_clones"][0]) - except Exception as error: # pragma: no cover - defensive artifact read - result["geography_error"] = str(error) - - dataset_path = None - if isinstance(config, dict) and config.get("dataset"): - dataset_path = Path(str(config["dataset"])) - result["dataset_path"] = str(dataset_path) - if dataset_path is None or not dataset_path.exists(): - result["status"] = "source_h5_missing" - return result - - household_count = _h5_period_length(dataset_path, "household_id") - result["source_household_count"] = household_count - if household_count is None: - result["status"] = "source_h5_unreadable" - return result - - if household_count > 0 and weight_count % household_count == 0: - result["status"] = "materializable_against_current_source_h5" - result["implied_clone_count"] = weight_count // household_count - else: - result["status"] = "incompatible_current_source_h5" - result["detail"] = ( - "Weight count is not divisible by the current source H5 household " - "count; same-harness scoring needs the matching source dataset or " - "a regenerated L0 package." 
- ) - return result - - -def _h5_period_length(path: Path, variable: str) -> int | None: - try: - import h5py - - with h5py.File(path, "r") as handle: - if variable not in handle: - return None - obj = handle[variable] - if hasattr(obj, "keys"): - keys = list(obj.keys()) - if not keys: - return None - return int(obj[keys[0]].shape[0]) - return int(obj.shape[0]) - except Exception: # pragma: no cover - defensive artifact read - return None - - -def write_dashboard_payload( - output_path: str | Path = _DEFAULT_OUTPUT_PATH, - *, - artifact_root: str | Path = _DEFAULT_ARTIFACT_ROOT, - target_diagnostics_path: str | Path = _DEFAULT_TARGET_DIAGNOSTICS_PATH, - policyengine_us_data_repo: str | Path | None = _DEFAULT_POLICYENGINE_US_DATA_REPO, - include_tmux: bool = True, -) -> Path: - """Write the living dashboard JSON payload.""" - - payload = build_dashboard_payload( - artifact_root=artifact_root, - target_diagnostics_path=target_diagnostics_path, - policyengine_us_data_repo=policyengine_us_data_repo, - include_tmux=include_tmux, - ) - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return output_path - - -def _iter_score_paths(artifact_root: Path) -> list[Path]: - native_scores_hint = get_us_stage_artifact_contract( - "09_validation_benchmarking", - "policyengine_native_scores", - ).path_hint - paths = list(artifact_root.rglob("scores.json")) - if native_scores_hint is not None: - paths.extend(artifact_root.rglob(native_scores_hint)) - paths.extend(artifact_root.rglob("*_score.json")) - return [path for path in paths if path.is_file()] - - -def _score_entries_from_payload(path: Path, payload: Any) -> list[dict[str, Any]]: - if isinstance(payload, list): - raw_entries = payload - elif isinstance(payload, dict) and "broad_loss" in payload: - raw_entries = [payload] - elif isinstance(payload, dict) and "summary" in payload: - raw_entries = [payload] - elif 
isinstance(payload, dict) and "candidate_enhanced_cps_native_loss" in payload: - raw_entries = [payload] - else: - return [] - - entries = [] - for index, item in enumerate(raw_entries): - if not isinstance(item, dict): - continue - if "candidate_enhanced_cps_native_loss" in item: - summary = item - broad_loss = item - else: - summary = ( - item.get("summary") if isinstance(item.get("summary"), dict) else {} - ) - broad_loss = ( - item.get("broad_loss") - if isinstance(item.get("broad_loss"), dict) - else {} - ) - candidate_loss = _number_or_none( - summary.get("candidate_enhanced_cps_native_loss") - ) - baseline_loss = _number_or_none( - summary.get("baseline_enhanced_cps_native_loss") - ) - if candidate_loss is None or baseline_loss is None: - continue - candidate_dataset = broad_loss.get("candidate_dataset") - baseline_dataset = broad_loss.get("baseline_dataset") - metric_runtime = _infer_metric_runtime(path, summary) - model_id = _infer_score_model_id(path, candidate_dataset) - label = _score_label(path, candidate_dataset, index) - release_smoke = _release_smoke_summary(path.parent) - record_count_tier = _infer_record_count_tier( - path, candidate_dataset - ) or _infer_record_count_tier_from_release_smoke(release_smoke) - entries.append( - { - "label": label, - "model_id": model_id, - "record_count_tier": record_count_tier, - "artifact_path": str(path), - "artifact_dir": str(path.parent), - "entry_index": index, - "metric": item.get("metric") or "pe_native_broad_loss", - "metric_runtime": metric_runtime, - "period": item.get("period") or summary.get("period") or 2024, - "candidate_dataset": candidate_dataset, - "baseline_dataset": baseline_dataset, - "candidate_loss": candidate_loss, - "baseline_loss": baseline_loss, - "loss_delta": _number_or_none( - summary.get("enhanced_cps_native_loss_delta") - ), - "candidate_beats_baseline": _candidate_beats_baseline( - summary, - candidate_loss, - baseline_loss, - ), - "candidate_unweighted_msre": _number_or_none( - 
summary.get("candidate_unweighted_msre") - ), - "baseline_unweighted_msre": _number_or_none( - summary.get("baseline_unweighted_msre") - ), - "n_targets_kept": _number_or_none(summary.get("n_targets_kept")), - "n_targets_total": _number_or_none(summary.get("n_targets_total")), - "candidate_weight_sum": _number_or_none( - broad_loss.get("candidate_weight_sum") - ), - "baseline_weight_sum": _number_or_none( - broad_loss.get("baseline_weight_sum") - ), - "release_smoke": release_smoke, - "source_kind": "scores_json", - } - ) - return entries - - -def _release_smoke_summary(artifact_dir: Path) -> dict[str, Any] | None: - """Read colocated lightweight release gate smoke output when present.""" - - path = artifact_dir / "runtime_smoke_loader.json" - payload = _read_json(path) - if not isinstance(payload, dict): - return None - - candidate = payload.get("candidate") - baseline = payload.get("baseline") - if not isinstance(candidate, dict) or not isinstance(baseline, dict): - return None - - runtime_ratio = _number_or_none( - payload.get("median_runtime_ratio") or payload.get("runtime_ratio") - ) - file_size_ratio = _number_or_none(payload.get("file_size_ratio")) - household_ratio = _number_or_none(payload.get("household_ratio")) - return { - "artifact_path": str(path), - "benchmark": payload.get("benchmark"), - "candidate_households": _number_or_none(candidate.get("households")), - "baseline_households": _number_or_none(baseline.get("households")), - "household_ratio": household_ratio, - "candidate_file_size_bytes": _number_or_none(candidate.get("file_size_bytes")), - "baseline_file_size_bytes": _number_or_none(baseline.get("file_size_bytes")), - "file_size_ratio": file_size_ratio, - "median_runtime_ratio": runtime_ratio, - "candidate_median_elapsed_seconds": _number_or_none( - candidate.get("median_elapsed_seconds") or candidate.get("elapsed_seconds") - ), - "baseline_median_elapsed_seconds": _number_or_none( - baseline.get("median_elapsed_seconds") or 
baseline.get("elapsed_seconds") - ), - "raw_candidate_household_weight_sum": _number_or_none( - candidate.get("raw_household_weight_sum") - ), - "raw_baseline_household_weight_sum": _number_or_none( - baseline.get("raw_household_weight_sum") - ), - "passes_file_size_ratio_2x": ( - None if file_size_ratio is None else file_size_ratio <= 2.0 - ), - "passes_runtime_ratio_1_25x": ( - None if runtime_ratio is None else runtime_ratio <= 1.25 - ), - } - - -def _candidate_beats_baseline( - summary: dict[str, Any], - candidate_loss: float, - baseline_loss: float, -) -> bool: - raw_value = summary.get("candidate_beats_baseline") - if isinstance(raw_value, bool): - return raw_value - if raw_value is None: - return candidate_loss < baseline_loss - if isinstance(raw_value, str): - lowered = raw_value.strip().lower() - if lowered in {"true", "1", "yes"}: - return True - if lowered in {"false", "0", "no"}: - return False - return bool(raw_value) - - -def _summarize_unified_diagnostics(path: Path) -> dict[str, Any] | None: - try: - with path.open(newline="") as file: - rows = list(csv.DictReader(file)) - except OSError: - return None - if not rows: - return None - - abs_errors = [] - actual_l0_abs_errors = [] - actual_l0_squared_errors = [] - achievable_count = 0 - for row in rows: - if str(row.get("achievable", "")).lower() == "true": - achievable_count += 1 - error = _number_or_none(row.get("abs_rel_error")) - if error is not None: - abs_errors.append(error) - estimate = _number_or_none(row.get("estimate")) - true_value = _number_or_none(row.get("true_value")) - if estimate is not None and true_value is not None: - actual_error = (estimate - true_value) / (true_value + 1.0) - actual_l0_abs_errors.append(abs(actual_error)) - actual_l0_squared_errors.append(actual_error * actual_error) - - sorted_errors = sorted(abs_errors) - return { - "n_targets": len(rows), - "n_achievable": achievable_count, - "actual_l0_objective": ("sum(((estimate - target) / (target + 1)) ** 2)"), - 
"actual_l0_data_loss": ( - sum(actual_l0_squared_errors) if actual_l0_squared_errors else None - ), - "actual_l0_mean_abs_relative_error_pct": ( - 100 * sum(actual_l0_abs_errors) / len(actual_l0_abs_errors) - if actual_l0_abs_errors - else None - ), - "mean_abs_relative_error_pct": ( - 100 * sum(abs_errors) / len(abs_errors) if abs_errors else None - ), - "median_abs_relative_error_pct": _percentile(sorted_errors, 0.5), - "p90_abs_relative_error_pct": _percentile(sorted_errors, 0.9), - "p99_abs_relative_error_pct": _percentile(sorted_errors, 0.99), - "max_abs_relative_error_pct": ( - 100 * sorted_errors[-1] if sorted_errors else None - ), - "share_under_10pct": _share_under(abs_errors, 0.10), - "share_under_25pct": _share_under(abs_errors, 0.25), - } - - -def _score_label(path: Path, candidate_dataset: Any, index: int) -> str: - artifact = path.parent.name - if isinstance(candidate_dataset, str): - dataset_name = Path(candidate_dataset).name - policyengine_dataset_hint = get_us_stage_artifact_contract( - "08_dataset_assembly", - "policyengine_dataset", - ).path_hint - if policyengine_dataset_hint is None or dataset_name != Path( - policyengine_dataset_hint - ).name: - return f"{artifact} / {dataset_name}" - if index: - return f"{artifact} / candidate {index + 1}" - return artifact - - -def _infer_metric_runtime(path: Path, summary: dict[str, Any]) -> str: - text = str(path).lower() - n_targets = _number_or_none(summary.get("n_targets_kept")) - baseline_loss = _number_or_none(summary.get("baseline_enhanced_cps_native_loss")) - if "legacy_targets" in text: - return "legacy_or_patched_runtime" - if "new_targets" in text: - return "latest_policyengine_us" - if n_targets == 2805 and baseline_loss == 0.09774356788921322: - return "legacy_or_patched_runtime" - if ( - "latest_us_data" in text - or n_targets in {2814, 2818} - or (baseline_loss is not None and baseline_loss > 0.15) - ): - return "latest_policyengine_us" - return "legacy_or_patched_runtime" - - -def 
_infer_score_model_id(path: Path, candidate_dataset: Any) -> str: - text_parts = [str(path).lower()] - if isinstance(candidate_dataset, str): - text_parts.append(candidate_dataset.lower()) - text_parts.append(Path(candidate_dataset).name.lower()) - text = " ".join(text_parts) - if "pe_small_l0" in text or "local_net_worth_100/" in text: - return "policyengine_small_l0" - if "pe_big_l0" in text or "local_net_worth_100_e300" in text: - return "policyengine_big_l0" - if "policyengine_local_area_l0" in text or "state_stack" in text: - return "policyengine_local_area_l0_state_stack" - return "microplex_current_best" - - -def _infer_record_count_tier(path: Path, candidate_dataset: Any) -> str | None: - """Infer product-style record-count tier labels such as ``mp-120k``.""" - - text_parts = [str(path).lower()] - if isinstance(candidate_dataset, str): - text_parts.append(candidate_dataset.lower()) - text_parts.append(Path(candidate_dataset).name.lower()) - text = " ".join(text_parts) - match = re.search(r"\bmp[-_]?(\d+(?:k|m))(?:\b|_)", text) - if match: - return f"mp-{match.group(1)}" - return None - - -def _infer_record_count_tier_from_release_smoke( - release_smoke: dict[str, Any] | None, -) -> str | None: - """Infer a product-style tier from measured household rows when available.""" - - if not isinstance(release_smoke, dict): - return None - households = _number_or_none(release_smoke.get("candidate_households")) - if households is None or households <= 0: - return None - if households >= 1_000_000: - return f"mp-{households / 1_000_000:.1f}m".replace(".0m", "m") - return f"mp-{round(households / 1_000)}k" - - -def _percentile(sorted_values: list[float], quantile: float) -> float | None: - if not sorted_values: - return None - if len(sorted_values) == 1: - return 100 * sorted_values[0] - position = quantile * (len(sorted_values) - 1) - lower = int(position) - upper = min(lower + 1, len(sorted_values) - 1) - weight = position - lower - return 100 * (sorted_values[lower] 
* (1 - weight) + sorted_values[upper] * weight) - - -def _share_under(values: list[float], threshold: float) -> float | None: - if not values: - return None - return sum(value < threshold for value in values) / len(values) - - -def _best_score(rows: list[dict[str, Any]], *, predicate: Any) -> dict[str, Any] | None: - candidates = [ - row - for row in rows - if predicate(row) and _number_or_none(row.get("candidate_loss")) is not None - ] - if not candidates: - return None - return min(candidates, key=lambda row: row["candidate_loss"]) - - -def _best_model_metric_score( - rows: list[dict[str, Any]], - *, - model_id: str, - metric_runtime: str, -) -> dict[str, Any] | None: - return _best_score( - rows, - predicate=lambda row: ( - row.get("model_id") == model_id - and row.get("metric_runtime") == metric_runtime - ), - ) - - -def _parse_row_batch_progress(text: str) -> dict[str, Any] | None: - pattern = re.compile( - r"PE-native row batch (?P[^:]+): " - r"(?P\d+)/(?P\d+) households " - r"\((?P[0-9.]+)s\)" - ) - matches = list(pattern.finditer(text)) - if not matches: - return None - match = matches[-1] - done = int(match.group("done")) - total = int(match.group("total")) - return { - "dataset": match.group("dataset"), - "households_done": done, - "households_total": total, - "fraction": done / total if total else None, - "elapsed_seconds": float(match.group("elapsed")), - } - - -def _is_relevant_tmux_session(name: str) -> bool: - lowered = name.lower() - return ( - lowered.startswith("mp_") - or "microplex" in lowered - or lowered.startswith("dashboard") - ) - - -def _tail_text(path: Path, max_bytes: int = 8192) -> str: - try: - with path.open("rb") as file: - file.seek(0, 2) - size = file.tell() - file.seek(max(size - max_bytes, 0)) - return file.read().decode("utf-8", errors="replace") - except OSError: - return "" - - -def _read_json(path: Path) -> Any | None: - try: - return json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return None - - 
-def _number_or_none(value: Any) -> float | None: - try: - number = float(value) - except (TypeError, ValueError): - return None - if number != number: - return None - return number - - -def main(argv: list[str] | None = None) -> int: - """CLI for the living Microplex dashboard payload.""" - - parser = argparse.ArgumentParser( - description="Build the living Microplex diagnostic dashboard JSON." - ) - parser.add_argument("--artifact-root", default=str(_DEFAULT_ARTIFACT_ROOT)) - parser.add_argument( - "--target-diagnostics-path", - default=str(_DEFAULT_TARGET_DIAGNOSTICS_PATH), - help="Existing per-target diagnostics JSON to embed when available.", - ) - parser.add_argument( - "--policyengine-us-data-repo", - default=str(_DEFAULT_POLICYENGINE_US_DATA_REPO), - help=( - "Local policyengine-us-data checkout used to discover PE local-L0 " - "weight packages. Pass an empty string to skip discovery." - ), - ) - parser.add_argument("--output-path", default=str(_DEFAULT_OUTPUT_PATH)) - parser.add_argument( - "--no-tmux", - action="store_true", - help="Skip tmux session discovery for deterministic tests.", - ) - args = parser.parse_args(argv) - output = write_dashboard_payload( - args.output_path, - artifact_root=args.artifact_root, - target_diagnostics_path=args.target_diagnostics_path, - policyengine_us_data_repo=args.policyengine_us_data_repo or None, - include_tmux=not args.no_tmux, - ) - print(output) - return 0 diff --git a/src/microplex_us/pipelines/data_flow_snapshot.py b/src/microplex_us/pipelines/data_flow_snapshot.py deleted file mode 100644 index ca783742..00000000 --- a/src/microplex_us/pipelines/data_flow_snapshot.py +++ /dev/null @@ -1,914 +0,0 @@ -"""Artifact-backed data-flow snapshot helpers for the US microplex pipeline.""" - -from __future__ import annotations - -import json -from collections.abc import Iterable -from dataclasses import replace -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - 
resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest import ( - build_us_stage_manifest, - load_us_stage_manifest, - stage_summary_for_data_flow_snapshot, -) -from microplex_us.variables import ( - donor_imputation_block_specs, - variable_semantic_spec_for, -) - -DATA_FLOW_SNAPSHOT_SCHEMA_VERSION = 1 - - -def build_us_microplex_data_flow_snapshot( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - prefer_saved: bool = True, -) -> dict[str, Any]: - """Build one site-facing US data-flow snapshot from a saved artifact bundle.""" - artifact_root = Path(artifact_dir) - if prefer_saved and manifest_payload is None: - saved_snapshot = _load_saved_data_flow_snapshot(artifact_root) - if saved_snapshot is not None: - return saved_snapshot - - return _materialize_us_microplex_data_flow_snapshot( - artifact_root, - manifest_payload=manifest_payload, - ) - - -def require_saved_us_microplex_data_flow_snapshot( - artifact_dir: str | Path, -) -> dict[str, Any]: - """Load the saved canonical US data-flow snapshot or raise.""" - artifact_root = Path(artifact_dir) - snapshot_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ) - if not snapshot_path.exists(): - raise FileNotFoundError( - f"US artifact bundle is missing data_flow_snapshot.json: {snapshot_path}" - ) - snapshot = json.loads(snapshot_path.read_text()) - if snapshot.get("schemaVersion") != DATA_FLOW_SNAPSHOT_SCHEMA_VERSION: - raise RuntimeError( - "US artifact bundle has a stale or unsupported data_flow_snapshot.json " - f"schema: {snapshot.get('schemaVersion')!r}" - ) - return snapshot - - -def write_us_microplex_data_flow_snapshot( - artifact_dir: str | Path, - output_path: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - assume_existing_stage_artifact_keys: Iterable[str] = (), -) -> Path: - """Write the canonical US data-flow snapshot JSON for one saved artifact 
bundle.""" - snapshot = _materialize_us_microplex_data_flow_snapshot( - artifact_dir, - manifest_payload=manifest_payload, - prefer_saved_stage_manifest=False, - assume_existing_stage_artifact_keys=( - *tuple(assume_existing_stage_artifact_keys), - "data_flow_snapshot", - ), - ) - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - _write_json_atomically(destination, snapshot) - return destination - - -def _materialize_us_microplex_data_flow_snapshot( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - prefer_saved_stage_manifest: bool = True, - assume_existing_stage_artifact_keys: Iterable[str] = (), -) -> dict[str, Any]: - artifact_root = Path(artifact_dir) - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - synthesis = dict(manifest.get("synthesis", {})) - config = dict(manifest.get("config", {})) - stage_manifest = _resolve_data_flow_stage_manifest( - artifact_root, - manifest_payload=manifest, - prefer_saved=prefer_saved_stage_manifest, - assume_existing_artifact_keys=assume_existing_stage_artifact_keys, - ) - - source_names = tuple( - dict.fromkeys( - value - for value in ( - *list(synthesis.get("source_names", ())), - synthesis.get("scaffold_source"), - ) - if isinstance(value, str) and value - ) - ) - source_entries = [ - _source_snapshot_entry(source_name) - for source_name in source_names - ] - resolved_descriptors = [ - entry["descriptor"] - for entry in source_entries - if entry["descriptor"] is not None - ] - fusion_plan = ( - _build_fusion_plan(resolved_descriptors) - if resolved_descriptors - else None - ) - - donor_integrated_variables = tuple( - variable - for variable in synthesis.get("donor_integrated_variables", ()) - if isinstance(variable, str) - ) - semantic_variables = tuple( - dict.fromkeys( - [ - *donor_integrated_variables, - *[ - variable - for variable in 
synthesis.get("condition_vars", ()) - if isinstance(variable, str) - ], - *[ - variable - for variable in synthesis.get("target_vars", ()) - if isinstance(variable, str) - ], - *[ - variable - for variable in synthesis.get( - "donor_authoritative_override_variables", - (), - ) - if isinstance(variable, str) - ], - ] - ) - ) - - data_flow_snapshot = { - "schemaVersion": DATA_FLOW_SNAPSHOT_SCHEMA_VERSION, - "generatedAt": manifest.get("created_at"), - "coverageMode": "artifact_frozen", - "runtime": { - "sourceNames": list(source_names), - "scaffoldSource": synthesis.get("scaffold_source"), - "nSynthetic": config.get("n_synthetic"), - "rows": { - key: manifest.get("rows", {}).get(key) - for key in ("seed", "synthetic", "calibrated") - }, - "synthesisBackend": synthesis.get("backend"), - "conditionVars": list(synthesis.get("condition_vars", ())), - "targetVars": list(synthesis.get("target_vars", ())), - "donorIntegratedVariables": list(donor_integrated_variables), - "donorExcludedVariables": list( - synthesis.get("donor_excluded_variables", ()) - ), - "donorAuthoritativeOverrideVariables": list( - synthesis.get("donor_authoritative_override_variables", ()) - ), - "supportProxies": dict( - synthesis.get( - "state_program_support_proxies", - {"available": [], "missing": []}, - ) - ), - }, - "sources": [ - _serialize_source_snapshot_entry(entry) - for entry in source_entries - ], - "sharedCoverage": _build_shared_coverage_summary(fusion_plan, source_entries), - "donorBlocks": _build_donor_block_summary(donor_integrated_variables), - "semanticHighlights": _build_semantic_highlights(semantic_variables), - "stages": stage_summary_for_data_flow_snapshot(stage_manifest), - } - return data_flow_snapshot - - -def _load_saved_data_flow_snapshot(artifact_root: Path) -> dict[str, Any] | None: - snapshot_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ) - if not snapshot_path.exists(): - return None - snapshot = 
json.loads(snapshot_path.read_text()) - if snapshot.get("schemaVersion") != DATA_FLOW_SNAPSHOT_SCHEMA_VERSION: - return None - return snapshot - - -def _resolve_data_flow_stage_manifest( - artifact_root: Path, - *, - manifest_payload: dict[str, Any], - prefer_saved: bool, - assume_existing_artifact_keys: Iterable[str], -) -> dict[str, Any]: - artifacts = dict(manifest_payload.get("artifacts", {})) - stage_manifest_name = artifacts.get("stage_manifest") - if prefer_saved and stage_manifest_name: - stage_manifest_path = Path(stage_manifest_name) - if not stage_manifest_path.is_absolute(): - stage_manifest_path = artifact_root / stage_manifest_path - if stage_manifest_path.exists(): - return load_us_stage_manifest(stage_manifest_path) - return build_us_stage_manifest( - artifact_root, - manifest_payload=manifest_payload, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - - -def _source_snapshot_entry(source_name: str) -> dict[str, Any]: - fallback_entry = _fallback_source_snapshot_entry(source_name) - if fallback_entry is None: - fallback_entry = { - "name": source_name, - "descriptor": None, - "manifestName": None, - "notes": [], - } - try: - return _runtime_source_snapshot_entry(source_name) - except ImportError: - return fallback_entry - - -def _runtime_source_snapshot_entry(source_name: str) -> dict[str, Any]: - from microplex_us.data_sources.cps import ( - CPSASECParquetSourceProvider, - CPSASECSourceProvider, - ) - from microplex_us.data_sources.psid import PSIDSourceProvider - from microplex_us.data_sources.puf import PUFSourceProvider - - descriptor: Any | None = None - manifest_name: str | None = None - notes: list[str] = [] - - if source_name == "cps_asec_parquet": - descriptor = replace( - CPSASECParquetSourceProvider(data_dir=".").descriptor, - name=source_name, - ) - notes.append( - "This source was loaded from split household/person parquet files rather than " - "the Census download path." 
- ) - elif source_name == "cps_asec" or source_name.startswith("cps_asec_"): - descriptor = replace(CPSASECSourceProvider(download=False).descriptor, name=source_name) - notes.append( - "CPS coverage expands at load time from processed household and person tables; " - "the static provider descriptor intentionally stays minimal until a frame is materialized." - ) - elif source_name.startswith("irs_soi_puf"): - descriptor = replace(PUFSourceProvider().descriptor, name=source_name) - manifest_name = "puf" - notes.append( - "PUF is manifest-backed, so raw-to-canonical tax mappings are available even " - "without loading the microdata file." - ) - elif source_name.startswith("psid"): - descriptor = replace(PSIDSourceProvider(data_dir=".").descriptor, name=source_name) - notes.append( - "PSID is panel-backed and enters the US build as an optional donor family." - ) - - return { - "name": source_name, - "descriptor": descriptor, - "manifestName": manifest_name, - "notes": notes, - } - - -def _fallback_source_snapshot_entry(source_name: str) -> dict[str, Any] | None: - notes: list[str] = [] - if source_name == "cps_asec_parquet": - notes.append( - "This source was loaded from split household/person parquet files rather than " - "the Census download path." - ) - serialized = _fallback_cps_source_snapshot(source_name, notes) - elif source_name == "cps_asec" or source_name.startswith("cps_asec_"): - notes.append( - "CPS coverage expands at load time from processed household and person tables; " - "the static provider descriptor intentionally stays minimal until a frame is materialized." - ) - serialized = _fallback_cps_source_snapshot(source_name, notes) - elif source_name.startswith("irs_soi_puf"): - notes.append( - "PUF is manifest-backed, so raw-to-canonical tax mappings are available even " - "without loading the microdata file." 
- ) - serialized = _fallback_puf_source_snapshot(source_name, notes) - elif source_name.startswith("psid"): - notes.append( - "PSID is panel-backed and enters the US build as an optional donor family." - ) - serialized = _fallback_psid_source_snapshot(source_name, notes) - else: - return None - - return { - "name": source_name, - "descriptor": None, - "manifestName": serialized.get("manifestName"), - "notes": notes, - "serialized": serialized, - } - - -def _fallback_cps_source_snapshot( - source_name: str, - notes: list[str], -) -> dict[str, Any]: - return { - "name": source_name, - "resolved": True, - "shareability": "public", - "timeStructure": "repeated_cross_section", - "archetype": "household_income", - "population": None, - "description": None, - "manifestName": None, - "manifestBacked": False, - "observationCount": 2, - "observations": [ - { - "entity": "household", - "keyColumn": "household_id", - "weightColumn": "household_weight", - "periodColumn": None, - "variableCount": 1, - "sampleVariables": ["state_fips"], - }, - { - "entity": "person", - "keyColumn": "person_id", - "weightColumn": "weight", - "periodColumn": None, - "variableCount": 1, - "sampleVariables": ["age"], - }, - ], - "capabilitySummary": { - "authoritativeVariableCount": 2, - "conditionableVariableCount": 2, - "authoritativeOnlyVariables": [], - "nonConditionableVariables": [], - }, - "manifestMappings": None, - "notes": list(notes), - } - - -def _fallback_puf_source_snapshot( - source_name: str, - notes: list[str], -) -> dict[str, Any]: - return { - "name": source_name, - "resolved": True, - "shareability": "public", - "timeStructure": "repeated_cross_section", - "archetype": "tax_microdata", - "population": "US tax units", - "description": "IRS SOI Public Use File tax-unit mappings", - "manifestName": "puf", - "manifestBacked": True, - "observationCount": 2, - "observations": [ - { - "entity": "household", - "keyColumn": "household_id", - "weightColumn": "household_weight", - 
"periodColumn": "year", - "variableCount": 2, - "sampleVariables": ["state_fips", "tenure"], - }, - { - "entity": "person", - "keyColumn": "person_id", - "weightColumn": "weight", - "periodColumn": "year", - "variableCount": 3, - "sampleVariables": ["age", "sex", "income"], - }, - ], - "capabilitySummary": { - "authoritativeVariableCount": 2, - "conditionableVariableCount": 2, - "authoritativeOnlyVariables": [], - "nonConditionableVariables": ["income", "state_fips", "tenure"], - }, - "manifestMappings": _fallback_manifest_mappings("puf"), - "notes": list(notes), - } - - -def _fallback_psid_source_snapshot( - source_name: str, - notes: list[str], -) -> dict[str, Any]: - return { - "name": source_name, - "resolved": True, - "shareability": "restricted", - "timeStructure": "panel", - "archetype": "longitudinal_household", - "population": None, - "description": None, - "manifestName": None, - "manifestBacked": False, - "observationCount": 0, - "observations": [], - "capabilitySummary": { - "authoritativeVariableCount": 0, - "conditionableVariableCount": 0, - "authoritativeOnlyVariables": [], - "nonConditionableVariables": [], - }, - "manifestMappings": None, - "notes": list(notes), - } - - -def _fallback_manifest_mappings(manifest_name: str) -> dict[str, Any] | None: - manifest_path = Path(__file__).resolve().parents[1] / "manifests" / f"{manifest_name}.json" - if not manifest_path.exists(): - return None - payload = json.loads(manifest_path.read_text()) - observations = list(payload.get("observations", ())) - sample_mappings: list[dict[str, str]] = [] - mapped_column_count = 0 - for observation in observations: - columns = list(observation.get("columns", ())) - mapped_column_count += len(columns) - for column in columns: - if len(sample_mappings) >= 8: - break - sample_mappings.append( - { - "entity": str(observation.get("entity")), - "rawColumn": str(column.get("raw_column")), - "canonicalName": str(column.get("canonical_name")), - } - ) - return { - 
"observationCount": len(observations), - "mappedColumnCount": mapped_column_count, - "sampleMappings": sample_mappings, - } - - -def _serialize_source_snapshot_entry(entry: dict[str, Any]) -> dict[str, Any]: - serialized = entry.get("serialized") - if isinstance(serialized, dict): - return serialized - - descriptor = entry["descriptor"] - manifest_name = entry["manifestName"] - manifest = _load_runtime_source_manifest(manifest_name) - - if descriptor is None: - return { - "name": entry["name"], - "resolved": False, - "notes": list(entry["notes"]), - } - - variable_names = sorted(descriptor.all_variable_names) - authoritative_only = [ - variable - for variable in variable_names - if descriptor.is_authoritative_for(variable) - and not descriptor.allows_conditioning_on(variable) - ] - non_conditionable = [ - variable - for variable in variable_names - if not descriptor.allows_conditioning_on(variable) - ] - - manifest_mappings = None - if manifest is not None: - sample_mappings: list[dict[str, str]] = [] - mapped_column_count = 0 - for observation in manifest.observations: - mapped_column_count += len(observation.columns) - for column in observation.columns: - if len(sample_mappings) >= 8: - break - sample_mappings.append( - { - "entity": observation.entity.value, - "rawColumn": column.raw_column, - "canonicalName": column.canonical_name, - } - ) - manifest_mappings = { - "observationCount": len(manifest.observations), - "mappedColumnCount": mapped_column_count, - "sampleMappings": sample_mappings, - } - - return { - "name": descriptor.name, - "resolved": True, - "shareability": descriptor.shareability.value, - "timeStructure": descriptor.time_structure.value, - "archetype": descriptor.archetype.value if descriptor.archetype is not None else None, - "population": descriptor.population, - "description": descriptor.description, - "manifestName": manifest_name, - "manifestBacked": manifest is not None, - "observationCount": len(descriptor.observations), - "observations": 
[ - { - "entity": observation.entity.value, - "keyColumn": observation.key_column, - "weightColumn": observation.weight_column, - "periodColumn": observation.period_column, - "variableCount": len(observation.variable_names), - "sampleVariables": list(observation.variable_names[:8]), - } - for observation in descriptor.observations - ], - "capabilitySummary": { - "authoritativeVariableCount": sum( - 1 for variable in variable_names if descriptor.is_authoritative_for(variable) - ), - "conditionableVariableCount": sum( - 1 for variable in variable_names if descriptor.allows_conditioning_on(variable) - ), - "authoritativeOnlyVariables": authoritative_only[:8], - "nonConditionableVariables": non_conditionable[:8], - }, - "manifestMappings": manifest_mappings, - "notes": list(entry["notes"]), - } - - -def _write_json_atomically(path: Path, payload: dict[str, Any]) -> None: - temp_path = path.with_name(f".{path.name}.tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temp_path.replace(path) - - -def _build_fusion_plan(resolved_descriptors: list[Any]) -> Any | None: - try: - from microplex.fusion import FusionPlan - except ImportError: - return None - return FusionPlan.from_sources(resolved_descriptors) - - -def _load_runtime_source_manifest(manifest_name: str | None) -> Any | None: - if manifest_name is None: - return None - try: - from microplex_us.source_manifests import load_us_source_manifest - except ImportError: - return None - return load_us_source_manifest(manifest_name) - - -def _build_shared_coverage_summary( - fusion_plan: Any | None, - source_entries: list[dict[str, Any]] | None = None, -) -> dict[str, Any]: - if fusion_plan is None: - if source_entries: - fallback = _build_shared_coverage_summary_from_sources(source_entries) - if fallback is not None: - return fallback - return { - "sourceNames": [], - "entities": [], - } - - entity_summaries = [] - for entity in fusion_plan.output_entities: - entity_coverage = 
fusion_plan.coverage.get(entity, {}) - source_counts = { - source_name: sum( - 1 - for coverage in entity_coverage.values() - if source_name in coverage.sources - ) - for source_name in fusion_plan.source_names - } - entity_summaries.append( - { - "entity": entity.value, - "variableCount": len(entity_coverage), - "publicVariableCount": sum( - 1 for coverage in entity_coverage.values() if coverage.publicly_observed - ), - "syntheticReleaseVariableCount": sum( - 1 - for coverage in entity_coverage.values() - if coverage.requires_synthetic_release - ), - "sampleVariables": list(entity_coverage.keys())[:10], - "sourceCounts": [ - {"source": source_name, "variableCount": source_counts[source_name]} - for source_name in fusion_plan.source_names - ], - } - ) - - return { - "sourceNames": list(fusion_plan.source_names), - "entities": entity_summaries, - } - - -def _build_shared_coverage_summary_from_sources( - source_entries: list[dict[str, Any]], -) -> dict[str, Any] | None: - serialized_sources = [ - _serialize_source_snapshot_entry(entry) - for entry in source_entries - if entry.get("serialized") is not None - ] - if not serialized_sources: - return None - - source_names = [ - source["name"] - for source in serialized_sources - if source.get("resolved") - ] - coverage: dict[str, dict[str, set[str]]] = {} - for source in serialized_sources: - if not source.get("resolved"): - continue - source_name = str(source["name"]) - for observation in source.get("observations", ()): - entity = str(observation["entity"]) - entity_coverage = coverage.setdefault(entity, {}) - for variable in observation.get("sampleVariables", ()): - entity_coverage.setdefault(str(variable), set()).add(source_name) - - return { - "sourceNames": source_names, - "entities": [ - { - "entity": entity, - "variableCount": len(entity_coverage), - "publicVariableCount": len(entity_coverage), - "syntheticReleaseVariableCount": 0, - "sampleVariables": sorted(entity_coverage)[:10], - "sourceCounts": [ - { - 
"source": source_name, - "variableCount": sum( - 1 - for variable_sources in entity_coverage.values() - if source_name in variable_sources - ), - } - for source_name in source_names - ], - } - for entity, entity_coverage in coverage.items() - ], - } - - -def _build_donor_block_summary( - donor_integrated_variables: tuple[str, ...], -) -> list[dict[str, Any]]: - block_specs = donor_imputation_block_specs(donor_integrated_variables) - return [ - { - "id": f"block-{index + 1}", - "nativeEntity": _entity_value(block_spec.native_entity), - "conditionEntities": _entity_values(block_spec.condition_entities), - "modelVariables": list(block_spec.model_variables), - "restoredVariables": list(block_spec.restored_variables), - "matchStrategies": { - variable_name: strategy.value - for variable_name, strategy in block_spec.match_strategies.items() - }, - "prepareFrame": ( - block_spec.prepare_frame.__name__ - if block_spec.prepare_frame is not None - else None - ), - "restoreFrame": ( - block_spec.restore_frame.__name__ - if block_spec.restore_frame is not None - else None - ), - } - for index, block_spec in enumerate(block_specs) - ] - - -def _build_semantic_highlights( - variable_names: tuple[str, ...], -) -> list[dict[str, Any]]: - highlights: list[dict[str, Any]] = [] - for variable_name in variable_names: - spec = variable_semantic_spec_for(variable_name) - if ( - not spec.derived_from - and spec.support_family.value == "continuous" - and spec.donor_transform is None - and spec.donor_check is None - and spec.notes is None - ): - continue - highlights.append( - { - "variableName": variable_name, - "nativeEntity": _entity_value(spec.native_entity), - "conditionEntities": _entity_values(spec.condition_entities), - "supportFamily": spec.support_family.value, - "derivedFrom": list(spec.derived_from), - "donorMatchStrategy": spec.donor_match_strategy.value, - "hasDonorTransform": spec.donor_transform is not None, - "hasDonorCheck": spec.donor_check is not None, - "notes": 
spec.notes, - } - ) - return highlights - - -def _entity_value(entity: Any) -> str: - return str(getattr(entity, "value", entity)) - - -def _entity_values(entities: Iterable[Any]) -> list[str]: - values = [_entity_value(entity) for entity in entities] - if ( - "family" in values - and "spm_unit" in values - and "benefit_unit" not in values - ): - values.insert(values.index("spm_unit"), "benefit_unit") - return values - - -def _build_pipeline_stage_summary( - *, - synthesis: dict[str, Any], - calibration: dict[str, Any], - artifacts: dict[str, Any], - config: dict[str, Any], - donor_integrated_variables: tuple[str, ...], - source_names: tuple[str, ...], - manifest: dict[str, Any], -) -> list[dict[str, Any]]: - harness_summary = dict(manifest.get("policyengine_harness", {})) - native_scores_summary = dict(manifest.get("policyengine_native_scores", {})) - - return [ - { - "id": "source-mix", - "step": "01", - "title": "Source mix", - "summary": "Descriptor-backed source families declared for the saved run.", - "status": "ready" if source_names else "missing", - "metrics": [ - {"label": "Sources", "value": len(source_names)}, - {"label": "Scaffold", "value": synthesis.get("scaffold_source")}, - ], - "outputs": list(source_names), - }, - { - "id": "donor-integration", - "step": "02", - "title": "Donor integration", - "summary": "Authoritative donor variables projected onto the scaffold before synthesis.", - "status": "ready" if donor_integrated_variables else "inactive", - "metrics": [ - { - "label": "Integrated vars", - "value": len(donor_integrated_variables), - }, - { - "label": "Overrides", - "value": len( - synthesis.get("donor_authoritative_override_variables", ()) - ), - }, - ], - "outputs": list(donor_integrated_variables[:12]), - }, - { - "id": "synthesis", - "step": "03", - "title": "Synthesis", - "summary": "Seed rows become the candidate population under the configured backend.", - "status": "ready", - "metrics": [ - {"label": "Backend", "value": 
synthesis.get("backend")}, - {"label": "Conditions", "value": len(synthesis.get("condition_vars", ()))}, - {"label": "Targets", "value": len(synthesis.get("target_vars", ()))}, - {"label": "nSynthetic", "value": config.get("n_synthetic")}, - ], - "outputs": [ - f"seed={manifest.get('rows', {}).get('seed')}", - f"synthetic={manifest.get('rows', {}).get('synthetic')}", - ], - }, - { - "id": "calibration", - "step": "04", - "title": "Calibration", - "summary": "Target support and convergence remain attached to the saved run.", - "status": "ready" if calibration else "missing", - "metrics": [ - {"label": "Backend", "value": calibration.get("backend")}, - {"label": "Loaded", "value": calibration.get("n_loaded_targets")}, - {"label": "Supported", "value": calibration.get("n_supported_targets")}, - {"label": "Converged", "value": calibration.get("converged")}, - ], - "outputs": [ - f"calibrated={manifest.get('rows', {}).get('calibrated')}", - ], - }, - { - "id": "pe-export", - "step": "05", - "title": "PolicyEngine export", - "summary": "The runtime narrows to the PE-facing artifact contract before scoring.", - "status": "ready" if artifacts.get("policyengine_dataset") else "missing", - "metrics": [ - { - "label": "Dataset artifact", - "value": artifacts.get("policyengine_dataset"), - }, - { - "label": "Direct overrides", - "value": len(config.get("policyengine_direct_override_variables", ())), - }, - ], - "outputs": [ - value - for value in ( - artifacts.get("policyengine_dataset"), - artifacts.get("manifest"), - ) - if value - ], - }, - { - "id": "benchmark", - "step": "06", - "title": "PolicyEngine benchmark", - "summary": "Harness and native-loss diagnostics stay attached to the same artifact bundle.", - "status": "ready" if harness_summary or native_scores_summary else "missing", - "metrics": [ - { - "label": "Harness delta", - "value": harness_summary.get("mean_abs_relative_error_delta"), - }, - { - "label": "Native delta", - "value": 
native_scores_summary.get("enhanced_cps_native_loss_delta"), - }, - { - "label": "Win rate", - "value": harness_summary.get("target_win_rate"), - }, - ], - "outputs": [ - value - for value in ( - artifacts.get("policyengine_harness"), - artifacts.get("policyengine_native_scores"), - ) - if value - ], - }, - ] diff --git a/src/microplex_us/pipelines/donor_imputers.py b/src/microplex_us/pipelines/donor_imputers.py deleted file mode 100644 index 7bc08805..00000000 --- a/src/microplex_us/pipelines/donor_imputers.py +++ /dev/null @@ -1,346 +0,0 @@ -"""Donor imputer implementations for US pipeline donor synthesis.""" - -from __future__ import annotations - -import importlib.util -from typing import Any - -import numpy as np -import pandas as pd -from sklearn.ensemble import RandomForestClassifier - - -def _deduplicate_columns_preserve_first(frame: pd.DataFrame) -> pd.DataFrame: - """Return a frame with one column per label, keeping the first occurrence.""" - - if frame.columns.is_unique: - return frame - return frame.loc[:, ~frame.columns.duplicated()].copy() - - -class ColumnwiseQRFDonorImputer: - """Columnwise QRF donor imputer, optionally with zero-inflated support.""" - - def __init__( - self, - *, - condition_vars: list[str], - target_vars: list[str], - n_estimators: int = 100, - zero_inflated_vars: set[str] | None = None, - nonnegative_vars: set[str] | None = None, - zero_threshold: float = 0.05, - quantiles: tuple[float, ...] 
= (0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), - ) -> None: - self.condition_vars = list(condition_vars) - self.target_vars = list(target_vars) - self.n_estimators = int(n_estimators) - self.zero_inflated_vars = set(zero_inflated_vars or ()) - self.nonnegative_vars = set(nonnegative_vars or ()) - self.zero_threshold = float(zero_threshold) - self.quantiles = tuple(float(value) for value in quantiles) - self._models: dict[str, Any] = {} - self._zero_models: dict[str, RandomForestClassifier] = {} - - def fit( - self, - data: pd.DataFrame, - *, - weight_col: str | None = "weight", - epochs: int | None = None, - batch_size: int | None = None, - learning_rate: float | None = None, - verbose: bool = False, - ) -> ColumnwiseQRFDonorImputer: - del weight_col, epochs, batch_size, learning_rate, verbose - if importlib.util.find_spec("quantile_forest") is None: - raise ImportError( - "quantile-forest is required for donor_imputer_backend='qrf'" - ) - from quantile_forest import RandomForestQuantileRegressor - - self._models = {} - self._zero_models = {} - for column in self.target_vars: - subset = data[self.condition_vars + [column]].dropna() - if len(subset) < 25: - continue - x_values = subset[self.condition_vars].to_numpy(dtype=float) - y_values = subset[column].to_numpy(dtype=float) - if ( - column in self.zero_inflated_vars - and (y_values == 0).mean() >= self.zero_threshold - and (y_values == 0).sum() >= 10 - and (y_values != 0).sum() >= 10 - ): - # Gate trained as zero vs nonzero (both signs), not as - # zero-or-negative vs positive. The old `y > 0` label - # silently dropped every negative training row along - # with zeros, so the QRF below only ever saw positive - # rows and could never emit a negative prediction - the - # v7 bug that blanked the negative tail of capital - # gains, partnership income, farm income, etc. 
The - # `!= 0` label is the minimal fix; the full upgrade to - # the canonical regime-aware `microimpute.Imputer` - # (tripartite routing with separate positive / negative - # QRFs). - zero_model = RandomForestClassifier( - n_estimators=max(50, self.n_estimators // 2), - random_state=42, - n_jobs=-1, - ) - zero_model.fit(x_values, (y_values != 0).astype(int)) - self._zero_models[column] = zero_model - x_values = x_values[y_values != 0] - y_values = y_values[y_values != 0] - if len(y_values) < 25: - continue - model = RandomForestQuantileRegressor( - n_estimators=self.n_estimators, - random_state=42, - n_jobs=-1, - ) - model.fit(x_values, y_values) - self._models[column] = model - return self - - def generate( - self, - conditions: pd.DataFrame, - seed: int | None = None, - ) -> pd.DataFrame: - rng = np.random.RandomState(seed or 42) - synthetic = conditions.copy().reset_index(drop=True) - x_values = synthetic[self.condition_vars].to_numpy(dtype=float) - for column in self.target_vars: - model = self._models.get(column) - if model is None: - synthetic[column] = np.nan - continue - values = np.zeros(len(synthetic), dtype=float) - target_rows = np.ones(len(synthetic), dtype=bool) - zero_model = self._zero_models.get(column) - if zero_model is not None: - probabilities = zero_model.predict_proba(x_values) - positive_probs = ( - probabilities[:, 1] - if probabilities.shape[1] > 1 - else np.zeros(len(synthetic), dtype=float) - ) - target_rows = rng.random(len(synthetic)) < positive_probs - values[:] = 0.0 - if target_rows.any(): - predictions = model.predict( - x_values[target_rows], - quantiles=list(self.quantiles), - ) - quantile_choices = rng.choice( - len(self.quantiles), size=target_rows.sum() - ) - draws = predictions[np.arange(target_rows.sum()), quantile_choices] - if column in self.nonnegative_vars: - draws = np.maximum(draws, 0.0) - values[target_rows] = draws - synthetic[column] = values - return synthetic - - -class RegimeAwareDonorImputer: - """Donor imputer 
that wraps one chained canonical `microimpute.Imputer` block. - - The whole target block is fit with one regime-gated `microimpute.Imputer`, which - auto-detects one of seven regimes (THREE_SIGN / ZI_POSITIVE / - ZI_NEGATIVE / SIGN_ONLY / POSITIVE_ONLY / NEGATIVE_ONLY / - DEGENERATE_ZERO) for each target and composes a gate classifier + one or - two base imputers as appropriate. - - Key advantages over `ColumnwiseQRFDonorImputer`: - - 1. Negative values in training are preserved in predictions for - three-sign targets (capital gains, partnership/S-corp income, - farm income, rental income). The v7 `y > 0` bug is structurally - impossible under regime-aware routing. - 2. Predictions on three-sign targets never land in the interior - band between ``max(train_neg)`` and ``min(train_pos)`` - the - tripartite gate routes to sign-specific base imputers that each - see only one sign of training data. - - This class is a thin block adapter: the target order is passed through - to microimpute so target i+1 conditions on the realized imputation for - target i. That preserves cross-target donor relationships such as - income totals, losses, interest, dividends, pensions, and deduction - leaves instead of independently drawing each target from the same - original predictor surface. 
- """ - - def __init__( - self, - condition_vars: list[str], - target_vars: list[str], - n_estimators: int = 100, - max_train_samples: int | None = 50_000, - classifier_type: str = "hist_gb", - seed: int = 42, - ) -> None: - self.condition_vars = list(condition_vars) - self.target_vars = list(target_vars) - self.n_estimators = int(n_estimators) - if max_train_samples is not None and int(max_train_samples) < 1: - raise ValueError("max_train_samples must be a positive integer") - self.max_train_samples = ( - None if max_train_samples is None else int(max_train_samples) - ) - self.classifier_type = str(classifier_type) - self.seed = int(seed) - self._fitted: dict[str, Any] = {} - self._fitted_columns: tuple[str, ...] = () - self._predictor_columns: tuple[str, ...] = () - self._regimes: dict[str, str] = {} - - def _configured_qrf_class(self, qrf_class: type[Any]) -> type[Any]: - n_estimators = self.n_estimators - max_train_samples = self.max_train_samples - - class ConfiguredQRF(qrf_class): - def __init__(self, *args: Any, **kwargs: Any) -> None: - if max_train_samples is not None: - kwargs.setdefault("max_train_samples", max_train_samples) - super().__init__(*args, **kwargs) - - def fit(self, *args: Any, **kwargs: Any) -> Any: - kwargs.setdefault("n_estimators", n_estimators) - kwargs.setdefault("n_jobs", -1) - return super().fit(*args, **kwargs) - - ConfiguredQRF.__name__ = "ConfiguredRegimeAwareQRF" - return ConfiguredQRF - - def fit( - self, - data: pd.DataFrame, - *, - weight_col: str | None = "weight", - epochs: int | None = None, - batch_size: int | None = None, - learning_rate: float | None = None, - verbose: bool = False, - ) -> RegimeAwareDonorImputer: - del weight_col, epochs, batch_size, learning_rate, verbose - - if importlib.util.find_spec("microimpute.models.regime_gated") is None: - raise ImportError( - "microimpute with the canonical regime-gated Imputer is required " - "for donor_imputer_backend='regime_aware'." 
- ) - if importlib.util.find_spec("quantile_forest") is None: - raise ImportError( - "quantile-forest is required for the RegimeAwareDonorImputer base QRF." - ) - - from microimpute import Imputer as MicroImputer - from microimpute.models.qrf import QRF - - self._fitted = {} - self._fitted_columns = () - self._predictor_columns = () - self._regimes = {} - target_vars = tuple(dict.fromkeys(self.target_vars)) - target_set = set(target_vars) - predictor_vars = tuple( - dict.fromkeys(var for var in self.condition_vars if var not in target_set) - ) - fit_columns = tuple(dict.fromkeys((*predictor_vars, *target_vars))) - unique_data = _deduplicate_columns_preserve_first(data) - subset = ( - unique_data[list(fit_columns)].replace([np.inf, -np.inf], np.nan).dropna() - ) - if len(subset) < 25: - return self - - wrapper = MicroImputer( - base_imputer_class=self._configured_qrf_class(QRF), - base_imputer_kwargs={}, - classifier_type=self.classifier_type, - signregime=True, - seed=self.seed, - log_level="WARNING", - ) - fitted = wrapper.fit( - subset, - predictors=list(predictor_vars), - imputed_variables=list(target_vars), - ) - self._fitted_columns = target_vars - self._predictor_columns = predictor_vars - self._fitted = {column: fitted for column in self._fitted_columns} - regimes = getattr(fitted, "regimes_", getattr(wrapper, "_regimes", {})) - self._regimes = { - column: regime - for column, regime in regimes.items() - if column in target_set - } - return self - - def generate( - self, - conditions: pd.DataFrame, - seed: int | None = None, - ) -> pd.DataFrame: - synthetic = _deduplicate_columns_preserve_first(conditions).copy() - synthetic = synthetic.reset_index(drop=True) - fitted = next(iter(self._fitted.values()), None) - if fitted is None: - for column in dict.fromkeys(self.target_vars): - synthetic[column] = np.nan - return synthetic - - prediction_seed = self.seed if seed is None else int(seed) - self._reset_prediction_rngs(fitted, seed=prediction_seed) - preds = 
fitted.predict(synthetic[list(self._predictor_columns)]) - for column in self._fitted_columns: - if column in preds.columns: - synthetic[column] = preds[column].to_numpy(dtype=float) - else: - synthetic[column] = np.nan - return synthetic - - def _reset_prediction_rngs( - self, - obj: Any, - *, - seed: int, - visited: set[int] | None = None, - ) -> None: - if visited is None: - visited = set() - if obj is None or isinstance(obj, (str, bytes, int, float, bool)): - return - object_id = id(obj) - if object_id in visited: - return - visited.add(object_id) - - if hasattr(obj, "_rng"): - obj._rng = np.random.default_rng(seed) - child_rng = np.random.default_rng(seed) - - if isinstance(obj, dict): - children = list(obj.values()) - elif isinstance(obj, (list, tuple, set)): - children = list(obj) - else: - children = [] - for attr_name in ("models", "_per_variable", "_non_numeric_bundle"): - child = getattr(obj, attr_name, None) - if child is not None: - children.append(child) - - for child in children: - child_seed = int( - child_rng.integers(0, np.iinfo(np.int32).max, dtype=np.int64) - ) - self._reset_prediction_rngs( - child, - seed=child_seed, - visited=visited, - ) diff --git a/src/microplex_us/pipelines/ecps_replacement_comparison.py b/src/microplex_us/pipelines/ecps_replacement_comparison.py deleted file mode 100644 index d7897ea5..00000000 --- a/src/microplex_us/pipelines/ecps_replacement_comparison.py +++ /dev/null @@ -1,2223 +0,0 @@ -"""Sound Microplex-vs-eCPS replacement comparison harness.""" - -from __future__ import annotations - -import argparse -import hashlib -import importlib.metadata -import json -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory -from time import perf_counter -from typing import Any - -import h5py -import numpy as np - -from microplex_us.pipelines.mp_benchmark_manifest import ( - frozen_production_pin_mismatches, -) -from microplex_us.pipelines.pe_native_loss import ( - classify_pe_native_target_family, 
- loss_arrays_from_inputs, - pe_native_huber_loss, - pe_native_huber_loss_terms, - pe_native_relative_error, - subset_loss_arrays, -) -from microplex_us.pipelines.pe_native_optimization import ( - _PE_NATIVE_BROAD_MATRIX_SCRIPT, - optimize_pe_native_loss_weights, - rewrite_policyengine_us_dataset_weights, -) -from microplex_us.pipelines.pe_native_scores import ( - _ENHANCED_CPS_BAD_TARGETS, - build_policyengine_us_data_subprocess_env, - compute_us_pe_native_scores, - compute_us_pe_native_support_audit, - resolve_policyengine_us_data_repo_root, -) -from microplex_us.pipelines.performance import ( - _write_matched_policyengine_us_baseline_dataset, -) - -_PROTECTED_TARGET_PATTERNS: dict[str, tuple[str, ...]] = { - "ssi": ("ssi", "supplemental_security_income"), - "snap": ("snap",), - "wages": ("wage", "employment_income"), - "self_employment_income": ("self_employment", "business_income"), - "capital_gains": ("capital_gain", "capital_gains"), - "interest": ("interest",), - "dividends": ("dividend",), - "retirement_income": ("retirement", "pension", "ira", "401k", "403b"), - "disability": ("disability", "ssdi"), - "household_net_income": ("household_net_income", "net_income"), -} - -_DATASET_DERIVED_COMPARISON_TARGETS: tuple[str, ...] = ( - "nation/source/household_count", - "nation/source/cps_household_count", - "nation/source/puf_clone_household_count", -) - -_BASELINE_SANITY_MODES: tuple[str, ...] = ("msre", "content") - -_PRODUCTION_BASELINE_REQUIRED_NONZERO_COLUMNS: tuple[str, ...] 
= ( - "social_security_retirement", - "social_security_disability", - "employment_income_before_lsr", -) - -_BENCHMARK_MANIFEST_EVIDENCE_PATHS: dict[str, tuple[tuple[str, ...], ...]] = { - "certificate_type": (("certificate_type",),), - "period": (("period",),), - "baseline_dataset.sha256": ( - ("baseline_dataset", "sha256"), - ("enhanced_cps", "sha256"), - ("baseline_dataset_sha256",), - ("enhanced_cps_sha256",), - ), - "target_db.sha256": ( - ("target_db", "sha256"), - ("targets_db", "sha256"), - ("policyengine_targets_db", "sha256"), - ("target_db_sha256",), - ("policyengine_targets_db_sha256",), - ), - "policyengine_us_data.commit": ( - ("policyengine_us_data", "commit"), - ("policyengine_us_data", "commit_sha"), - ("policyengine_us_data_commit",), - ("policyengine_us_data_commit_sha",), - ), - "policyengine_us.version": ( - ("policyengine_us", "version"), - ("policyengine_us_version",), - ), - "target_surface.target_profile": ( - ("target_surface", "target_profile"), - ("target_profile",), - ), - "target_surface.target_scope": ( - ("target_surface", "target_scope"), - ("target_scope",), - ("target_scope_filter",), - ), - "target_surface.target_count": ( - ("target_surface", "target_count"), - ("target_count",), - ), - "target_surface.target_names_sha256": ( - ("target_surface", "target_names_sha256"), - ("target_names_sha256",), - ), - "scoring_config.sha256": ( - ("scoring_config", "sha256"), - ("scoring_config_sha256",), - ), - "baseline_metrics.baseline_enhanced_cps_native_loss": ( - ("baseline_metrics", "baseline_enhanced_cps_native_loss"), - ("baseline_enhanced_cps_native_loss",), - ), - "baseline_metrics.baseline_holdout_loss": ( - ("baseline_metrics", "baseline_holdout_loss"), - ("baseline_holdout_loss",), - ), - "baseline_metrics.baseline_unweighted_msre": ( - ("baseline_metrics", "baseline_unweighted_msre"), - ("baseline_unweighted_msre",), - ), -} - - -def _comparison_bad_targets() -> tuple[str, ...]: - return tuple( - dict.fromkeys( - ( - 
*_ENHANCED_CPS_BAD_TARGETS, - *_DATASET_DERIVED_COMPARISON_TARGETS, - ) - ) - ) - - -class ComparisonGateError(ValueError): - """Raised when a comparison input or result fails a validity gate. - - These gates exist so the harness refuses to emit a misleading verdict - instead of relying on a human noticing a mis-scored baseline or a no-op - refit. Every recurring comparison failure should add a gate here. - """ - - -def _assert_refit_effective( - label: str, refit: dict[str, Any], min_reduction: float -) -> None: - """Fail if a refit did not move at all (a frozen no-op refit). - - A frozen refit (optimized loss == initial loss) means that side was never - actually reweighted, so its loss is meaningless for comparison -- usually a - degenerate loss matrix or a total-weight/population mismatch under - ``preserve_input``. A refit that moves the loss is effective even if the - full-set loss rises slightly: the refit minimizes the train objective, so an - already-well-calibrated dataset can legitimately see full loss tick up from - the held-out split. Only a frozen no-movement refit is a failure. - """ - if not _refit_moved(refit, min_reduction): - initial = float(refit["initial_full_loss"]) - optimized = float(refit["optimized_full_loss"]) - raise ComparisonGateError( - f"{label} refit was a no-op: optimized loss {optimized:.6g} is " - f"unchanged from initial {initial:.6g} (no movement beyond " - f"{min_reduction:g}). The refit never reweighted this dataset, so the " - f"comparison is meaningless -- likely a degenerate loss matrix or a " - f"total-weight/population mismatch under preserve_input. Pass " - f"assert_refit_effective=False only to deliberately accept this." 
- ) - - -def _refit_moved(refit: dict[str, Any], min_reduction: float) -> bool: - initial = float(refit["initial_full_loss"]) - optimized = float(refit["optimized_full_loss"]) - return abs(optimized - initial) > float(min_reduction) - - -def _assert_baseline_sane( - score_summary: dict[str, Any], max_msre: float -) -> dict[str, Any]: - """Fail if the production eCPS baseline scores anomalously on this surface. - - A correctly-targeted production eCPS fits its own target surface closely; a - large unweighted MSRE means the target DB/scorer is wrong for this baseline - (e.g. an ad-hoc local scorer), so any verdict against it is invalid. - """ - msre = score_summary.get("baseline_unweighted_msre") - if msre is None: - return { - "mode": "msre", - "status": "skipped", - "reason": "baseline_unweighted_msre_absent", - } - if float(msre) > max_msre: - raise ComparisonGateError( - f"Baseline (production eCPS) scores anomalously on this target " - f"surface: unweighted MSRE {float(msre):.3f} > {max_msre:g}. A " - f"correctly-targeted production eCPS scores low (~0.2); a large value " - f"means the target DB/scorer does not match the baseline, so the " - f"comparison is invalid. Use the production target surface, or pass " - f"assert_baseline_sane=False only to deliberately accept this." - ) - return { - "mode": "msre", - "status": "passed", - "baseline_unweighted_msre": float(msre), - "max_baseline_unweighted_msre": float(max_msre), - } - - -def _assert_production_baseline_content_sane( - baseline_dataset_path: str | Path, - *, - period: int, - required_nonzero_columns: tuple[str, ...] = ( - _PRODUCTION_BASELINE_REQUIRED_NONZERO_COLUMNS - ), -) -> dict[str, Any]: - """Fail if the production eCPS baseline H5 is missing required content. - - This is the right sanity gate for broad external target surfaces where a - high eCPS MSRE may be the comparison's actual signal, not proof of a broken - scorer. 
It still catches the known broken-local-eCPS failure mode by - requiring core production columns to be present and nonzero. - """ - - path = Path(baseline_dataset_path).expanduser().resolve() - period_key = str(period) - missing: list[str] = [] - zero_or_nonfinite: list[str] = [] - column_summaries: dict[str, dict[str, Any]] = {} - with h5py.File(path, "r") as handle: - for column in required_nonzero_columns: - if column not in handle or period_key not in handle[column]: - missing.append(f"{column}/{period_key}") - continue - values = np.asarray(handle[column][period_key], dtype=np.float64) - finite = bool(np.isfinite(values).all()) - abs_sum = float(np.abs(values).sum()) if finite else float("nan") - nonzero_count = int(np.count_nonzero(values)) if finite else 0 - column_summaries[column] = { - "abs_sum": abs_sum, - "nonzero_count": nonzero_count, - "finite": finite, - } - if not finite or abs_sum <= 0.0 or nonzero_count <= 0: - zero_or_nonfinite.append(f"{column}/{period_key}") - if missing or zero_or_nonfinite: - details = [] - if missing: - details.append(f"missing {', '.join(missing)}") - if zero_or_nonfinite: - details.append(f"zero_or_nonfinite {', '.join(zero_or_nonfinite)}") - raise ComparisonGateError( - "Production eCPS baseline content sanity failed: " - + "; ".join(details) - + ". Use the verified production eCPS blob, not a broken local H5." 
- ) - return { - "mode": "content", - "status": "passed", - "period": int(period), - "required_nonzero_columns": column_summaries, - } - - -def build_sound_ecps_replacement_comparison( - *, - candidate_dataset_path: str | Path, - baseline_dataset_path: str | Path, - output_dir: str | Path, - period: int = 2024, - matched_household_count: int | None = None, - random_seed: int = 20260529, - matched_sample_method: str = "uniform", - holdout_target_fraction: float = 0.2, - holdout_target_seed: int = 20260529, - optimizer_max_iter: int = 200, - optimizer_tol: float = 1e-8, - score_consistency_tol: float = 1e-6, - target_diagnostics_top_k: int = 50, - include_support_audit: bool = True, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - policyengine_targets_db_path: str | Path | None = None, - skip_tax_expenditure_targets: bool = False, - target_scope: str = "all", - exact_rescore: bool = False, - force: bool = False, - assert_refit_effective: bool = True, - min_refit_loss_reduction: float = 1e-9, - assert_baseline_sane: bool = True, - baseline_sanity_mode: str = "msre", - max_baseline_unweighted_msre: float = 2.0, - benchmark_manifest_path: str | Path | None = None, - enforce_production_pins: bool = True, -) -> dict[str, Any]: - """Build a release-contract eCPS comparison payload. - - The comparison intentionally does not accept a one-sided refit. Both the - candidate and eCPS baseline are first matched to the same household count, - then refit with the same dense no-gates PE-native objective, then rescored - through the normal PE-native scorer. 
- """ - - started_at = perf_counter() - candidate_path = Path(candidate_dataset_path).expanduser().resolve() - baseline_path = Path(baseline_dataset_path).expanduser().resolve() - destination = Path(output_dir).expanduser().resolve() - destination.mkdir(parents=True, exist_ok=True) - resolved_targets_db = ( - Path(policyengine_targets_db_path).expanduser().resolve() - if policyengine_targets_db_path is not None - else None - ) - if resolved_targets_db is not None and not resolved_targets_db.exists(): - raise FileNotFoundError( - f"PolicyEngine target DB not found: {resolved_targets_db}" - ) - - candidate_household_ids, _ = _household_weights(candidate_path, period=period) - baseline_household_ids, _ = _household_weights(baseline_path, period=period) - matched_count = ( - int(matched_household_count) - if matched_household_count is not None - else min(len(candidate_household_ids), len(baseline_household_ids)) - ) - if matched_count <= 0: - raise ValueError("matched_household_count must be positive") - if matched_count > len(candidate_household_ids): - raise ValueError("matched_household_count cannot exceed candidate households") - if matched_count > len(baseline_household_ids): - raise ValueError("matched_household_count cannot exceed baseline households") - - matched_candidate_path = destination / "candidate_matched.h5" - matched_baseline_path = destination / "baseline_matched.h5" - _write_matched_dataset( - candidate_path, - matched_candidate_path, - period=period, - household_count=matched_count, - random_seed=random_seed, - sample_method=matched_sample_method, - force=force, - ) - _write_matched_dataset( - baseline_path, - matched_baseline_path, - period=period, - household_count=matched_count, - random_seed=random_seed + 1, - sample_method=matched_sample_method, - force=force, - ) - - candidate_inputs = _extract_pe_native_loss_inputs( - input_dataset_path=matched_candidate_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - 
policyengine_us_data_python=policyengine_us_data_python, - policyengine_targets_db_path=resolved_targets_db, - skip_tax_expenditure_targets=skip_tax_expenditure_targets, - ) - baseline_inputs = _extract_pe_native_loss_inputs( - input_dataset_path=matched_baseline_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - policyengine_targets_db_path=resolved_targets_db, - skip_tax_expenditure_targets=skip_tax_expenditure_targets, - ) - candidate_inputs = _filter_loss_inputs_by_scope( - candidate_inputs, - target_scope=target_scope, - ) - baseline_inputs = _filter_loss_inputs_by_scope( - baseline_inputs, - target_scope=target_scope, - ) - target_names = _validate_common_targets(candidate_inputs, baseline_inputs) - if exact_rescore and target_scope != "all": - raise ValueError("exact_rescore is only supported for target_scope='all'") - holdout_mask = _build_holdout_target_mask( - target_names, - fraction=holdout_target_fraction, - seed=holdout_target_seed, - ) - - refit_config = { - "method": "monotone_accelerated_projected_gradient", - "lambda_l0": 0.0, - "lambda_l2": 0.0, - "use_gates": False, - "max_iter": int(optimizer_max_iter), - "tol": float(optimizer_tol), - "target_total_weight": "preserve_input", - } - candidate_refit_path = destination / "candidate_refit.h5" - baseline_refit_path = destination / "baseline_refit.h5" - candidate_refit = _fit_dense_refit( - input_dataset_path=matched_candidate_path, - output_dataset_path=candidate_refit_path, - loss_inputs=candidate_inputs, - holdout_mask=holdout_mask, - period=period, - max_iter=optimizer_max_iter, - tol=optimizer_tol, - ) - baseline_refit = _fit_dense_refit( - input_dataset_path=matched_baseline_path, - output_dataset_path=baseline_refit_path, - loss_inputs=baseline_inputs, - holdout_mask=holdout_mask, - period=period, - max_iter=optimizer_max_iter, - tol=optimizer_tol, - ) - - if assert_refit_effective: - 
_assert_refit_effective("candidate", candidate_refit, min_refit_loss_reduction) - _assert_refit_effective("baseline", baseline_refit, min_refit_loss_reduction) - candidate_refit_effective_passed = _refit_moved( - candidate_refit, min_refit_loss_reduction - ) - baseline_refit_effective_passed = _refit_moved( - baseline_refit, min_refit_loss_reduction - ) - - protected_family_losses = _protected_family_losses( - target_names=target_names, - candidate_inputs=candidate_inputs, - baseline_inputs=baseline_inputs, - candidate_weights=np.asarray(candidate_refit["optimized_weights"]), - baseline_weights=np.asarray(baseline_refit["optimized_weights"]), - ) - target_diagnostics = _target_loss_diagnostics( - target_names=target_names, - candidate_inputs=candidate_inputs, - baseline_inputs=baseline_inputs, - candidate_weights=np.asarray(candidate_refit["optimized_weights"]), - baseline_weights=np.asarray(baseline_refit["optimized_weights"]), - holdout_mask=holdout_mask, - top_k=target_diagnostics_top_k, - ) - - if exact_rescore: - pe_native_scores = compute_us_pe_native_scores( - candidate_dataset_path=candidate_refit_path, - baseline_dataset_path=baseline_refit_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - policyengine_targets_db_path=resolved_targets_db, - ) - score_summary = dict(pe_native_scores.get("summary") or {}) - candidate_score_loss = score_summary.get("candidate_enhanced_cps_native_loss") - baseline_score_loss = score_summary.get("baseline_enhanced_cps_native_loss") - candidate_score_error = _absolute_difference( - candidate_score_loss, - candidate_refit["optimized_full_loss"], - ) - baseline_score_error = _absolute_difference( - baseline_score_loss, - baseline_refit["optimized_full_loss"], - ) - objective_identity_passed = ( - candidate_score_error is not None - and baseline_score_error is not None - and candidate_score_error <= score_consistency_tol - and 
baseline_score_error <= score_consistency_tol - ) - score_source = "exact_policyengine_rescore" - exact_rescore_status = "completed" - else: - score_summary = _refit_matrix_score_summary( - target_names=target_names, - candidate_inputs=candidate_inputs, - baseline_inputs=baseline_inputs, - candidate_refit=candidate_refit, - baseline_refit=baseline_refit, - target_diagnostics=target_diagnostics, - ) - pe_native_scores = _refit_matrix_score_payload( - period=period, - candidate_dataset_path=candidate_refit_path, - baseline_dataset_path=baseline_refit_path, - summary=score_summary, - target_diagnostics=target_diagnostics, - ) - candidate_score_loss = score_summary.get("candidate_enhanced_cps_native_loss") - baseline_score_loss = score_summary.get("baseline_enhanced_cps_native_loss") - candidate_score_error = 0.0 - baseline_score_error = 0.0 - objective_identity_passed = True - score_source = "refit_loss_matrix" - exact_rescore_status = "skipped" - - if baseline_sanity_mode not in _BASELINE_SANITY_MODES: - raise ValueError( - "baseline_sanity_mode must be one of " + ", ".join(_BASELINE_SANITY_MODES) - ) - baseline_sanity: dict[str, Any] - if assert_baseline_sane: - if baseline_sanity_mode == "msre": - baseline_sanity = _assert_baseline_sane( - score_summary, max_baseline_unweighted_msre - ) - else: - baseline_sanity = _assert_production_baseline_content_sane( - baseline_path, - period=period, - ) - else: - baseline_sanity = { - "mode": baseline_sanity_mode, - "status": "skipped", - } - - ecps_refit_recovery_passed = baseline_refit[ - "optimized_full_loss" - ] <= baseline_refit["initial_full_loss"] + score_consistency_tol and ( - baseline_score_loss is None - or baseline_score_loss - <= baseline_refit["initial_full_loss"] + score_consistency_tol - ) - support_audit = ( - compute_us_pe_native_support_audit( - candidate_dataset_path=candidate_refit_path, - baseline_dataset_path=baseline_refit_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - 
policyengine_us_data_python=policyengine_us_data_python, - ) - if include_support_audit - else None - ) - support_audit_summary = ( - _support_audit_summary(support_audit) if support_audit is not None else None - ) - - score_summary.update( - { - "candidate_household_count": int(matched_count), - "baseline_household_count": int(matched_count), - "matched_household_count": True, - "candidate_initial_enhanced_cps_native_loss": candidate_refit[ - "initial_full_loss" - ], - "baseline_initial_enhanced_cps_native_loss": baseline_refit[ - "initial_full_loss" - ], - "candidate_train_loss": candidate_refit["optimized_train_loss"], - "baseline_train_loss": baseline_refit["optimized_train_loss"], - "candidate_holdout_loss": candidate_refit["optimized_holdout_loss"], - "baseline_holdout_loss": baseline_refit["optimized_holdout_loss"], - "candidate_score_abs_error": candidate_score_error, - "baseline_score_abs_error": baseline_score_error, - "score_source": score_source, - "exact_rescore_requested": bool(exact_rescore), - "exact_rescore_status": exact_rescore_status, - "candidate_refit_effective_passed": candidate_refit_effective_passed, - "baseline_refit_effective_passed": baseline_refit_effective_passed, - "ecps_refit_effective_passed": baseline_refit_effective_passed, - "candidate_refit_config": refit_config, - "baseline_refit_config": refit_config, - "symmetric_refit": True, - "score_candidate_only": False, - "refit_objective_matches_scoring": objective_identity_passed, - "ecps_refit_recovery_passed": ecps_refit_recovery_passed, - "holdout_target_fraction": float(holdout_target_fraction), - "holdout_targets": int(holdout_mask.sum()), - "target_scope_filter": target_scope, - "protected_family_losses": protected_family_losses, - "target_diagnostics": target_diagnostics["summary"], - "support_audit": support_audit_summary, - "baseline_sanity": baseline_sanity, - "policyengine_targets_db": ( - _dataset_descriptor(resolved_targets_db) - if resolved_targets_db is not None - else 
None - ), - } - ) - frozen_baseline_certificate = _frozen_ecps_baseline_certificate( - baseline_dataset_path=baseline_path, - policyengine_targets_db_path=resolved_targets_db, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_version=_installed_policyengine_us_version(), - period=period, - target_names=target_names, - target_scope=target_scope, - holdout_target_fraction=holdout_target_fraction, - holdout_target_seed=holdout_target_seed, - matched_sample_method=matched_sample_method, - refit_config=refit_config, - skip_tax_expenditure_targets=skip_tax_expenditure_targets, - exact_rescore=exact_rescore, - score_source=score_source, - baseline_sanity=baseline_sanity, - score_summary=score_summary, - ) - if enforce_production_pins: - _assert_certificate_uses_frozen_production_pins(frozen_baseline_certificate) - benchmark_manifest = ( - _assert_certificate_matches_benchmark_manifest( - frozen_baseline_certificate, - benchmark_manifest_path, - ) - if benchmark_manifest_path is not None - else None - ) - payload = { - "schema_version": 1, - "metric": "sound_ecps_replacement_comparison", - "period": int(period), - "candidate_dataset": _dataset_descriptor(candidate_path), - "baseline_dataset": _dataset_descriptor(baseline_path), - "matched_datasets": { - "household_count": int(matched_count), - "candidate": _dataset_descriptor(matched_candidate_path), - "baseline": _dataset_descriptor(matched_baseline_path), - "random_seed": int(random_seed), - "sample_method": matched_sample_method, - }, - "comparison_contract": { - "matched_household_count": True, - "symmetric_refit": True, - "score_candidate_only": False, - "refit_objective_matches_scoring": objective_identity_passed, - "ecps_refit_recovery_passed": ecps_refit_recovery_passed, - "ecps_refit_effective_passed": baseline_refit_effective_passed, - "holdout_target_fraction": float(holdout_target_fraction), - "holdout_targets": int(holdout_mask.sum()), - "target_scope_filter": target_scope, - 
"protected_family_losses": protected_family_losses, - }, - "frozen_ecps_baseline_certificate": frozen_baseline_certificate, - "benchmark_manifest": benchmark_manifest, - "entity_structure": { - "candidate_source": _entity_structure_summary( - candidate_path, - period=period, - ), - "baseline_source": _entity_structure_summary( - baseline_path, - period=period, - ), - "candidate_matched": _entity_structure_summary( - matched_candidate_path, - period=period, - ), - "baseline_matched": _entity_structure_summary( - matched_baseline_path, - period=period, - ), - "candidate_refit": _entity_structure_summary( - candidate_refit_path, - period=period, - ), - "baseline_refit": _entity_structure_summary( - baseline_refit_path, - period=period, - ), - }, - "summary": score_summary, - "score": pe_native_scores, - "target_diagnostics": target_diagnostics, - "support_audit": support_audit, - "candidate_refit": _strip_weights(candidate_refit), - "baseline_refit": _strip_weights(baseline_refit), - "target_split": { - "holdout_target_fraction": float(holdout_target_fraction), - "holdout_target_seed": int(holdout_target_seed), - "target_scope_filter": target_scope, - "train_targets": int((~holdout_mask).sum()), - "holdout_targets": int(holdout_mask.sum()), - "holdout_target_names": [ - name - for name, holdout in zip(target_names, holdout_mask, strict=True) - if holdout - ], - }, - "refit_config": refit_config, - "skip_tax_expenditure_targets": bool(skip_tax_expenditure_targets), - "elapsed_seconds": float(perf_counter() - started_at), - } - return payload - - -def write_sound_ecps_replacement_comparison( - output_path: str | Path, - target_diagnostics_path: str | Path | None = None, - support_audit_path: str | Path | None = None, - **kwargs: Any, -) -> Path: - """Write a sound eCPS replacement comparison payload.""" - - payload = build_sound_ecps_replacement_comparison(**kwargs) - destination = Path(output_path).expanduser().resolve() - destination.parent.mkdir(parents=True, 
exist_ok=True) - diagnostics_destination = ( - Path(target_diagnostics_path).expanduser().resolve() - if target_diagnostics_path is not None - else destination.parent / "target_loss_diagnostics.json" - ) - diagnostics_destination.parent.mkdir(parents=True, exist_ok=True) - diagnostics_destination.write_text( - json.dumps(payload["target_diagnostics"], indent=2, sort_keys=True) - ) - payload.setdefault("artifacts", {})["target_loss_diagnostics"] = ( - _dataset_descriptor(diagnostics_destination) - ) - support_audit = payload.get("support_audit") - if support_audit is not None: - support_destination = ( - Path(support_audit_path).expanduser().resolve() - if support_audit_path is not None - else destination.parent / "support_audit.json" - ) - support_destination.parent.mkdir(parents=True, exist_ok=True) - support_destination.write_text( - json.dumps(support_audit, indent=2, sort_keys=True) - ) - payload.setdefault("artifacts", {})["support_audit"] = _dataset_descriptor( - support_destination - ) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return destination - - -def _write_matched_dataset( - input_path: Path, - output_path: Path, - *, - period: int, - household_count: int, - random_seed: int, - sample_method: str, - force: bool, -) -> None: - if output_path.exists() and not force: - raise FileExistsError( - f"{output_path} already exists; pass --force to replace it" - ) - _write_matched_policyengine_us_baseline_dataset( - input_path, - output_path, - period=period, - household_count=household_count, - random_seed=random_seed, - sample_method=sample_method, - ) - - -def _household_weights( - dataset_path: str | Path, - *, - period: int, -) -> tuple[np.ndarray, np.ndarray]: - path = Path(dataset_path).expanduser().resolve() - period_key = str(period) - with h5py.File(path, "r") as handle: - if "household_id" not in handle or period_key not in handle["household_id"]: - raise ValueError(f"{path} is missing household_id/{period_key}") - if ( - 
"household_weight" not in handle - or period_key not in handle["household_weight"] - ): - raise ValueError(f"{path} is missing household_weight/{period_key}") - household_ids = np.asarray(handle["household_id"][period_key], dtype=np.int64) - weights = np.asarray( - handle["household_weight"][period_key], - dtype=np.float64, - ) - if household_ids.shape[0] != weights.shape[0]: - raise ValueError(f"{path} household_id and household_weight lengths differ") - return household_ids, weights - - -def _entity_structure_summary( - dataset_path: str | Path, - *, - period: int, -) -> dict[str, Any]: - path = Path(dataset_path).expanduser().resolve() - period_key = str(period) - with h5py.File(path, "r") as handle: - household_ids = _read_period_array(handle, "household_id", period_key) - person_ids = _read_period_array(handle, "person_id", period_key) - person_household_ids = _read_period_array( - handle, - "person_household_id", - period_key, - ) - if person_ids.shape[0] != person_household_ids.shape[0]: - raise ValueError(f"{path} person_id and person_household_id lengths differ") - - household_count = int(household_ids.shape[0]) - summary: dict[str, Any] = { - "dataset": str(path), - "period": int(period), - "household_count": household_count, - "person_count": int(person_ids.shape[0]), - } - for entity in ("tax_unit", "spm_unit", "family", "marital_unit"): - plural = _ENTITY_PLURALS[entity] - entity_summary = _entity_membership_summary( - handle, - entity=entity, - period_key=period_key, - person_household_ids=person_household_ids, - household_count=household_count, - dataset_path=path, - ) - summary[entity] = entity_summary - summary[f"{entity}_count"] = entity_summary["unit_count"] - summary[f"{plural}_per_household"] = entity_summary["units_per_household"] - return summary - - -_ENTITY_PLURALS = { - "tax_unit": "tax_units", - "spm_unit": "spm_units", - "family": "families", - "marital_unit": "marital_units", -} - - -def _read_period_array( - handle: h5py.File, - 
variable: str, - period_key: str, -) -> np.ndarray: - if variable not in handle or period_key not in handle[variable]: - raise ValueError(f"Dataset is missing {variable}/{period_key}") - return np.asarray(handle[variable][period_key], dtype=np.int64) - - -def _entity_membership_summary( - handle: h5py.File, - *, - entity: str, - period_key: str, - person_household_ids: np.ndarray, - household_count: int, - dataset_path: Path, -) -> dict[str, Any]: - entity_ids = _read_period_array(handle, f"{entity}_id", period_key) - person_entity_ids = _read_period_array( - handle, - f"person_{entity}_id", - period_key, - ) - if person_entity_ids.shape[0] != person_household_ids.shape[0]: - raise ValueError( - f"{dataset_path} person_{entity}_id and person_household_id lengths differ" - ) - unique_entity_ids = np.unique(entity_ids) - duplicate_unit_id_count = int(entity_ids.shape[0] - unique_entity_ids.shape[0]) - unique_person_entity_ids, inverse = np.unique( - person_entity_ids, - return_inverse=True, - ) - member_counts = np.bincount(inverse) - singleton_count = int(np.count_nonzero(member_counts == 1)) - empty_unit_count = int( - np.setdiff1d(unique_entity_ids, unique_person_entity_ids).size - ) - missing_referenced_unit_count = int( - np.setdiff1d(unique_person_entity_ids, unique_entity_ids).size - ) - cross_household_count = _cross_household_entity_count( - inverse, - person_household_ids, - ) - unit_count = int(entity_ids.shape[0]) - return { - "unit_count": unit_count, - "person_membership_count": int(person_entity_ids.shape[0]), - "duplicate_unit_id_count": duplicate_unit_id_count, - "units_per_household": ( - float(unit_count / household_count) if household_count else None - ), - "singleton_unit_count": singleton_count, - "singleton_unit_share": ( - float(singleton_count / unit_count) if unit_count else None - ), - "empty_unit_count": empty_unit_count, - "missing_referenced_unit_count": missing_referenced_unit_count, - "cross_household_unit_count": 
cross_household_count, - } - - -def _cross_household_entity_count( - entity_inverse: np.ndarray, - person_household_ids: np.ndarray, -) -> int: - if entity_inverse.size == 0: - return 0 - order = np.argsort(entity_inverse, kind="stable") - sorted_entity = entity_inverse[order] - sorted_household = person_household_ids[order] - boundaries = np.concatenate( - ( - np.asarray([0]), - np.flatnonzero(np.diff(sorted_entity)) + 1, - np.asarray([sorted_entity.size]), - ) - ) - cross_household_count = 0 - for start, stop in zip(boundaries[:-1], boundaries[1:], strict=True): - if np.unique(sorted_household[start:stop]).size > 1: - cross_household_count += 1 - return cross_household_count - - -def _extract_pe_native_loss_inputs( - *, - input_dataset_path: str | Path, - period: int, - policyengine_us_data_repo: str | Path | None, - policyengine_us_data_python: str | Path | None, - policyengine_targets_db_path: str | Path | None, - skip_tax_expenditure_targets: bool, -) -> dict[str, Any]: - if skip_tax_expenditure_targets: - raise ValueError( - "sound eCPS replacement comparison uses the exact PE-native broad " - "loss target surface; skipping tax expenditure targets is unsupported" - ) - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - resolved_targets_db = ( - Path(policyengine_targets_db_path).expanduser().resolve() - if policyengine_targets_db_path is not None - else None - ) - if resolved_targets_db is not None and not resolved_targets_db.exists(): - raise FileNotFoundError( - f"PolicyEngine target DB not found: {resolved_targets_db}" - ) - command = ( - [str(Path(policyengine_us_data_python).expanduser())] - if policyengine_us_data_python is not None - else ["uv", "run", "--project", str(resolved_repo), "python"] - ) - with TemporaryDirectory(prefix="microplex-us-ecps-comparison-") as temp_dir: - prefix = Path(temp_dir) / "pe_native_matrix" - completed = subprocess.run( - [ 
- *command, - "-c", - _PE_NATIVE_BROAD_MATRIX_SCRIPT, - str(resolved_repo), - json.dumps(_comparison_bad_targets()), - str(int(period)), - str(Path(input_dataset_path).expanduser().resolve()), - "1" if skip_tax_expenditure_targets else "0", - str(prefix), - "", - str(resolved_targets_db) if resolved_targets_db is not None else "", - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - detail = ( - completed.stderr.strip() - or completed.stdout.strip() - or str(completed.returncode) - ) - raise RuntimeError(f"PE-native loss-matrix extraction failed: {detail}") - return { - "scaled_matrix": np.load(prefix.with_suffix(".matrix.npy")), - "scaled_target": np.load(prefix.with_suffix(".target.npy")), - "initial_weights": np.load(prefix.with_suffix(".weights.npy")), - "unscaled_target": _load_optional_array( - prefix.with_suffix(".target_unscaled.npy") - ), - "scaling": _load_optional_array(prefix.with_suffix(".scaling.npy")), - "loss_denominator": _load_optional_array( - prefix.with_suffix(".loss_denominator.npy") - ), - "loss_target_weight": _load_optional_array( - prefix.with_suffix(".loss_target_weight.npy") - ), - "loss_bucket": _load_optional_array( - prefix.with_suffix(".loss_bucket.npy"), - allow_pickle=True, - ), - "loss_unit": _load_optional_array( - prefix.with_suffix(".loss_unit.npy"), - allow_pickle=True, - ), - "loss_scope": _load_optional_array( - prefix.with_suffix(".loss_scope.npy"), - allow_pickle=True, - ), - "loss_family": _load_optional_array( - prefix.with_suffix(".loss_family.npy"), - allow_pickle=True, - ), - "loss_epsilon": _load_optional_array( - prefix.with_suffix(".loss_epsilon.npy") - ), - "metadata": json.loads(prefix.with_suffix(".meta.json").read_text()), - } - - -def _load_optional_array( - path: Path, *, allow_pickle: bool = False -) -> np.ndarray | None: - return np.load(path, allow_pickle=allow_pickle) if path.exists() else None - - -def _filter_loss_inputs_by_scope( 
- loss_inputs: dict[str, Any], - *, - target_scope: str, -) -> dict[str, Any]: - if target_scope not in {"all", "national", "state"}: - raise ValueError("target_scope must be one of all, national, or state") - if target_scope == "all": - return loss_inputs - - metadata = dict(loss_inputs["metadata"]) - target_names = np.asarray(metadata.get("target_names", ()), dtype=object) - if target_names.size == 0: - raise ValueError("PE-native loss inputs do not include target names") - - scope = loss_inputs.get("loss_scope") - if scope is not None: - keep_mask = np.asarray(scope, dtype=object) == target_scope - elif target_scope == "national": - keep_mask = np.asarray( - [str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - else: - keep_mask = np.asarray( - [not str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - if not bool(keep_mask.any()): - raise ValueError(f"target_scope={target_scope!r} selected no targets") - - filtered = dict(loss_inputs) - filtered["scaled_matrix"] = np.asarray(loss_inputs["scaled_matrix"])[:, keep_mask] - for key in ( - "scaled_target", - "unscaled_target", - "scaling", - "loss_denominator", - "loss_target_weight", - "loss_bucket", - "loss_unit", - "loss_scope", - "loss_family", - "loss_epsilon", - ): - value = loss_inputs.get(key) - if value is not None: - filtered[key] = np.asarray(value)[keep_mask] - - filtered_names = target_names[keep_mask].tolist() - metadata["target_names"] = filtered_names - metadata["target_scope_filter"] = target_scope - metadata["n_targets_scope_filtered_from"] = int(target_names.size) - metadata["n_targets_kept"] = int(len(filtered_names)) - metadata["n_national_targets"] = int( - sum(str(name).startswith("nation/") for name in filtered_names) - ) - metadata["n_state_targets"] = int( - len(filtered_names) - metadata["n_national_targets"] - ) - sidecar_rows = metadata.get("target_loss_metadata") - if isinstance(sidecar_rows, list) and len(sidecar_rows) == 
target_names.size: - metadata["target_loss_metadata"] = [ - row for row, keep in zip(sidecar_rows, keep_mask, strict=True) if keep - ] - filtered["metadata"] = metadata - return filtered - - -def _validate_common_targets( - candidate_inputs: dict[str, Any], - baseline_inputs: dict[str, Any], -) -> list[str]: - candidate_names = list(candidate_inputs["metadata"].get("target_names", ())) - baseline_names = list(baseline_inputs["metadata"].get("target_names", ())) - if candidate_names != baseline_names: - raise ValueError("candidate and baseline PE-native target names differ") - candidate_target = np.asarray(candidate_inputs["scaled_target"], dtype=np.float64) - baseline_target = np.asarray(baseline_inputs["scaled_target"], dtype=np.float64) - if not np.allclose(candidate_target, baseline_target): - raise ValueError("candidate and baseline PE-native scaled targets differ") - for key in ( - "loss_denominator", - "loss_target_weight", - "loss_epsilon", - ): - left = candidate_inputs.get(key) - right = baseline_inputs.get(key) - if left is None and right is None: - continue - if left is None or right is None or not np.allclose(left, right): - raise ValueError(f"candidate and baseline PE-native {key} differ") - for key in ("loss_bucket", "loss_unit", "loss_scope", "loss_family"): - left = candidate_inputs.get(key) - right = baseline_inputs.get(key) - if left is None and right is None: - continue - if left is None or right is None or not np.array_equal(left, right): - raise ValueError(f"candidate and baseline PE-native {key} differ") - return candidate_names - - -def _build_holdout_target_mask( - target_names: list[str], - *, - fraction: float, - seed: int, -) -> np.ndarray: - if fraction <= 0.0 or fraction >= 1.0: - raise ValueError("holdout_target_fraction must be between 0 and 1") - families = np.asarray( - [classify_pe_native_target_family(name) for name in target_names] - ) - rng = np.random.default_rng(int(seed)) - holdout_mask = np.zeros(len(target_names), 
dtype=bool) - for family in sorted(set(families)): - indices = np.flatnonzero(families == family) - if len(indices) <= 1: - continue - count = int(round(len(indices) * fraction)) - count = max(1, min(count, len(indices) - 1)) - holdout_mask[rng.choice(indices, size=count, replace=False)] = True - if not bool(holdout_mask.any()): - raise ValueError("holdout_target_fraction did not select any targets") - if bool(holdout_mask.all()): - raise ValueError("holdout split selected every target") - return holdout_mask - - -def _fit_dense_refit( - *, - input_dataset_path: Path, - output_dataset_path: Path, - loss_inputs: dict[str, Any], - holdout_mask: np.ndarray, - period: int, - max_iter: int, - tol: float, -) -> dict[str, Any]: - matrix = np.asarray(loss_inputs["scaled_matrix"], dtype=np.float64) - target = np.asarray(loss_inputs["scaled_target"], dtype=np.float64) - initial_weights = np.asarray(loss_inputs["initial_weights"], dtype=np.float64) - loss_arrays = loss_arrays_from_inputs(loss_inputs) - train_mask = ~holdout_mask - train_loss_arrays = ( - subset_loss_arrays(loss_arrays, train_mask) if loss_arrays is not None else None - ) - holdout_loss_arrays = ( - subset_loss_arrays(loss_arrays, holdout_mask) - if loss_arrays is not None - else None - ) - loss_curve: list[dict[str, float | int]] = [] - - def record_loss_curve( - iteration: int, - weights: np.ndarray, - objective_loss: float, - ) -> None: - loss_curve.append( - { - "iteration": int(iteration), - "objective_train_loss": float(objective_loss), - "full_loss": _objective( - matrix, - target, - weights, - loss_arrays=loss_arrays, - ), - "train_loss": _objective( - matrix[:, train_mask], - target[train_mask], - weights, - loss_arrays=train_loss_arrays, - ), - "holdout_loss": _objective( - matrix[:, holdout_mask], - target[holdout_mask], - weights, - loss_arrays=holdout_loss_arrays, - ), - "weight_sum": float(weights.sum()), - "positive_household_count": int((weights > 1e-9).sum()), - } - ) - - optimized_weights, 
optimizer_summary = optimize_pe_native_loss_weights( - scaled_matrix=matrix[:, train_mask], - scaled_target=target[train_mask], - initial_weights=initial_weights, - loss_arrays=train_loss_arrays, - budget=None, - max_iter=max_iter, - l2_penalty=0.0, - tol=tol, - history_callback=record_loss_curve, - ) - rewrite_policyengine_us_dataset_weights( - input_dataset_path=input_dataset_path, - output_dataset_path=output_dataset_path, - household_weights=optimized_weights, - period=period, - ) - return { - "input_dataset": str(input_dataset_path.resolve()), - "output_dataset": str(output_dataset_path.resolve()), - "initial_full_loss": _objective( - matrix, - target, - initial_weights, - loss_arrays=loss_arrays, - ), - "optimized_full_loss": _objective( - matrix, - target, - optimized_weights, - loss_arrays=loss_arrays, - ), - "initial_train_loss": _objective( - matrix[:, train_mask], - target[train_mask], - initial_weights, - loss_arrays=train_loss_arrays, - ), - "optimized_train_loss": _objective( - matrix[:, train_mask], - target[train_mask], - optimized_weights, - loss_arrays=train_loss_arrays, - ), - "initial_holdout_loss": _objective( - matrix[:, holdout_mask], - target[holdout_mask], - initial_weights, - loss_arrays=holdout_loss_arrays, - ), - "optimized_holdout_loss": _objective( - matrix[:, holdout_mask], - target[holdout_mask], - optimized_weights, - loss_arrays=holdout_loss_arrays, - ), - "initial_weight_sum": float(initial_weights.sum()), - "optimized_weight_sum": float(optimized_weights.sum()), - "household_count": int(len(optimized_weights)), - "positive_household_count": int((optimized_weights > 1e-9).sum()), - "optimizer_summary": optimizer_summary, - "loss_curve": loss_curve, - "optimized_weights": optimized_weights, - } - - -def _objective( - matrix: np.ndarray, - target: np.ndarray, - weights: np.ndarray, - *, - loss_arrays: Any | None = None, -) -> float: - estimate = matrix.T @ weights - if loss_arrays is not None: - return pe_native_huber_loss(estimate, 
loss_arrays) - residual = estimate - target - return float(np.dot(residual, residual)) - - -def _protected_family_losses( - *, - target_names: list[str], - candidate_inputs: dict[str, Any], - baseline_inputs: dict[str, Any], - candidate_weights: np.ndarray, - baseline_weights: np.ndarray, -) -> dict[str, dict[str, float | int]]: - candidate_terms = _loss_terms(candidate_inputs, candidate_weights) - baseline_terms = _loss_terms(baseline_inputs, baseline_weights) - rows: dict[str, dict[str, float | int]] = {} - for family, patterns in _PROTECTED_TARGET_PATTERNS.items(): - indices = [ - index - for index, name in enumerate(target_names) - if _target_matches_protected_family(name, family, patterns) - ] - if not indices: - continue - candidate_loss = float(candidate_terms[indices].sum()) - baseline_loss = float(baseline_terms[indices].sum()) - rows[family] = { - "n_targets": int(len(indices)), - "candidate_loss": candidate_loss, - "baseline_loss": baseline_loss, - "loss_delta": candidate_loss - baseline_loss, - } - return rows - - -def _target_loss_diagnostics( - *, - target_names: list[str], - candidate_inputs: dict[str, Any], - baseline_inputs: dict[str, Any], - candidate_weights: np.ndarray, - baseline_weights: np.ndarray, - holdout_mask: np.ndarray, - top_k: int, -) -> dict[str, Any]: - candidate_terms = _loss_terms(candidate_inputs, candidate_weights) - baseline_terms = _loss_terms(baseline_inputs, baseline_weights) - candidate_values = _target_value_diagnostics( - candidate_inputs, - candidate_weights, - ) - baseline_values = _target_value_diagnostics( - baseline_inputs, - baseline_weights, - ) - if not np.array_equal( - candidate_values["value_scale"], - baseline_values["value_scale"], - ): - raise ValueError("candidate and baseline target diagnostic scales differ") - if not np.allclose(candidate_values["target"], baseline_values["target"]): - raise ValueError("candidate and baseline target diagnostic values differ") - for key in ("loss_denominator", 
"loss_target_weight"): - if not np.allclose(candidate_values[key], baseline_values[key]): - raise ValueError(f"candidate and baseline target diagnostic {key} differ") - for key in ("loss_bucket", "loss_unit", "loss_scope"): - if not np.array_equal(candidate_values[key], baseline_values[key]): - raise ValueError(f"candidate and baseline target diagnostic {key} differ") - if candidate_terms.shape != baseline_terms.shape: - raise ValueError("candidate and baseline target loss term shapes differ") - if len(target_names) != candidate_terms.shape[0]: - raise ValueError("target name count does not match loss terms") - if holdout_mask.shape[0] != candidate_terms.shape[0]: - raise ValueError("holdout mask length does not match loss terms") - - rows: list[dict[str, Any]] = [] - candidate_wins = 0 - baseline_wins = 0 - ties = 0 - candidate_loss_total = float(candidate_terms.sum()) - baseline_loss_total = float(baseline_terms.sum()) - for index, target_name in enumerate(target_names): - candidate_loss = float(candidate_terms[index]) - baseline_loss = float(baseline_terms[index]) - loss_delta = candidate_loss - baseline_loss - if np.isclose(candidate_loss, baseline_loss): - winner = "tie" - ties += 1 - elif candidate_loss < baseline_loss: - winner = "candidate" - candidate_wins += 1 - else: - winner = "baseline" - baseline_wins += 1 - rows.append( - { - "target_index": int(index), - "target_name": str(target_name), - "family": classify_pe_native_target_family(target_name), - "loss_scope": str(candidate_values["loss_scope"][index]), - "loss_unit": str(candidate_values["loss_unit"][index]), - "loss_bucket": str(candidate_values["loss_bucket"][index]), - "loss_denominator": float(candidate_values["loss_denominator"][index]), - "loss_target_weight": float( - candidate_values["loss_target_weight"][index] - ), - "loss_epsilon": float(candidate_values["loss_epsilon"][index]), - "split": "holdout" if bool(holdout_mask[index]) else "train", - "value_scale": 
str(candidate_values["value_scale"][index]), - "target_value": float(candidate_values["target"][index]), - "candidate_estimate": float(candidate_values["estimate"][index]), - "baseline_estimate": float(baseline_values["estimate"][index]), - "candidate_error": float(candidate_values["error"][index]), - "baseline_error": float(baseline_values["error"][index]), - "candidate_relative_error": float( - candidate_values["relative_error"][index] - ), - "baseline_relative_error": float( - baseline_values["relative_error"][index] - ), - "candidate_loss_term": candidate_loss, - "baseline_loss_term": baseline_loss, - "candidate_loss_share": ( - candidate_loss / candidate_loss_total - if candidate_loss_total > 0.0 - else 0.0 - ), - "baseline_loss_share": ( - baseline_loss / baseline_loss_total - if baseline_loss_total > 0.0 - else 0.0 - ), - "loss_delta": float(loss_delta), - "candidate_abs_scaled_error": float(np.sqrt(candidate_loss)), - "baseline_abs_scaled_error": float(np.sqrt(baseline_loss)), - "winner": winner, - } - ) - - top_k = max(0, int(top_k)) - regressions = sorted( - rows, - key=lambda row: float(row["loss_delta"]), - reverse=True, - )[:top_k] - improvements = sorted(rows, key=lambda row: float(row["loss_delta"]))[:top_k] - summary = { - "n_targets": int(len(rows)), - "candidate_loss": candidate_loss_total, - "baseline_loss": baseline_loss_total, - "loss_delta": float(candidate_loss_total - baseline_loss_total), - "candidate_max_single_target_loss_share": ( - float(candidate_terms.max() / candidate_loss_total) - if candidate_loss_total > 0.0 and candidate_terms.size - else 0.0 - ), - "baseline_max_single_target_loss_share": ( - float(baseline_terms.max() / baseline_loss_total) - if baseline_loss_total > 0.0 and baseline_terms.size - else 0.0 - ), - "candidate_wins": int(candidate_wins), - "baseline_wins": int(baseline_wins), - "ties": int(ties), - "train_targets": int((~holdout_mask).sum()), - "holdout_targets": int(holdout_mask.sum()), - "top_k": int(top_k), - } 
- return { - "schema_version": 1, - "metric": "sound_ecps_target_loss_diagnostics", - "summary": summary, - "family_breakdown": _target_family_breakdown(rows, len(rows)), - "bucket_breakdown": _target_bucket_breakdown(rows), - "top_regressions": regressions, - "top_improvements": improvements, - "targets": rows, - } - - -def _refit_matrix_score_summary( - *, - target_names: list[str], - candidate_inputs: dict[str, Any], - baseline_inputs: dict[str, Any], - candidate_refit: dict[str, Any], - baseline_refit: dict[str, Any], - target_diagnostics: dict[str, Any], -) -> dict[str, Any]: - candidate_loss = float(candidate_refit["optimized_full_loss"]) - baseline_loss = float(baseline_refit["optimized_full_loss"]) - candidate_msre = _diagnostic_unweighted_msre(target_diagnostics, "candidate") - baseline_msre = _diagnostic_unweighted_msre(target_diagnostics, "baseline") - candidate_metadata = dict(candidate_inputs.get("metadata") or {}) - baseline_metadata = dict(baseline_inputs.get("metadata") or {}) - loss_metric = str( - candidate_metadata.get( - "loss_metric", - baseline_metadata.get("loss_metric", "enhanced_cps_native_loss"), - ) - ) - n_targets_kept = int( - candidate_metadata.get( - "n_targets_kept", - baseline_metadata.get("n_targets_kept", len(target_names)), - ) - ) - summary: dict[str, Any] = { - "loss_metric": loss_metric, - "loss_config": candidate_metadata.get( - "loss_config", - baseline_metadata.get("loss_config"), - ), - "candidate_enhanced_cps_native_loss": candidate_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": candidate_loss - baseline_loss, - "candidate_beats_baseline": candidate_loss < baseline_loss, - "candidate_unweighted_msre": candidate_msre, - "baseline_unweighted_msre": baseline_msre, - "unweighted_msre_delta": candidate_msre - baseline_msre, - "n_targets_kept": n_targets_kept, - "score_source": "refit_loss_matrix", - "candidate_max_single_target_loss_share": target_diagnostics["summary"].get( - 
"candidate_max_single_target_loss_share" - ), - "baseline_max_single_target_loss_share": target_diagnostics["summary"].get( - "baseline_max_single_target_loss_share" - ), - } - for key in ( - "n_targets_total", - "n_targets_zero_dropped", - "n_targets_bad_dropped", - "n_national_targets", - "n_state_targets", - ): - if key in candidate_metadata: - summary[key] = candidate_metadata[key] - elif key in baseline_metadata: - summary[key] = baseline_metadata[key] - return summary - - -def _refit_matrix_score_payload( - *, - period: int, - candidate_dataset_path: Path, - baseline_dataset_path: Path, - summary: dict[str, Any], - target_diagnostics: dict[str, Any], -) -> dict[str, Any]: - family_breakdown = list(target_diagnostics.get("family_breakdown") or ()) - return { - "metric": str(summary.get("loss_metric") or "enhanced_cps_native_loss"), - "score_source": "refit_loss_matrix", - "period": int(period), - "candidate_dataset": str(candidate_dataset_path.resolve()), - "baseline_dataset": str(baseline_dataset_path.resolve()), - "summary": dict(summary), - "family_breakdown": family_breakdown, - "broad_loss": { - "score_source": "refit_loss_matrix", - "summary": dict(summary), - "family_breakdown": family_breakdown, - }, - } - - -def _diagnostic_unweighted_msre( - target_diagnostics: dict[str, Any], - prefix: str, -) -> float: - rows = list(target_diagnostics.get("targets") or ()) - if not rows: - return float("nan") - values = np.asarray( - [float(row[f"{prefix}_relative_error"]) for row in rows], - dtype=np.float64, - ) - return float(np.mean(np.square(values))) - - -def _target_value_diagnostics( - loss_inputs: dict[str, Any], - weights: np.ndarray, -) -> dict[str, np.ndarray]: - matrix = np.asarray(loss_inputs["scaled_matrix"], dtype=np.float64) - scaled_target = np.asarray(loss_inputs["scaled_target"], dtype=np.float64) - scaled_estimate = matrix.T @ weights - loss_arrays = loss_arrays_from_inputs(loss_inputs) - if loss_arrays is not None: - target = 
loss_arrays.target_values.astype(np.float64, copy=True) - estimate = scaled_estimate.astype(np.float64, copy=True) - error = estimate - loss_arrays.objective_target - return { - "value_scale": np.full(target.shape, "native", dtype=object), - "target": target, - "estimate": estimate, - "error": error, - "relative_error": pe_native_relative_error(estimate, loss_arrays), - "loss_denominator": loss_arrays.denominator, - "loss_target_weight": loss_arrays.target_weight, - "loss_bucket": loss_arrays.bucket_keys, - "loss_unit": loss_arrays.unit_keys, - "loss_scope": loss_arrays.scope_keys, - "loss_family": loss_arrays.family_keys, - "loss_epsilon": loss_arrays.epsilon, - } - unscaled_target = loss_inputs.get("unscaled_target") - scaling = loss_inputs.get("scaling") - target = scaled_target.astype(np.float64, copy=True) - estimate = scaled_estimate.astype(np.float64, copy=True) - value_scale = np.full(target.shape, "scaled", dtype=object) - if unscaled_target is not None and scaling is not None: - scaling_array = np.asarray(scaling, dtype=np.float64) - if scaling_array.shape != target.shape: - raise ValueError("PE-native target scaling shape differs from target shape") - native_mask = np.isfinite(scaling_array) & ~np.isclose(scaling_array, 0.0) - if native_mask.any(): - target[native_mask] = np.asarray(unscaled_target, dtype=np.float64)[ - native_mask - ] - estimate[native_mask] = ( - scaled_estimate[native_mask] / scaling_array[native_mask] - ) - value_scale[native_mask] = "native" - if target.shape != estimate.shape: - raise ValueError("target and estimate shapes differ") - error = estimate - target - relative_error = ((estimate - target) + 1.0) / (target + 1.0) - return { - "value_scale": value_scale, - "target": target, - "estimate": estimate, - "error": error, - "relative_error": relative_error, - "loss_denominator": np.abs(target) + 1.0, - "loss_target_weight": np.ones(target.shape, dtype=np.float64), - "loss_bucket": np.full(target.shape, "legacy", dtype=object), - 
"loss_unit": np.full(target.shape, "legacy", dtype=object), - "loss_scope": np.full(target.shape, "legacy", dtype=object), - "loss_family": np.full(target.shape, "legacy", dtype=object), - "loss_epsilon": np.ones(target.shape, dtype=np.float64), - } - - -def _target_family_breakdown( - target_rows: list[dict[str, Any]], - total_targets: int, -) -> list[dict[str, Any]]: - families: dict[str, list[dict[str, Any]]] = {} - for row in target_rows: - families.setdefault(str(row["family"]), []).append(row) - breakdown = [] - for family, rows in sorted(families.items()): - candidate_loss = sum(float(row["candidate_loss_term"]) for row in rows) - baseline_loss = sum(float(row["baseline_loss_term"]) for row in rows) - breakdown.append( - { - "family": family, - "n_targets": int(len(rows)), - "train_targets": int(sum(1 for row in rows if row["split"] == "train")), - "holdout_targets": int( - sum(1 for row in rows if row["split"] == "holdout") - ), - "candidate_loss_contribution": float(candidate_loss), - "baseline_loss_contribution": float(baseline_loss), - "loss_delta": float(candidate_loss - baseline_loss), - "candidate_wins": int( - sum(1 for row in rows if row["winner"] == "candidate") - ), - "baseline_wins": int( - sum(1 for row in rows if row["winner"] == "baseline") - ), - "ties": int(sum(1 for row in rows if row["winner"] == "tie")), - } - ) - return sorted( - breakdown, key=lambda row: abs(float(row["loss_delta"])), reverse=True - ) - - -def _target_bucket_breakdown(target_rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - buckets: dict[str, list[dict[str, Any]]] = {} - for row in target_rows: - buckets.setdefault(str(row["loss_bucket"]), []).append(row) - breakdown = [] - for bucket, rows in sorted(buckets.items()): - candidate_loss = sum(float(row["candidate_loss_term"]) for row in rows) - baseline_loss = sum(float(row["baseline_loss_term"]) for row in rows) - breakdown.append( - { - "bucket": bucket, - "scope": str(rows[0]["loss_scope"]), - "unit": 
str(rows[0]["loss_unit"]), - "n_targets": int(len(rows)), - "train_targets": int(sum(1 for row in rows if row["split"] == "train")), - "holdout_targets": int( - sum(1 for row in rows if row["split"] == "holdout") - ), - "candidate_loss_contribution": float(candidate_loss), - "baseline_loss_contribution": float(baseline_loss), - "loss_delta": float(candidate_loss - baseline_loss), - "candidate_target_weight_sum": float( - sum(float(row["loss_target_weight"]) for row in rows) - ), - } - ) - return sorted( - breakdown, key=lambda row: abs(float(row["loss_delta"])), reverse=True - ) - - -def _support_audit_summary(support_audit: dict[str, Any]) -> dict[str, Any]: - comparisons = dict(support_audit.get("comparisons") or {}) - critical_rows = list(comparisons.get("critical_input_support") or ()) - missing_stored = [ - row["variable"] - for row in critical_rows - if bool(row.get("baseline_stored")) and not bool(row.get("candidate_stored")) - ] - return { - "missing_stored_critical_inputs": missing_stored, - "top_critical_input_support_gaps": _sort_rows_by_abs_delta( - critical_rows, - "weighted_nonzero_delta", - ), - "top_filing_status_gaps": _sort_rows_by_abs_delta( - list(comparisons.get("filing_status_weighted_delta") or ()), - "weighted_count_delta", - ), - "top_hoh_agi_gaps": _sort_rows_by_abs_delta( - list(comparisons.get("hoh_agi_delta") or ()), - "weighted_count_delta", - ), - "top_ssi_by_age_gaps": _sort_rows_by_abs_delta( - list(comparisons.get("ssi_by_age_delta") or ()), - "weighted_recipient_delta", - ), - "top_medicare_part_b_by_age_gaps": _sort_rows_by_abs_delta( - list(comparisons.get("medicare_part_b_premiums_by_age_delta") or ()), - "weighted_positive_delta", - ), - "top_aca_ptc_spending_gaps": _sort_rows_by_abs_delta( - list(comparisons.get("state_aca_ptc_spending_top_gaps") or ()), - "weighted_aca_ptc_delta", - ), - } - - -def _sort_rows_by_abs_delta( - rows: list[dict[str, Any]], - delta_key: str, - *, - limit: int = 10, -) -> list[dict[str, Any]]: - 
return sorted( - rows, - key=lambda row: abs(float(row.get(delta_key, 0.0))), - reverse=True, - )[:limit] - - -def _loss_terms(loss_inputs: dict[str, Any], weights: np.ndarray) -> np.ndarray: - matrix = np.asarray(loss_inputs["scaled_matrix"], dtype=np.float64) - target = np.asarray(loss_inputs["scaled_target"], dtype=np.float64) - estimate = matrix.T @ weights - loss_arrays = loss_arrays_from_inputs(loss_inputs) - if loss_arrays is not None: - return pe_native_huber_loss_terms(estimate, loss_arrays) - residual = estimate - target - return np.square(residual) - - -def _target_matches_protected_family( - target_name: str, - family: str, - patterns: tuple[str, ...], -) -> bool: - normalized = ( - target_name.lower().replace("-", "_").replace(" ", "_").replace("/", "_") - ) - if family == "wages" and ( - "self_employment" in normalized or "business_income" in normalized - ): - return False - return any(pattern in normalized for pattern in patterns) - - -def _absolute_difference(left: Any, right: Any) -> float | None: - if left is None or right is None: - return None - return abs(float(left) - float(right)) - - -def _strip_weights(payload: dict[str, Any]) -> dict[str, Any]: - stripped = dict(payload) - stripped.pop("optimized_weights", None) - return stripped - - -def _dataset_descriptor(path: Path) -> dict[str, Any]: - return { - "path": str(path.resolve()), - "sha256": _sha256(path), - "size_bytes": int(path.stat().st_size), - } - - -def _sha256(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def _frozen_ecps_baseline_certificate( - *, - baseline_dataset_path: Path, - policyengine_targets_db_path: Path | None, - policyengine_us_data_repo: str | Path | None, - policyengine_us_version: str, - period: int, - target_names: list[str], - target_scope: str, - holdout_target_fraction: float, - holdout_target_seed: int, - 
matched_sample_method: str, - refit_config: dict[str, Any], - skip_tax_expenditure_targets: bool, - exact_rescore: bool, - score_source: str, - baseline_sanity: dict[str, Any], - score_summary: dict[str, Any], -) -> dict[str, Any]: - """Freeze the eCPS baseline surface used for this numeric verdict. - - Promotion gates consume this certificate and compare it to the pinned - benchmark manifest. That prevents a release from passing on a live - recomputation against a different eCPS H5, target DB, scorer checkout, or - scoring config. - """ - - scoring_config = { - "period": int(period), - "target_profile": "pe_native_broad", - "target_scope": str(target_scope), - "holdout_target_fraction": float(holdout_target_fraction), - "holdout_target_seed": int(holdout_target_seed), - "matched_sample_method": str(matched_sample_method), - "refit_config": dict(refit_config), - "skip_tax_expenditure_targets": bool(skip_tax_expenditure_targets), - "exact_rescore": bool(exact_rescore), - "score_source": str(score_source), - "comparison_bad_targets": list(_comparison_bad_targets()), - } - baseline_metrics = { - key: score_summary.get(key) - for key in ( - "baseline_initial_enhanced_cps_native_loss", - "baseline_enhanced_cps_native_loss", - "baseline_train_loss", - "baseline_holdout_loss", - "baseline_unweighted_msre", - "n_targets_kept", - "n_national_targets", - "n_state_targets", - ) - if score_summary.get(key) is not None - } - return { - "schema_version": 1, - "certificate_type": "frozen_production_ecps_baseline", - "period": int(period), - "baseline_dataset": _dataset_descriptor(baseline_dataset_path), - "target_db": ( - _dataset_descriptor(policyengine_targets_db_path) - if policyengine_targets_db_path is not None - else None - ), - "policyengine_us_data": _git_repo_descriptor(policyengine_us_data_repo), - "policyengine_us": {"version": str(policyengine_us_version)}, - "target_surface": { - "target_profile": "pe_native_broad", - "target_scope": str(target_scope), - 
"target_count": int(len(target_names)), - "target_names_sha256": _canonical_json_sha256(list(target_names)), - }, - "scoring_config": { - **scoring_config, - "sha256": _canonical_json_sha256(scoring_config), - }, - "baseline_metrics": baseline_metrics, - "baseline_sanity": dict(baseline_sanity), - } - - -def _assert_certificate_matches_benchmark_manifest( - certificate: dict[str, Any], - benchmark_manifest_path: str | Path, -) -> dict[str, Any]: - """Fail before writing a comparison if it is not on the pinned surface.""" - - manifest_path = Path(benchmark_manifest_path).expanduser().resolve() - if not manifest_path.exists(): - raise FileNotFoundError(f"benchmark manifest not found: {manifest_path}") - try: - manifest = json.loads(manifest_path.read_text()) - except json.JSONDecodeError as exc: - raise ComparisonGateError( - f"benchmark manifest is not valid JSON: {manifest_path}: {exc}" - ) from exc - - manifest_evidence = _benchmark_manifest_evidence(manifest) - certificate_evidence = _benchmark_manifest_evidence(certificate) - missing = [ - field - for field, value in manifest_evidence.items() - if not _valid_benchmark_evidence_value(field, value) - ] - mismatches = [ - { - "field": field, - "benchmark_manifest_value": expected, - "certificate_value": certificate_evidence.get(field), - } - for field, expected in manifest_evidence.items() - if _valid_benchmark_evidence_value(field, expected) - and str(certificate_evidence.get(field)) != str(expected) - ] - if missing or mismatches: - problems = [] - if missing: - problems.append("missing manifest evidence: " + ", ".join(missing)) - if mismatches: - problems.append( - "mismatched evidence: " - + ", ".join(str(item["field"]) for item in mismatches) - ) - raise ComparisonGateError( - "Comparison does not match pinned production eCPS benchmark manifest; " - + "; ".join(problems) - ) - - return { - **_dataset_descriptor(manifest_path), - "certificate_match": { - "status": "passed", - "checked_evidence": 
manifest_evidence, - }, - } - - -def _assert_certificate_uses_frozen_production_pins( - certificate: dict[str, Any], -) -> None: - evidence = _benchmark_manifest_evidence(certificate) - mismatches = frozen_production_pin_mismatches(evidence) - if not mismatches: - return - details = ", ".join(str(item["field"]) for item in mismatches) - raise ComparisonGateError( - "Comparison does not use the release-pinned production eCPS " - f"baseline/target surface; mismatched pins: {details}" - ) - - -def _benchmark_manifest_evidence(payload: dict[str, Any]) -> dict[str, Any]: - return { - field: _first_nested_path_value(payload, paths) - for field, paths in _BENCHMARK_MANIFEST_EVIDENCE_PATHS.items() - } - - -def _first_nested_path_value( - payload: dict[str, Any], - paths: tuple[tuple[str, ...], ...], -) -> Any: - for path in paths: - current: Any = payload - for part in path: - if not isinstance(current, dict) or part not in current: - current = None - break - current = current[part] - if current is not None: - return current - return None - - -def _valid_benchmark_evidence_value(field: str, value: Any) -> bool: - if value is None: - return False - if isinstance(value, str) and not value: - return False - if field.endswith(".sha256"): - return isinstance(value, str) and len(value) == 64 - if field.endswith(".commit"): - return isinstance(value, str) and len(value) >= 7 - if field.endswith(".target_count"): - try: - return int(value) > 0 - except (TypeError, ValueError): - return False - if field.startswith("baseline_metrics."): - try: - return np.isfinite(float(value)) - except (TypeError, ValueError): - return False - return True - - -def _installed_policyengine_us_version() -> str: - try: - return importlib.metadata.version("policyengine-us") - except importlib.metadata.PackageNotFoundError as exc: - raise ValueError("policyengine-us is not installed") from exc - - -def _git_repo_descriptor(repo_path: str | Path | None) -> dict[str, Any] | None: - if repo_path is None: - 
return None - repo = Path(repo_path).expanduser().resolve() - descriptor: dict[str, Any] = {"repo": str(repo)} - commit = _git_output_or_none(repo, "rev-parse", "HEAD") - if commit: - descriptor["commit"] = commit - status = _git_output_or_none(repo, "status", "--porcelain") - if status is not None: - descriptor["dirty"] = bool(status) - return descriptor - - -def _git_output_or_none(repo: Path, *args: str) -> str | None: - completed = subprocess.run( - ["git", "-C", str(repo), *args], - check=False, - capture_output=True, - text=True, - ) - if completed.returncode != 0: - return None - return completed.stdout.strip() - - -def _canonical_json_sha256(payload: Any) -> str: - encoded = json.dumps( - payload, - sort_keys=True, - separators=(",", ":"), - default=str, - ).encode("utf-8") - return hashlib.sha256(encoded).hexdigest() - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description=( - "Build a sound Microplex-vs-eCPS replacement comparison payload " - "for mp-300k artifact gates." 
- ) - ) - parser.add_argument("--candidate-dataset", required=True) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--output-dir", required=True) - parser.add_argument( - "--output-path", - help="Defaults to /sound_ecps_replacement_comparison.json.", - ) - parser.add_argument( - "--target-diagnostics-path", - help="Defaults to /target_loss_diagnostics.json.", - ) - parser.add_argument( - "--support-audit-path", - help="Defaults to /support_audit.json when enabled.", - ) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--matched-household-count", type=int) - parser.add_argument("--random-seed", type=int, default=20260529) - parser.add_argument( - "--matched-sample-method", - choices=("uniform", "weight_proportional", "pps", "largest_weight"), - default="uniform", - help=( - "Household thinning method used when matching a larger dataset down " - "to the comparison household count." - ), - ) - parser.add_argument("--holdout-target-fraction", type=float, default=0.2) - parser.add_argument("--holdout-target-seed", type=int, default=20260529) - parser.add_argument("--optimizer-max-iter", type=int, default=200) - parser.add_argument("--optimizer-tol", type=float, default=1e-8) - parser.add_argument("--score-consistency-tol", type=float, default=1e-6) - parser.add_argument("--target-diagnostics-top-k", type=int, default=50) - parser.add_argument( - "--skip-support-audit", - action="store_true", - help="Skip the PE-native support audit sidecar.", - ) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument( - "--policyengine-targets-db", - help=( - "Explicit policy_data.db to use for PE-native comparison scoring. " - "The scorer subprocess copies this DB into a temporary PE-US-data " - "storage folder so the target surface is pinned." 
- ), - ) - parser.add_argument( - "--benchmark-manifest", - help=( - "Pre-existing frozen benchmark manifest to enforce before writing " - "the comparison. The comparison certificate must match its baseline " - "H5, target DB, target surface, scorer checkout, PolicyEngine-US " - "version, and scoring config." - ), - ) - parser.add_argument("--skip-tax-expenditure-targets", action="store_true") - parser.add_argument( - "--target-scope", - choices=("all", "national", "state"), - default="all", - help="Restrict the PE-native refit/scoring surface by target scope.", - ) - parser.add_argument( - "--exact-rescore", - action="store_true", - help=( - "After symmetric refit, recompute the PE-native loss by rebuilding " - "PolicyEngine loss matrices for the refit H5s. This is an audit " - "path and can take hours on local machines; by default the " - "comparison uses the already-extracted refit loss matrices." - ), - ) - parser.add_argument("--force", action="store_true") - parser.add_argument( - "--no-assert-refit-effective", - dest="assert_refit_effective", - action="store_false", - help="Skip the refit-effectiveness gate (allow a no-op refit).", - ) - parser.add_argument( - "--no-assert-baseline-sane", - dest="assert_baseline_sane", - action="store_false", - help="Skip the baseline-sanity gate (allow a mis-scored production baseline).", - ) - parser.add_argument( - "--baseline-sanity-mode", - choices=_BASELINE_SANITY_MODES, - default="msre", - help=( - "Baseline-sanity gate to use. 'msre' requires production eCPS to " - "score below --max-baseline-unweighted-msre on this exact surface. " - "'content' verifies required production eCPS H5 columns are present " - "and nonzero, for broad target surfaces where high eCPS loss is " - "part of the comparison signal." 
- ), - ) - parser.add_argument( - "--max-baseline-unweighted-msre", - type=float, - default=2.0, - help="Baseline-sanity gate ceiling on the production eCPS unweighted MSRE.", - ) - parser.add_argument( - "--min-refit-loss-reduction", - type=float, - default=1e-9, - help="Minimum loss reduction required by the refit-effectiveness gate.", - ) - parser.add_argument( - "--allow-noncanonical-production-pins", - action="store_true", - help=( - "Allow an experimental comparison to emit a frozen-production " - "certificate with noncanonical baseline/target pins. Release gates " - "still reject it." - ), - ) - args = parser.parse_args(argv) - - output_dir = Path(args.output_dir).expanduser() - output_path = ( - Path(args.output_path).expanduser() - if args.output_path - else output_dir / "sound_ecps_replacement_comparison.json" - ) - written = write_sound_ecps_replacement_comparison( - output_path, - target_diagnostics_path=args.target_diagnostics_path, - support_audit_path=args.support_audit_path, - candidate_dataset_path=args.candidate_dataset, - baseline_dataset_path=args.baseline_dataset, - output_dir=output_dir, - period=args.period, - matched_household_count=args.matched_household_count, - random_seed=args.random_seed, - matched_sample_method=args.matched_sample_method, - holdout_target_fraction=args.holdout_target_fraction, - holdout_target_seed=args.holdout_target_seed, - optimizer_max_iter=args.optimizer_max_iter, - optimizer_tol=args.optimizer_tol, - score_consistency_tol=args.score_consistency_tol, - target_diagnostics_top_k=args.target_diagnostics_top_k, - include_support_audit=not args.skip_support_audit, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - policyengine_targets_db_path=args.policyengine_targets_db, - skip_tax_expenditure_targets=args.skip_tax_expenditure_targets, - target_scope=args.target_scope, - exact_rescore=args.exact_rescore, - force=args.force, - 
assert_refit_effective=args.assert_refit_effective, - min_refit_loss_reduction=args.min_refit_loss_reduction, - assert_baseline_sane=args.assert_baseline_sane, - baseline_sanity_mode=args.baseline_sanity_mode, - max_baseline_unweighted_msre=args.max_baseline_unweighted_msre, - benchmark_manifest_path=args.benchmark_manifest, - enforce_production_pins=not args.allow_noncanonical_production_pins, - ) - print(str(written)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) - - -__all__ = [ - "build_sound_ecps_replacement_comparison", - "write_sound_ecps_replacement_comparison", -] diff --git a/src/microplex_us/pipelines/experiments.py b/src/microplex_us/pipelines/experiments.py deleted file mode 100644 index b4ba6ae4..00000000 --- a/src/microplex_us/pipelines/experiments.py +++ /dev/null @@ -1,860 +0,0 @@ -"""Experiment runners for PE-US parity optimization.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass, field, replace -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -from microplex.core import SourceProvider, SourceQuery - -from microplex_us.pipelines.artifacts import ( - USMicroplexArtifactPaths, - build_and_save_versioned_us_microplex_from_source_providers, - save_versioned_us_microplex_build_result, -) -from microplex_us.pipelines.backfill_pe_native_audit import ( - backfill_us_pe_native_audit_bundles, -) -from microplex_us.pipelines.backfill_pe_native_scores import ( - backfill_us_pe_native_scores_bundles, -) -from microplex_us.pipelines.performance import ( - USMicroplexPerformanceHarnessConfig, - USMicroplexPerformanceSession, -) -from microplex_us.pipelines.registry import ( - FrontierMetric, - USMicroplexRunRegistryEntry, - load_us_microplex_run_registry, - select_us_microplex_frontier_entry, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.us import USMicroplexBuildConfig 
-from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, -) - - -@dataclass(frozen=True) -class USMicroplexSourceExperimentSpec: - """One named source-mix experiment to run through the PE-US parity harness.""" - - name: str - providers: tuple[SourceProvider, ...] - config: USMicroplexBuildConfig | None = None - queries: dict[str, SourceQuery] = field(default_factory=dict) - metadata: dict[str, Any] = field(default_factory=dict) - - -def default_us_source_mix_experiments( - *, - cps_provider: SourceProvider, - base_config: USMicroplexBuildConfig | None = None, - cps_query: SourceQuery | None = None, - puf_provider: SourceProvider | None = None, - puf_query: SourceQuery | None = None, - psid_provider: SourceProvider | None = None, - psid_query: SourceQuery | None = None, -) -> tuple[USMicroplexSourceExperimentSpec, ...]: - """Build a standard ladder of US source-mix experiments.""" - experiments = [ - USMicroplexSourceExperimentSpec( - name="cps-only", - providers=(cps_provider,), - config=base_config, - queries=( - {cps_provider.descriptor.name: cps_query} - if cps_query is not None - else {} - ), - metadata={"sources": [cps_provider.descriptor.name]}, - ) - ] - - if puf_provider is not None: - experiments.append( - USMicroplexSourceExperimentSpec( - name="cps+puf", - providers=(cps_provider, puf_provider), - config=base_config, - queries={ - **( - {cps_provider.descriptor.name: cps_query} - if cps_query is not None - else {} - ), - **( - {puf_provider.descriptor.name: puf_query} - if puf_query is not None - else {} - ), - }, - metadata={ - "sources": [ - cps_provider.descriptor.name, - puf_provider.descriptor.name, - ] - }, - ) - ) - - if psid_provider is not None: - experiments.append( - USMicroplexSourceExperimentSpec( - name="cps+psid", - providers=(cps_provider, psid_provider), - config=base_config, - queries={ - **( - {cps_provider.descriptor.name: cps_query} - if cps_query is not None - else {} - ), - **( 
- {psid_provider.descriptor.name: psid_query} - if psid_query is not None - else {} - ), - }, - metadata={ - "sources": [ - cps_provider.descriptor.name, - psid_provider.descriptor.name, - ] - }, - ) - ) - - if puf_provider is not None and psid_provider is not None: - experiments.append( - USMicroplexSourceExperimentSpec( - name="cps+puf+psid", - providers=(cps_provider, puf_provider, psid_provider), - config=base_config, - queries={ - **( - {cps_provider.descriptor.name: cps_query} - if cps_query is not None - else {} - ), - **( - {puf_provider.descriptor.name: puf_query} - if puf_query is not None - else {} - ), - **( - {psid_provider.descriptor.name: psid_query} - if psid_query is not None - else {} - ), - }, - metadata={ - "sources": [ - cps_provider.descriptor.name, - puf_provider.descriptor.name, - psid_provider.descriptor.name, - ] - }, - ) - ) - - return tuple(experiments) - - -def build_us_n_synthetic_sweep_experiments( - experiment: USMicroplexSourceExperimentSpec, - n_synthetic_values: tuple[int, ...] 
| list[int], - *, - name_template: str = "{base_name}-n{n_synthetic}", -) -> tuple[USMicroplexSourceExperimentSpec, ...]: - """Expand one experiment into a deterministic n_synthetic sweep.""" - if not n_synthetic_values: - raise ValueError( - "build_us_n_synthetic_sweep_experiments requires at least one n_synthetic value" - ) - - base_config = experiment.config or USMicroplexBuildConfig() - seen_values: set[int] = set() - sweep_experiments: list[USMicroplexSourceExperimentSpec] = [] - for raw_value in n_synthetic_values: - n_synthetic = int(raw_value) - if n_synthetic <= 0: - raise ValueError("n_synthetic sweep values must be positive integers") - if n_synthetic in seen_values: - raise ValueError( - f"Duplicate n_synthetic sweep value supplied: {n_synthetic}" - ) - seen_values.add(n_synthetic) - sweep_experiments.append( - USMicroplexSourceExperimentSpec( - name=name_template.format( - base_name=experiment.name, - n_synthetic=n_synthetic, - ), - providers=experiment.providers, - config=replace(base_config, n_synthetic=n_synthetic), - queries=dict(experiment.queries), - metadata={ - **dict(experiment.metadata), - "base_experiment_name": experiment.name, - "n_synthetic": n_synthetic, - "sweep_parameter": "n_synthetic", - }, - ) - ) - return tuple(sweep_experiments) - - -@dataclass(frozen=True) -class USMicroplexExperimentResult: - """Persistable summary for one completed source-mix experiment.""" - - name: str - artifact_paths: USMicroplexArtifactPaths - frontier_metric: FrontierMetric - frontier_delta: float | None - current_entry: USMicroplexRunRegistryEntry | None = None - frontier_entry: USMicroplexRunRegistryEntry | None = None - metadata: dict[str, Any] = field(default_factory=dict) - - @property - def metric_value(self) -> float | None: - if self.current_entry is None: - return None - return getattr(self.current_entry, self.frontier_metric, None) - - @property - def source_names(self) -> tuple[str, ...]: - if self.current_entry is not None and 
self.current_entry.source_names: - return self.current_entry.source_names - return () - - def to_dict(self) -> dict[str, Any]: - """Serialize the experiment result to a JSON-compatible payload.""" - return { - "name": self.name, - "artifact_paths": { - "output_dir": str(self.artifact_paths.output_dir), - "seed_data": str(self.artifact_paths.seed_data), - "synthetic_data": str(self.artifact_paths.synthetic_data), - "calibrated_data": str(self.artifact_paths.calibrated_data), - "targets": str(self.artifact_paths.targets), - "manifest": str(self.artifact_paths.manifest), - "version_id": self.artifact_paths.version_id, - "scaffold_seed_data": ( - str(self.artifact_paths.scaffold_seed_data) - if self.artifact_paths.scaffold_seed_data is not None - else None - ), - "synthesizer": ( - str(self.artifact_paths.synthesizer) - if self.artifact_paths.synthesizer is not None - else None - ), - "policyengine_dataset": ( - str(self.artifact_paths.policyengine_dataset) - if self.artifact_paths.policyengine_dataset is not None - else None - ), - "data_flow_snapshot": ( - str(self.artifact_paths.data_flow_snapshot) - if self.artifact_paths.data_flow_snapshot is not None - else None - ), - "artifact_inventory": ( - str(self.artifact_paths.artifact_inventory) - if self.artifact_paths.artifact_inventory is not None - else None - ), - "conditional_readiness": ( - str(self.artifact_paths.conditional_readiness) - if self.artifact_paths.conditional_readiness is not None - else None - ), - "policyengine_harness": ( - str(self.artifact_paths.policyengine_harness) - if self.artifact_paths.policyengine_harness is not None - else None - ), - "policyengine_native_scores": ( - str(self.artifact_paths.policyengine_native_scores) - if self.artifact_paths.policyengine_native_scores is not None - else None - ), - "policyengine_native_audit": ( - str(self.artifact_paths.policyengine_native_audit) - if self.artifact_paths.policyengine_native_audit is not None - else None - ), - 
"policyengine_native_target_diagnostics": ( - str(self.artifact_paths.policyengine_native_target_diagnostics) - if self.artifact_paths.policyengine_native_target_diagnostics is not None - else None - ), - "capital_gains_lots": ( - str(self.artifact_paths.capital_gains_lots) - if self.artifact_paths.capital_gains_lots is not None - else None - ), - "run_registry": ( - str(self.artifact_paths.run_registry) - if self.artifact_paths.run_registry is not None - else None - ), - "run_index_db": ( - str(self.artifact_paths.run_index_db) - if self.artifact_paths.run_index_db is not None - else None - ), - }, - "frontier_metric": self.frontier_metric, - "frontier_delta": self.frontier_delta, - "metric_value": self.metric_value, - "source_names": list(self.source_names), - "current_entry": ( - self.current_entry.to_dict() if self.current_entry is not None else None - ), - "frontier_entry": ( - self.frontier_entry.to_dict() if self.frontier_entry is not None else None - ), - "metadata": dict(self.metadata), - } - - @classmethod - def from_dict(cls, payload: dict[str, Any]) -> USMicroplexExperimentResult: - """Restore an experiment result from serialized JSON payload.""" - artifact_paths = payload["artifact_paths"] - return cls( - name=payload["name"], - artifact_paths=USMicroplexArtifactPaths( - output_dir=Path(artifact_paths["output_dir"]), - seed_data=Path(artifact_paths["seed_data"]), - synthetic_data=Path(artifact_paths["synthetic_data"]), - calibrated_data=Path(artifact_paths["calibrated_data"]), - targets=Path(artifact_paths["targets"]), - manifest=Path(artifact_paths["manifest"]), - version_id=artifact_paths.get("version_id"), - scaffold_seed_data=( - Path(artifact_paths["scaffold_seed_data"]) - if artifact_paths.get("scaffold_seed_data") is not None - else None - ), - synthesizer=( - Path(artifact_paths["synthesizer"]) - if artifact_paths.get("synthesizer") is not None - else None - ), - policyengine_dataset=( - Path(artifact_paths["policyengine_dataset"]) - if 
artifact_paths.get("policyengine_dataset") is not None - else None - ), - data_flow_snapshot=( - Path(artifact_paths["data_flow_snapshot"]) - if artifact_paths.get("data_flow_snapshot") is not None - else None - ), - artifact_inventory=( - Path(artifact_paths["artifact_inventory"]) - if artifact_paths.get("artifact_inventory") is not None - else None - ), - conditional_readiness=( - Path(artifact_paths["conditional_readiness"]) - if artifact_paths.get("conditional_readiness") is not None - else None - ), - policyengine_harness=( - Path(artifact_paths["policyengine_harness"]) - if artifact_paths.get("policyengine_harness") is not None - else None - ), - policyengine_native_scores=( - Path(artifact_paths["policyengine_native_scores"]) - if artifact_paths.get("policyengine_native_scores") is not None - else None - ), - policyengine_native_audit=( - Path(artifact_paths["policyengine_native_audit"]) - if artifact_paths.get("policyengine_native_audit") is not None - else None - ), - policyengine_native_target_diagnostics=( - Path(artifact_paths["policyengine_native_target_diagnostics"]) - if artifact_paths.get("policyengine_native_target_diagnostics") - is not None - else None - ), - capital_gains_lots=( - Path(artifact_paths["capital_gains_lots"]) - if artifact_paths.get("capital_gains_lots") is not None - else None - ), - run_registry=( - Path(artifact_paths["run_registry"]) - if artifact_paths.get("run_registry") is not None - else None - ), - run_index_db=( - Path(artifact_paths["run_index_db"]) - if artifact_paths.get("run_index_db") is not None - else None - ), - ), - frontier_metric=payload["frontier_metric"], - frontier_delta=payload.get("frontier_delta"), - current_entry=( - USMicroplexRunRegistryEntry.from_dict(payload["current_entry"]) - if payload.get("current_entry") is not None - else None - ), - frontier_entry=( - USMicroplexRunRegistryEntry.from_dict(payload["frontier_entry"]) - if payload.get("frontier_entry") is not None - else None - ), - 
metadata=dict(payload.get("metadata", {})), - ) - - -@dataclass(frozen=True) -class USMicroplexExperimentReport: - """Persistable report for a batch of source-mix experiments.""" - - output_root: Path - frontier_metric: FrontierMetric - results: tuple[USMicroplexExperimentResult, ...] - created_at: str = field( - default_factory=lambda: datetime.now(UTC).replace(microsecond=0).isoformat() - ) - metadata: dict[str, Any] = field(default_factory=dict) - - @property - def leaderboard(self) -> tuple[USMicroplexExperimentResult, ...]: - """Return results sorted by the configured frontier metric.""" - - def sort_key( - result: USMicroplexExperimentResult, - ) -> tuple[bool, float, str]: - metric_value = result.metric_value - if metric_value is None: - return (True, float("inf"), result.name) - return (False, metric_value, result.name) - - return tuple(sorted(self.results, key=sort_key)) - - @property - def best_result(self) -> USMicroplexExperimentResult | None: - leaderboard = self.leaderboard - if not leaderboard: - return None - if leaderboard[0].metric_value is None: - return None - return leaderboard[0] - - def to_dict(self) -> dict[str, Any]: - """Serialize the report to a JSON-compatible dict.""" - best_result = self.best_result - return { - "created_at": self.created_at, - "output_root": str(self.output_root), - "frontier_metric": self.frontier_metric, - "summary": { - "best_experiment": best_result.name if best_result is not None else None, - "best_metric_value": ( - best_result.metric_value if best_result is not None else None - ), - "n_results": len(self.results), - }, - "metadata": dict(self.metadata), - "results": [result.to_dict() for result in self.results], - } - - def save(self, path: str | Path) -> Path: - """Persist the experiment report to disk.""" - output_path = Path(path) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True)) - return output_path - - @classmethod - def 
from_dict(cls, payload: dict[str, Any]) -> USMicroplexExperimentReport: - """Restore a report from serialized JSON.""" - return cls( - output_root=Path(payload["output_root"]), - frontier_metric=payload["frontier_metric"], - results=tuple( - USMicroplexExperimentResult.from_dict(result) - for result in payload.get("results", []) - ), - created_at=payload["created_at"], - metadata=dict(payload.get("metadata", {})), - ) - - @classmethod - def load(cls, path: str | Path) -> USMicroplexExperimentReport: - """Load a persisted experiment report.""" - return cls.from_dict(json.loads(Path(path).read_text())) - - -def run_us_microplex_source_experiments( - experiments: list[USMicroplexSourceExperimentSpec] - | tuple[USMicroplexSourceExperimentSpec, ...], - output_root: str | Path, - *, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_target_provider: Any | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - report_path: str | Path | None = None, - performance_harness_config: USMicroplexPerformanceHarnessConfig | None = None, - performance_session: USMicroplexPerformanceSession | None = None, - metadata: dict[str, Any] | None = None, -) -> USMicroplexExperimentReport: - """Run a batch of source-mix experiments through the versioned PE-US build loop.""" - if not experiments: - raise ValueError("run_us_microplex_source_experiments requires at least one experiment") - if performance_session is not None and performance_harness_config is None: - raise ValueError( - "performance_harness_config is required when providing performance_session" - ) - - output_root = Path(output_root) - output_root.mkdir(parents=True, exist_ok=True) - results: list[USMicroplexExperimentResult] = [] - active_performance_session = performance_session - if performance_harness_config is not None and active_performance_session is None: - active_performance_session = USMicroplexPerformanceSession() - - shared_comparison_cache = ( - policyengine_comparison_cache - or ( - active_performance_session.comparison_cache - if active_performance_session is not None - else None - ) - or PolicyEngineUSComparisonCache() - ) - if ( - active_performance_session is not None - and performance_harness_config is not None - and performance_harness_config.targets_db is not None - and performance_harness_config.baseline_dataset is not None - ): - active_performance_session.warm_parity_cache(config=performance_harness_config) - batch_native_scoring = ( - active_performance_session is not None - and performance_harness_config is not None - and performance_harness_config.evaluate_pe_native_loss - and len(experiments) > 1 - ) - - for experiment in experiments: - harness_metadata = { - "experiment_name": experiment.name, - **dict(policyengine_harness_metadata or 
{}), - **dict(experiment.metadata), - } - registry_metadata = { - "experiment_name": experiment.name, - **dict(experiment.metadata), - } - if ( - active_performance_session is not None - and performance_harness_config is not None - ): - harness_config = _resolve_experiment_performance_config( - experiment, - performance_harness_config, - ) - if batch_native_scoring: - harness_config = replace(harness_config, evaluate_pe_native_loss=False) - performance_result = active_performance_session.run( - list(experiment.providers), - config=harness_config, - queries=experiment.queries or None, - ) - artifacts = save_versioned_us_microplex_build_result( - performance_result.build_result, - output_root, - frontier_metric=frontier_metric, - policyengine_comparison_cache=shared_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=harness_metadata, - precomputed_policyengine_harness_payload=( - performance_result.parity_run.to_dict() - if performance_result.parity_run is not None - else None - ), - defer_policyengine_harness=performance_result.parity_run is None, - precomputed_policyengine_native_scores=( - None if batch_native_scoring else performance_result.pe_native_scores - ), - defer_policyengine_native_score=batch_native_scoring, - run_registry_path=run_registry_path, - run_registry_metadata=registry_metadata, - ) - else: - artifacts = build_and_save_versioned_us_microplex_from_source_providers( - list(experiment.providers), - output_root, - config=experiment.config, - queries=experiment.queries or None, - frontier_metric=frontier_metric, - policyengine_comparison_cache=shared_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - 
policyengine_harness_metadata=harness_metadata, - run_registry_path=run_registry_path, - run_registry_metadata=registry_metadata, - ) - results.append( - USMicroplexExperimentResult( - name=experiment.name, - artifact_paths=artifacts.artifact_paths, - frontier_metric=frontier_metric, - frontier_delta=artifacts.frontier_delta, - current_entry=artifacts.current_entry, - frontier_entry=artifacts.frontier_entry, - metadata=dict(experiment.metadata), - ) - ) - - resolved_run_registry_path = Path(run_registry_path or output_root / "run_registry.jsonl") - if batch_native_scoring: - backfill_us_pe_native_scores_bundles( - [result.artifact_paths.output_dir for result in results], - baseline_dataset=performance_harness_config.baseline_dataset, - policyengine_us_data_repo=performance_harness_config.policyengine_us_data_repo, - rebuild_registry=True, - ) - backfill_us_pe_native_audit_bundles( - [result.artifact_paths.output_dir for result in results], - policyengine_us_data_repo=performance_harness_config.policyengine_us_data_repo, - ) - results = list( - _refresh_experiment_results_from_registry( - results, - run_registry_path=resolved_run_registry_path, - frontier_metric=frontier_metric, - ) - ) - - report = USMicroplexExperimentReport( - output_root=output_root, - frontier_metric=frontier_metric, - results=tuple(results), - metadata=dict(metadata or {}), - ) - report.save(report_path or output_root / "experiment_report.json") - return report - - -def run_us_microplex_n_synthetic_sweep( - experiment: USMicroplexSourceExperimentSpec, - n_synthetic_values: tuple[int, ...] 
| list[int], - output_root: str | Path, - *, - name_template: str = "{base_name}-n{n_synthetic}", - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_target_provider: Any | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - report_path: str | Path | None = None, - performance_harness_config: USMicroplexPerformanceHarnessConfig | None = None, - performance_session: USMicroplexPerformanceSession | None = None, - metadata: dict[str, Any] | None = None, -) -> USMicroplexExperimentReport: - """Run one base experiment across multiple n_synthetic values.""" - sweep_experiments = build_us_n_synthetic_sweep_experiments( - experiment, - n_synthetic_values, - name_template=name_template, - ) - sweep_values = [spec.metadata["n_synthetic"] for spec in sweep_experiments] - return run_us_microplex_source_experiments( - sweep_experiments, - output_root, - frontier_metric=frontier_metric, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - run_registry_path=run_registry_path, - report_path=report_path, - performance_harness_config=performance_harness_config, - performance_session=performance_session, - metadata={ - "base_experiment_name": experiment.name, - "n_synthetic_values": sweep_values, - "sweep_parameter": "n_synthetic", - **dict(metadata or {}), - }, - ) - - -def _resolve_experiment_performance_config( - experiment: USMicroplexSourceExperimentSpec, - 
base_config: USMicroplexPerformanceHarnessConfig, -) -> USMicroplexPerformanceHarnessConfig: - build_config = experiment.config or base_config.build_config - resolved = replace( - base_config, - build_config=build_config, - evaluate_parity=False, - ) - if build_config is None: - return resolved - return replace( - resolved, - n_synthetic=build_config.n_synthetic, - random_seed=build_config.random_seed, - ) - - -def _refresh_experiment_results_from_registry( - results: list[USMicroplexExperimentResult] | tuple[USMicroplexExperimentResult, ...], - *, - run_registry_path: str | Path, - frontier_metric: FrontierMetric, -) -> tuple[USMicroplexExperimentResult, ...]: - registry_entries = load_us_microplex_run_registry(run_registry_path) - if not registry_entries: - return tuple(results) - - frontier_entry = select_us_microplex_frontier_entry( - run_registry_path, - metric=frontier_metric, - ) - entries_by_artifact_id = {entry.artifact_id: entry for entry in registry_entries} - run_index_path = Path(run_registry_path).parent / "run_index.duckdb" - - refreshed: list[USMicroplexExperimentResult] = [] - for result in results: - version_id = result.artifact_paths.version_id or result.artifact_paths.output_dir.name - current_entry = entries_by_artifact_id.get(version_id) - current_value = ( - getattr(current_entry, frontier_metric, None) - if current_entry is not None - else None - ) - frontier_value = ( - getattr(frontier_entry, frontier_metric, None) - if frontier_entry is not None - else None - ) - frontier_delta = ( - current_value - frontier_value - if current_value is not None and frontier_value is not None - else None - ) - refreshed.append( - replace( - result, - artifact_paths=_refresh_experiment_artifact_paths( - result.artifact_paths, - run_registry_path=Path(run_registry_path), - run_index_path=run_index_path, - ), - current_entry=current_entry, - frontier_entry=frontier_entry, - frontier_delta=frontier_delta, - ) - ) - return tuple(refreshed) - - -def 
_refresh_experiment_artifact_paths( - artifact_paths: USMicroplexArtifactPaths, - *, - run_registry_path: Path, - run_index_path: Path, -) -> USMicroplexArtifactPaths: - manifest_payload = _load_optional_json(artifact_paths.manifest) - artifacts = dict(manifest_payload.get("artifacts", {})) if manifest_payload else {} - artifact_root = artifact_paths.output_dir - return replace( - artifact_paths, - scaffold_seed_data=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("scaffold_seed_data"), - ), - data_flow_snapshot=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("data_flow_snapshot"), - fallback=str( - resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ).relative_to(artifact_root) - ), - ), - artifact_inventory=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("artifact_inventory"), - ), - conditional_readiness=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("conditional_readiness"), - ), - policyengine_harness=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("policyengine_harness"), - ), - policyengine_native_scores=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("policyengine_native_scores"), - ), - policyengine_native_audit=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("policyengine_native_audit"), - ), - policyengine_native_target_diagnostics=_resolve_optional_result_artifact_path( - artifact_root, - artifacts.get("policyengine_native_target_diagnostics"), - ), - run_registry=Path(run_registry_path), - run_index_db=run_index_path, - ) - - -def _resolve_optional_result_artifact_path( - artifact_root: Path, - artifact_name: str | None, - *, - fallback: str | None = None, -) -> Path | None: - if artifact_name: - path = artifact_root / artifact_name - return path if path.exists() else None - if fallback is None: - return None - 
fallback_path = artifact_root / fallback - return fallback_path if fallback_path.exists() else None - - -def _load_optional_json(path: Path) -> dict[str, Any] | None: - if not path.exists(): - return None - return json.loads(path.read_text()) diff --git a/src/microplex_us/pipelines/export_lineage_manifest.py b/src/microplex_us/pipelines/export_lineage_manifest.py deleted file mode 100644 index fd33c98e..00000000 --- a/src/microplex_us/pipelines/export_lineage_manifest.py +++ /dev/null @@ -1,896 +0,0 @@ -"""Generate static lineage coverage for eCPS-required PE-US exports. - -Column presence and H5 support parity answer whether a finished artifact has -the right shape and nonzero/variant data. This module answers the cheaper -pre-build question: does Microplex have an intended source or construction path -for each required export column? -""" - -from __future__ import annotations - -import argparse -import json -import sys -from dataclasses import asdict, dataclass, field -from pathlib import Path -from typing import Any - -from microplex_us.microdata_roles import POLICYENGINE_US_TAKEUP_INPUT_VARIABLES -from microplex_us.pipelines.check_export_columns import ( - DEFAULT_CONTRACT_PATH, - _h5_column_values, - _support_requirement, - _support_stats, - load_contract, -) -from microplex_us.policyengine.us import ( - POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES, - POLICYENGINE_US_EXPORT_COLUMN_ALIASES, - POLICYENGINE_US_EXPORT_DEFAULTS, - POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES, - POLICYENGINE_US_NUMERIC_ENUM_EXPORT_MAPS, - POLICYENGINE_US_STRUCTURAL_EXPORT_COLUMNS, -) -from microplex_us.variables import VARIABLE_SEMANTIC_SPECS - -SCHEMA_VERSION = 1 - -SOURCE_BACKED_EVIDENCE_KINDS = frozenset( - { - "cps_raw_mapping", - "cps_derived_recode", - "puf_raw_mapping", - "puf_support_clone_imputation", - "puf_support_clone_override", - "puf_support_clone_refresh", - "pe_source_impute_target", - "pe_source_impute_observed", - "pipeline_constructed", - 
"semantic_derived", - "policyengine_export_alias", - "takeup_assumption", - "allowed_computed_export", - "enum_map", - "structural_export", - } -) -DEFAULT_ONLY_EVIDENCE_KIND = "policyengine_export_default" - - -@dataclass(frozen=True) -class ExportLineageEvidence: - """One static source/code path that can populate an export column.""" - - kind: str - source: str - detail: str - raw_columns: tuple[str, ...] = () - source_variables: tuple[str, ...] = () - - -@dataclass -class ExportLineageEntry: - """Lineage coverage for one required export column.""" - - column: str - required: bool - entity: str | None - evidence: list[ExportLineageEvidence] = field(default_factory=list) - export_path_status: str = "unknown" - has_source_lineage: bool = False - ecps_support_requirement: str | None = None - ecps_support_stats: dict[str, Any] | None = None - issue: str | None = None - - -def build_export_lineage_manifest( - *, - contract_path: Path = DEFAULT_CONTRACT_PATH, - support_baseline: Path | None = None, - period: int = 2024, -) -> dict[str, Any]: - """Return per-column lineage coverage for the eCPS export contract.""" - contract = load_contract(contract_path) - required_columns = sorted(str(column) for column in contract["required"]) - evidence_index = _build_static_evidence_index() - baseline_support = ( - _baseline_support_by_column( - support_baseline, - required_columns=required_columns, - period=period, - ) - if support_baseline is not None - else {} - ) - - entries: list[ExportLineageEntry] = [] - for column in required_columns: - evidence = sorted( - evidence_index.get(column, []), - key=lambda item: (item.kind, item.source, item.detail), - ) - has_source_lineage = any( - item.kind in SOURCE_BACKED_EVIDENCE_KINDS for item in evidence - ) - status = _export_path_status(evidence, has_source_lineage=has_source_lineage) - entry = ExportLineageEntry( - column=column, - required=True, - entity=_infer_export_entity(column, evidence), - evidence=evidence, - 
export_path_status=status, - has_source_lineage=has_source_lineage, - ) - - support_info = baseline_support.get(column) - if support_info is not None: - entry.ecps_support_requirement = support_info["requirement"] - entry.ecps_support_stats = support_info["stats"] - if support_info["requirement"] is not None and not has_source_lineage: - entry.issue = "ecps_populated_export_has_no_source_lineage" - elif not evidence: - entry.issue = "required_export_has_no_static_lineage" - entries.append(entry) - - issue_entries = [entry for entry in entries if entry.issue] - payload = { - "schema_version": SCHEMA_VERSION, - "contract_path": str(contract_path), - "support_baseline": str(support_baseline) if support_baseline else None, - "period": int(period), - "summary": { - "required_export_count": len(entries), - "source_lineage_count": sum(entry.has_source_lineage for entry in entries), - "default_only_count": sum( - entry.export_path_status == "default_only" for entry in entries - ), - "unknown_count": sum( - entry.export_path_status == "unknown" for entry in entries - ), - "ecps_populated_checked_count": sum( - entry.ecps_support_requirement is not None for entry in entries - ), - "issue_count": len(issue_entries), - }, - "issues": [ - { - "column": entry.column, - "issue": entry.issue, - "export_path_status": entry.export_path_status, - "ecps_support_requirement": entry.ecps_support_requirement, - } - for entry in issue_entries - ], - "columns": [_entry_to_dict(entry) for entry in entries], - } - return payload - - -def _entry_to_dict(entry: ExportLineageEntry) -> dict[str, Any]: - payload = asdict(entry) - payload["evidence"] = [asdict(item) for item in entry.evidence] - return payload - - -def _export_path_status( - evidence: list[ExportLineageEvidence], - *, - has_source_lineage: bool, -) -> str: - if has_source_lineage: - if any(item.kind == "structural_export" for item in evidence): - return "structural" - return "source_or_constructed" - if evidence and all(item.kind 
== DEFAULT_ONLY_EVIDENCE_KIND for item in evidence): - return "default_only" - if evidence: - return "documented_no_source" - return "unknown" - - -def _baseline_support_by_column( - baseline_h5: Path, - *, - required_columns: list[str], - period: int, -) -> dict[str, dict[str, Any]]: - import h5py - - period_key = str(int(period)) - support: dict[str, dict[str, Any]] = {} - with h5py.File(baseline_h5, "r") as handle: - for column in required_columns: - values = _h5_column_values(handle, column, period_key=period_key) - if values is None: - continue - stats = _support_stats(column, values) - support[column] = { - "requirement": _support_requirement(stats), - "stats": asdict(stats), - } - return support - - -def _build_static_evidence_index() -> dict[str, list[ExportLineageEvidence]]: - index: dict[str, list[ExportLineageEvidence]] = {} - _add_structural_evidence(index) - _add_policyengine_export_evidence(index) - _add_cps_evidence(index) - _add_puf_manifest_evidence(index) - _add_pe_source_impute_spec_evidence(index) - _add_puf_support_clone_evidence(index) - _add_pipeline_constructed_evidence(index) - _add_semantic_evidence(index) - return index - - -def _append( - index: dict[str, list[ExportLineageEvidence]], - column: str, - evidence: ExportLineageEvidence, -) -> None: - index.setdefault(str(column), []).append(evidence) - - -def _add_structural_evidence(index: dict[str, list[ExportLineageEvidence]]) -> None: - for column in POLICYENGINE_US_STRUCTURAL_EXPORT_COLUMNS: - _append( - index, - column, - ExportLineageEvidence( - kind="structural_export", - source="policyengine_us_export", - detail="Entity id/link/weight column emitted by PE-US H5 writer.", - ), - ) - - -def _add_policyengine_export_evidence( - index: dict[str, list[ExportLineageEvidence]], -) -> None: - for source_column, target_column in POLICYENGINE_US_EXPORT_COLUMN_ALIASES.items(): - _append( - index, - target_column, - ExportLineageEvidence( - kind="policyengine_export_alias", - 
source="POLICYENGINE_US_EXPORT_COLUMN_ALIASES", - detail=f"Exports source column {source_column!r} as {target_column!r}.", - source_variables=(source_column,), - ), - ) - for column, default in POLICYENGINE_US_EXPORT_DEFAULTS.items(): - _append( - index, - column, - ExportLineageEvidence( - kind=DEFAULT_ONLY_EVIDENCE_KIND, - source="POLICYENGINE_US_EXPORT_DEFAULTS", - detail=f"Default exported when no source column is present: {default!r}.", - ), - ) - for column in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="allowed_computed_export", - source="POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES", - detail="Computed/overridable PE-US export allowed by the H5 writer.", - ), - ) - for column in POLICYENGINE_US_NUMERIC_ENUM_EXPORT_MAPS: - _append( - index, - column, - ExportLineageEvidence( - kind="enum_map", - source="POLICYENGINE_US_NUMERIC_ENUM_EXPORT_MAPS", - detail="Numeric source code mapped to PE-US enum export value.", - ), - ) - for column in POLICYENGINE_US_TAKEUP_INPUT_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="takeup_assumption", - source="microplex_us.pipelines.us", - detail="Policy take-up input generated from Microplex take-up assumptions/source proxies.", - ), - ) - - -def _add_cps_evidence(index: dict[str, list[ExportLineageEvidence]]) -> None: - from microplex_us.data_sources.cps import ( - CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP, - CURRENT_HEALTH_COVERAGE_RULE_INPUT_ALIAS_MAP, - HOUSEHOLD_VARIABLES, - PERSON_VARIABLES, - ) - - for raw_column, column in PERSON_VARIABLES.items(): - if str(column).startswith("_"): - continue - _append( - index, - column, - ExportLineageEvidence( - kind="cps_raw_mapping", - source="CPS ASEC person", - detail=f"Mapped from CPS ASEC raw person column {raw_column}.", - raw_columns=(raw_column,), - ), - ) - for raw_column, column in HOUSEHOLD_VARIABLES.items(): - if str(column).startswith("_"): - continue - _append( - index, 
- column, - ExportLineageEvidence( - kind="cps_raw_mapping", - source="CPS ASEC household", - detail=f"Mapped from CPS ASEC raw household column {raw_column}.", - raw_columns=(raw_column,), - ), - ) - for column, raw_column in CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP.items(): - _append( - index, - column, - ExportLineageEvidence( - kind="cps_derived_recode", - source="CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP", - detail=f"Current health coverage report recoded from {raw_column}.", - raw_columns=(raw_column,), - ), - ) - for column, raw_column in { - "reported_has_private_health_coverage_at_interview": "NOW_PRIV", - "reported_has_public_health_coverage_at_interview": "NOW_PUB", - "reported_is_insured_at_interview": "NOW_COV", - "reported_is_uninsured_at_interview": "NOW_COV", - }.items(): - _append( - index, - column, - ExportLineageEvidence( - kind="cps_derived_recode", - source="CPS current health coverage recodes", - detail=f"Derived from CPS ASEC current-coverage raw column {raw_column}.", - raw_columns=(raw_column,), - ), - ) - for column, source_columns in { - "reported_has_multiple_health_coverage_at_interview": tuple( - CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP - ), - "has_esi": ("reported_has_employer_sponsored_health_coverage_at_interview",), - "has_marketplace_health_coverage": ( - "reported_has_marketplace_health_coverage_at_interview", - ), - }.items(): - _append( - index, - column, - ExportLineageEvidence( - kind="cps_derived_recode", - source="CPS current health coverage recodes", - detail="Derived from CPS ASEC current health coverage indicators.", - source_variables=tuple(source_columns), - ), - ) - for column, source_column in CURRENT_HEALTH_COVERAGE_RULE_INPUT_ALIAS_MAP.items(): - _append( - index, - column, - ExportLineageEvidence( - kind="cps_derived_recode", - source="CURRENT_HEALTH_COVERAGE_RULE_INPUT_ALIAS_MAP", - detail=f"Rule input aliases reported coverage column {source_column}.", - source_variables=(source_column,), - ), - ) - - -def 
_add_puf_manifest_evidence(index: dict[str, list[ExportLineageEvidence]]) -> None: - manifest_path = Path(__file__).resolve().parents[1] / "manifests" / "puf.json" - payload = json.loads(manifest_path.read_text()) - for observation in payload.get("observations", []): - for mapping in observation.get("columns", []): - column = mapping.get("canonical_name") - raw_column = mapping.get("raw_column") - if not column: - continue - _append( - index, - str(column), - ExportLineageEvidence( - kind="puf_raw_mapping", - source="manifests/puf.json", - detail=f"Mapped from IRS SOI PUF raw column {raw_column}.", - raw_columns=(str(raw_column),) if raw_column is not None else (), - ), - ) - - -def _add_pe_source_impute_spec_evidence( - index: dict[str, list[ExportLineageEvidence]], -) -> None: - spec_path = ( - Path(__file__).resolve().parents[1] - / "manifests" - / "pe_source_impute_blocks.json" - ) - payload = json.loads(spec_path.read_text()) - for block_name, block in payload.get("blocks", {}).items(): - source = str(block.get("survey_name") or block_name) - block_label = f"{source}:{block_name}" - for column in block.get("target_variables", []): - _append( - index, - str(column), - ExportLineageEvidence( - kind="pe_source_impute_target", - source=block_label, - detail="Target variable populated by PE-source donor imputation block.", - ), - ) - for column in ( - *block.get("person_variables", []), - *block.get("household_variables", []), - ): - _append( - index, - str(column), - ExportLineageEvidence( - kind="pe_source_impute_observed", - source=block_label, - detail="Observed variable available in donor source spec.", - ), - ) - - -def _add_puf_support_clone_evidence( - index: dict[str, list[ExportLineageEvidence]], -) -> None: - from microplex_us.pipelines.us import ( - PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES, - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES, - PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES, - PUF_SUPPORT_CLONE_SPECIAL_VARIABLES, - ) - - for column in 
PUF_SUPPORT_CLONE_IMPUTED_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="puf_support_clone_imputation", - source="PUF_SUPPORT_CLONE_IMPUTED_VARIABLES", - detail="PUF donor variable imputed onto the CPS support-clone surface.", - ), - ) - for column in PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="puf_support_clone_override", - source="PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES", - detail="PUF donor variable may override/collapse onto the CPS scaffold.", - ), - ) - for column in PUF_SUPPORT_CLONE_SPECIAL_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="puf_support_clone_imputation", - source="PUF_SUPPORT_CLONE_SPECIAL_VARIABLES", - detail="Special support-clone variable populated through PUF/CPS support handling.", - ), - ) - for column in PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES: - _append( - index, - column, - ExportLineageEvidence( - kind="puf_support_clone_refresh", - source="PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES", - detail="CPS-only status/categorical field refreshed after PUF support matching.", - ), - ) - - -def _add_pipeline_constructed_evidence( - index: dict[str, list[ExportLineageEvidence]], -) -> None: - constructed: dict[str, tuple[str, tuple[str, ...]]] = { - "block_geoid": ( - "Representative Census block assignment", - ("state_fips", "county_fips", "spm_unit_size"), - ), - "tract_geoid": ( - "Representative Census block assignment", - ("block_geoid",), - ), - "congressional_district_geoid": ( - "Representative Census block assignment", - ("block_geoid", "state_fips"), - ), - "employment_income_before_lsr": ( - "USMicroplexPipeline income normalizer", - ("employment_income", "wage_income", "income"), - ), - "weekly_hours_worked_before_lsr": ( - "CPS weekly-hours export support", - ("A_HRS1", "hours_worked", "hours_worked_last_week"), - ), - "selected_marketplace_plan_benchmark_ratio": ( - "PE-US ACA selected Marketplace plan ratio construction", - 
( - "health_insurance_premiums_without_medicare_part_b", - "takes_up_aca_if_eligible", - "aca_ptc", - "slcsp", - ), - ), - "self_employment_income_before_lsr": ( - "USMicroplexPipeline income normalizer", - ("self_employment_income",), - ), - "long_term_capital_gains_before_response": ( - "USMicroplexPipeline investment-income normalizer", - ("long_term_capital_gains", "capital_gains"), - ), - "taxable_private_pension_income": ( - "CPS private pension taxable/exempt split", - ("PEN_VAL", "ANN_VAL"), - ), - "social_security_disability": ( - "CPS Social Security reason-code split", - ("SS_VAL", "SSKIND1", "SSKIND2", "age"), - ), - "social_security_survivors": ( - "CPS Social Security reason-code split", - ("SS_VAL", "SSKIND1", "SSKIND2"), - ), - "social_security_dependents": ( - "CPS Social Security reason-code split", - ("SS_VAL", "SSKIND1", "SSKIND2"), - ), - "disability_benefits": ( - "CPS disability-income workers-comp split", - ("DSAB_VAL1", "DSAB_VAL2", "DSAB_ON1", "DSAB_ON2"), - ), - "employer_sponsored_insurance_premiums": ( - "CPS employer-sponsored insurance premium imputation", - ("NOW_OWNGRP", "NOW_HIPAID", "NOW_GRPFTYP", "PHIP_VAL"), - ), - "reported_owns_employer_sponsored_health_insurance_at_interview": ( - "CPS employer-sponsored insurance policyholder recode", - ("NOW_OWNGRP",), - ), - "takes_up_dc_ptc": ( - "Microplex tax-unit take-up-rate construction", - ("DEFAULT_DC_PTC_TAKEUP_RATE",), - ), - "is_unmarried_partner_of_household_head": ( - "CPS relationship-to-householder recode", - ("PERRP",), - ), - "is_separated": ( - "CPS marital-status recode", - ("A_MARITL",), - ), - "is_surviving_spouse": ( - "CPS marital-status recode", - ("A_MARITL",), - ), - "is_blind": ( - "CPS difficulty-seeing recode", - ("PEDISEYE",), - ), - "ssn_card_type": ( - "CPS nativity/legal-status tax-id replacement", - ("PRCITSHP", "PEINUSYR", "PENATVTY", "PEMLR"), - ), - "immigration_status_str": ( - "CPS nativity/legal-status tax-id replacement", - ("PRCITSHP", "PEINUSYR", 
"PENATVTY", "PEMLR"), - ), - "is_pregnant": ( - "USMicroplexPipeline pregnancy-rate construction", - ("age", "is_female", "state_fips"), - ), - "has_valid_ssn": ("CPS tax-id replacement construction", ("ssn_card_type",)), - "taxpayer_id_type": ("CPS tax-id replacement construction", ("ssn_card_type",)), - "has_tin": ("PE-US export identity construction", ("taxpayer_id_type",)), - "has_itin": ("PE-US export identity construction", ("taxpayer_id_type",)), - "hourly_wage": ("CPS hourly work recode", ("_hourly_pay_cents",)), - "is_paid_hourly": ("CPS hourly work recode", ("_is_paid_hourly_code",)), - "is_union_member_or_covered": ("CPS union recode", ("A_UNMEM",)), - "is_tipped_occupation": ( - "CPS occupation to Treasury tipped-occupation recode", - ("detailed_occupation_recode",), - ), - "treasury_tipped_occupation_code": ( - "CPS occupation to Treasury tipped-occupation recode", - ("detailed_occupation_recode",), - ), - "is_computer_scientist": ( - "CPS detailed occupation recode", - ("detailed_occupation_recode",), - ), - "is_executive_administrative_professional": ( - "CPS detailed occupation recode", - ("detailed_occupation_recode",), - ), - "is_farmer_fisher": ( - "CPS detailed occupation recode", - ("detailed_occupation_recode",), - ), - "is_military": ("CPS class/occupation recode", ("class_of_worker",)), - "is_full_time_college_student": ( - "CPS school-enrollment recode", - ("_high_school_or_college_status",), - ), - "has_never_worked": ("CPS work-status recode", ("work_status",)), - "previous_year_income_available": ( - "Prior CPS ASEC PERIDNUM join", - ("PERIDNUM",), - ), - "self_employment_income_last_year": ( - "Prior CPS ASEC PERIDNUM join", - ("PERIDNUM", "SEMP_VAL"), - ), - "tax_exempt_private_pension_income": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "taxable_401k_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "tax_exempt_401k_distributions": ( - "CPS retirement distribution 
split", - ("DST_SC*", "DST_VAL*"), - ), - "taxable_403b_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "tax_exempt_403b_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "regular_ira_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "roth_ira_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "tax_exempt_ira_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "taxable_sep_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "tax_exempt_sep_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "other_type_retirement_account_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "keogh_distributions": ( - "CPS retirement distribution split", - ("DST_SC*", "DST_VAL*"), - ), - "self_employed_pension_contributions": ( - "CPS retirement-contribution capped-account split", - ("RETCB_VAL", "age", "wage_income", "self_employment_income"), - ), - "traditional_401k_contributions": ( - "CPS retirement-contribution capped-account split", - ("RETCB_VAL", "age", "wage_income", "self_employment_income"), - ), - "roth_401k_contributions": ( - "CPS retirement-contribution capped-account split", - ("RETCB_VAL", "age", "wage_income", "self_employment_income"), - ), - "traditional_ira_contributions": ( - "CPS retirement-contribution capped-account split", - ("RETCB_VAL", "age", "wage_income", "self_employment_income"), - ), - "roth_ira_contributions": ( - "CPS retirement-contribution capped-account split", - ("RETCB_VAL", "age", "wage_income", "self_employment_income"), - ), - "self_employed_pension_contributions_desired": ( - "CPS retirement-contribution desired-account split", - ("RETCB_VAL", "wage_income", "self_employment_income"), - ), - 
"traditional_401k_contributions_desired": ( - "CPS retirement-contribution desired-account split", - ("RETCB_VAL", "wage_income", "self_employment_income"), - ), - "roth_401k_contributions_desired": ( - "CPS retirement-contribution desired-account split", - ("RETCB_VAL", "wage_income", "self_employment_income"), - ), - "traditional_ira_contributions_desired": ( - "CPS retirement-contribution desired-account split", - ("RETCB_VAL", "wage_income", "self_employment_income"), - ), - "roth_ira_contributions_desired": ( - "CPS retirement-contribution desired-account split", - ("RETCB_VAL", "wage_income", "self_employment_income"), - ), - "first_home_mortgage_balance": ( - "Tax-unit mortgage support construction", - ("first_home_mortgage_interest", "scf_mortgage_debt"), - ), - "first_home_mortgage_interest": ( - "Tax-unit mortgage support construction", - ("deductible_mortgage_interest", "mortgage_interest_paid"), - ), - "first_home_mortgage_origination_year": ( - "Tax-unit mortgage support construction", - ("first_home_mortgage_interest",), - ), - "spm_unit_capped_work_childcare_expenses": ( - "CPS SPM capped work-childcare source amount", - ("SPM_CAPWKCCXPNS",), - ), - } - aotc_columns = ( - "is_pursuing_credential_for_american_opportunity_credit", - "attends_eligible_educational_institution_for_american_opportunity_credit", - "is_enrolled_at_least_half_time_for_american_opportunity_credit", - "has_american_opportunity_credit_1098_t_or_exception", - "has_american_opportunity_credit_institution_ein", - "has_completed_first_four_years_of_postsecondary_education", - "has_felony_drug_conviction", - "american_opportunity_credit_claimed_prior_years", - ) - for column in aotc_columns: - constructed[column] = ( - "USMicroplexPipeline._construct_aotc_eligibility_inputs", - ("american_opportunity_credit", "qualified_tuition_expenses"), - ) - for column, (detail, variables) in constructed.items(): - _append( - index, - column, - ExportLineageEvidence( - 
kind="pipeline_constructed", - source="microplex_us.pipelines.us", - detail=detail, - source_variables=tuple(variables), - ), - ) - - -def _add_semantic_evidence(index: dict[str, list[ExportLineageEvidence]]) -> None: - for column, spec in VARIABLE_SEMANTIC_SPECS.items(): - if not spec.derived_from: - continue - _append( - index, - column, - ExportLineageEvidence( - kind="semantic_derived", - source="VARIABLE_SEMANTIC_SPECS", - detail="Variable semantic spec declares derived source variables.", - source_variables=tuple(spec.derived_from), - ), - ) - - -def _infer_export_entity( - column: str, - evidence: list[ExportLineageEvidence], -) -> str | None: - if column in POLICYENGINE_US_STRUCTURAL_EXPORT_COLUMNS: - if column == "household_weight" or column == "household_id": - return "household" - if column.startswith("person_") or column == "person_id": - return "person" - if column.endswith("_id"): - return column.removesuffix("_id") - if column in POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES: - return POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES[column] - for item in evidence: - if item.kind.startswith("pe_source_impute"): - if "household" in item.detail.lower(): - return "household" - return "person" - return None - - -def _format_report(payload: dict[str, Any]) -> str: - summary = payload["summary"] - lines = [ - "eCPS export lineage manifest", - f" required exports: {summary['required_export_count']}", - f" source lineage: {summary['source_lineage_count']}", - f" default only: {summary['default_only_count']}", - f" unknown: {summary['unknown_count']}", - f" eCPS populated checked: {summary['ecps_populated_checked_count']}", - f" issues: {summary['issue_count']}", - ] - if payload["issues"]: - lines.append("") - lines.append(" issue columns:") - for issue in payload["issues"]: - lines.append( - " - " - f"{issue['column']} ({issue['issue']}; " - f"{issue['export_path_status']})" - ) - lines.append("") - lines.append(" RESULT: " + ("PASS" if not 
payload["issues"] else "FAIL")) - return "\n".join(lines) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - prog="export_lineage_manifest", - description=( - "Generate a static lineage manifest for required eCPS export columns." - ), - ) - parser.add_argument( - "--contract", - default=str(DEFAULT_CONTRACT_PATH), - help="eCPS export contract JSON path.", - ) - parser.add_argument( - "--support-baseline", - metavar="H5", - help=( - "Optional eCPS baseline H5. When supplied, the manifest flags " - "eCPS-populated required exports that have no source lineage." - ), - ) - parser.add_argument( - "--period", - type=int, - default=2024, - help="Period to inspect in --support-baseline (default: 2024).", - ) - parser.add_argument( - "--output", - metavar="JSON", - help="Optional output JSON path.", - ) - parser.add_argument( - "--fail-on-issues", - action="store_true", - help="Exit 1 if the manifest contains lineage issues.", - ) - args = parser.parse_args(argv) - - payload = build_export_lineage_manifest( - contract_path=Path(args.contract), - support_baseline=Path(args.support_baseline) if args.support_baseline else None, - period=int(args.period), - ) - if args.output: - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - print(_format_report(payload)) - return 1 if args.fail_on_issues and payload["issues"] else 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/microplex_us/pipelines/hf_artifacts.py b/src/microplex_us/pipelines/hf_artifacts.py deleted file mode 100644 index 4cc7e247..00000000 --- a/src/microplex_us/pipelines/hf_artifacts.py +++ /dev/null @@ -1,735 +0,0 @@ -"""Publish Microplex artifact bundles to Hugging Face.""" - -from __future__ import annotations - -import argparse -import json -import os -import sys -from collections.abc import Callable -from dataclasses import dataclass 
-from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -HF_PUBLISH_MANIFEST_FILENAME = "hf_publish_manifest.json" -DEFAULT_HF_REPO_TYPE = "dataset" -DEFAULT_DIAGNOSTICS_REPO = "policyengine/microplex-us-diagnostics" -DEFAULT_DATASET_REPO = "policyengine/microplex-us-deployed-datasets" -DIAGNOSTICS_RUN_PREFIX = "runs" -DATASET_STAGING_PREFIX = "staging" -DIAGNOSTICS_ARTIFACT_KEYS = ( - "policyengine_native_scores", - "policyengine_native_audit", - "policyengine_native_target_diagnostics", -) -DATASET_ARTIFACT_KEYS = ("policyengine_dataset",) - - -@dataclass(frozen=True) -class HuggingFacePublishConfig: - """Destination repos and auth settings for Hugging Face artifact publishing.""" - - diagnostics_repo: str | None = DEFAULT_DIAGNOSTICS_REPO - dataset_repo: str | None = DEFAULT_DATASET_REPO - repo_type: str = DEFAULT_HF_REPO_TYPE - token: str | None = None - diagnostics_run_prefix: str = DIAGNOSTICS_RUN_PREFIX - dataset_staging_prefix: str = DATASET_STAGING_PREFIX - - @classmethod - def from_env(cls) -> HuggingFacePublishConfig: - return cls( - diagnostics_repo=os.environ.get( - "MICROPLEX_HF_DIAGNOSTICS_REPO", - DEFAULT_DIAGNOSTICS_REPO, - ), - dataset_repo=os.environ.get( - "MICROPLEX_HF_DATASET_REPO", - DEFAULT_DATASET_REPO, - ), - repo_type=os.environ.get("MICROPLEX_HF_REPO_TYPE", DEFAULT_HF_REPO_TYPE), - token=_first_env("MICROPLEX_HF_TOKEN", "HUGGING_FACE_TOKEN", "HF_TOKEN"), - diagnostics_run_prefix=os.environ.get( - "MICROPLEX_HF_DIAGNOSTICS_RUN_PREFIX", - DIAGNOSTICS_RUN_PREFIX, - ), - dataset_staging_prefix=os.environ.get( - "MICROPLEX_HF_DATASET_STAGING_PREFIX", - DATASET_STAGING_PREFIX, - ), - ) - - -def _first_env(*names: str) -> str | None: - for name in names: - value = os.environ.get(name) - if value: - return value - return None - - -def normalize_hf_prefix(prefix: str) -> str: - """Normalize a Hugging Face repo path prefix.""" - return prefix.strip("/") - - -def build_hf_repo_path(*parts: str | None) -> str: - 
"""Join repo path parts using POSIX separators.""" - return "/".join(str(part).strip("/") for part in parts if part) - - -def resolve_bundle_run_id(artifact_dir: str | Path, run_id: str | None = None) -> str: - """Return the explicit run ID or the bundle directory name.""" - return run_id or Path(artifact_dir).resolve().name - - -def resolve_manifest_artifact_path( - artifact_dir: str | Path, - manifest: dict[str, Any], - artifact_key: str, -) -> Path: - """Resolve one artifact path from a bundle manifest.""" - artifact_name = dict(manifest.get("artifacts", {})).get(artifact_key) - if not isinstance(artifact_name, str) or not artifact_name: - raise FileNotFoundError( - f"Manifest is missing artifacts.{artifact_key} for {artifact_dir}" - ) - path = Path(artifact_name) - if not path.is_absolute(): - path = Path(artifact_dir) / path - if not path.exists(): - raise FileNotFoundError( - f"Manifest artifact {artifact_key!r} does not exist: {path}" - ) - return path - - -def diagnostics_repo_paths( - artifact_dir: str | Path, - *, - run_id: str | None = None, - run_prefix: str = DIAGNOSTICS_RUN_PREFIX, -) -> dict[str, str]: - """Return Hugging Face repo paths for diagnostics files.""" - resolved_run_id = resolve_bundle_run_id(artifact_dir, run_id) - prefix = build_hf_repo_path(normalize_hf_prefix(run_prefix), resolved_run_id) - return { - "manifest": build_hf_repo_path(prefix, "manifest.json"), - "policyengine_native_scores": build_hf_repo_path( - prefix, - "policyengine_native_scores.json", - ), - "policyengine_native_audit": build_hf_repo_path( - prefix, - "pe_us_data_rebuild_native_audit.json", - ), - "policyengine_native_target_diagnostics": build_hf_repo_path( - prefix, - "pe_native_target_diagnostics.json", - ), - "latest": "latest.json", - "run_registry": "run_registry.jsonl", - } - - -def dataset_repo_paths( - artifact_dir: str | Path, - *, - run_id: str | None = None, - staging_prefix: str = DATASET_STAGING_PREFIX, - promote: bool = False, -) -> dict[str, str]: - 
"""Return Hugging Face repo paths for deployed dataset files.""" - resolved_run_id = resolve_bundle_run_id(artifact_dir, run_id) - prefix = build_hf_repo_path(normalize_hf_prefix(staging_prefix), resolved_run_id) - paths = { - "policyengine_dataset": build_hf_repo_path(prefix, "policyengine_us.h5"), - "manifest": build_hf_repo_path(prefix, "manifest.json"), - } - if promote: - paths.update( - { - "promoted_policyengine_dataset": "policyengine_us.h5", - "promoted_manifest": "manifest.json", - } - ) - return paths - - -def build_latest_payload( - *, - run_id: str, - diagnostics_repo: str, - repo_type: str, - paths: dict[str, str], - manifest: dict[str, Any], -) -> dict[str, Any]: - """Build the diagnostics repo's latest-run pointer.""" - return { - "schema_version": 1, - "updated_at": datetime.now(UTC).isoformat(), - "run_id": run_id, - "artifact_id": run_id, - "repo_id": diagnostics_repo, - "repo_type": repo_type, - "paths": { - "manifest": paths["manifest"], - "policyengine_native_scores": paths["policyengine_native_scores"], - "policyengine_native_audit": paths["policyengine_native_audit"], - "policyengine_native_target_diagnostics": ( - paths["policyengine_native_target_diagnostics"] - ), - }, - "summary": { - "created_at": manifest.get("created_at"), - "policyengine_native_scores": manifest.get("policyengine_native_scores"), - "policyengine_native_audit": manifest.get("policyengine_native_audit"), - }, - } - - -def build_run_registry_entry( - *, - run_id: str, - diagnostics_repo: str, - repo_type: str, - paths: dict[str, str], - manifest: dict[str, Any], -) -> dict[str, Any]: - """Build one compact diagnostics registry row.""" - return { - "recorded_at": datetime.now(UTC).isoformat(), - "run_id": run_id, - "artifact_id": run_id, - "repo_id": diagnostics_repo, - "repo_type": repo_type, - "manifest": paths["manifest"], - "policyengine_native_scores": paths["policyengine_native_scores"], - "policyengine_native_audit": paths["policyengine_native_audit"], - 
"policyengine_native_target_diagnostics": ( - paths["policyengine_native_target_diagnostics"] - ), - "candidate_enhanced_cps_native_loss": _nested_get( - manifest, - "policyengine_native_scores", - "candidate_enhanced_cps_native_loss", - ), - "enhanced_cps_native_loss_delta": _nested_get( - manifest, - "policyengine_native_scores", - "enhanced_cps_native_loss_delta", - ), - } - - -def _nested_get(payload: dict[str, Any], *keys: str) -> Any: - value: Any = payload - for key in keys: - if not isinstance(value, dict): - return None - value = value.get(key) - return value - - -def build_diagnostics_operations( - artifact_dir: str | Path, - config: HuggingFacePublishConfig, - *, - run_id: str | None = None, - registry_text: str = "", -) -> tuple[list[Any], dict[str, Any]]: - """Build Hugging Face commit operations for diagnostics JSON files.""" - if config.diagnostics_repo is None: - raise ValueError("diagnostics_repo is required to publish diagnostics") - root = Path(artifact_dir).resolve() - manifest_path = root / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - resolved_run_id = resolve_bundle_run_id(root, run_id) - paths = diagnostics_repo_paths( - root, - run_id=resolved_run_id, - run_prefix=config.diagnostics_run_prefix, - ) - files = { - "manifest": manifest_path, - "policyengine_native_scores": resolve_manifest_artifact_path( - root, - manifest, - "policyengine_native_scores", - ), - "policyengine_native_audit": resolve_manifest_artifact_path( - root, - manifest, - "policyengine_native_audit", - ), - "policyengine_native_target_diagnostics": resolve_manifest_artifact_path( - root, - manifest, - "policyengine_native_target_diagnostics", - ), - } - latest = build_latest_payload( - run_id=resolved_run_id, - diagnostics_repo=config.diagnostics_repo, - repo_type=config.repo_type, - paths=paths, - manifest=manifest, - ) - registry_entry = build_run_registry_entry( - run_id=resolved_run_id, - diagnostics_repo=config.diagnostics_repo, - 
repo_type=config.repo_type, - paths=paths, - manifest=manifest, - ) - registry_text = _append_registry_jsonl(registry_text, registry_entry) - - operations = [ - _commit_add(paths[key], path) - for key, path in files.items() - ] - operations.extend( - [ - _commit_add_bytes(paths["latest"], latest), - _commit_add_text(paths["run_registry"], registry_text), - ] - ) - payload = { - "repo_id": config.diagnostics_repo, - "repo_type": config.repo_type, - "run_id": resolved_run_id, - "paths": paths, - "files": {key: str(path) for key, path in files.items()}, - "latest": latest, - "run_registry_entry": registry_entry, - "operation_count": len(operations), - } - return operations, payload - - -def build_dataset_operations( - artifact_dir: str | Path, - config: HuggingFacePublishConfig, - *, - run_id: str | None = None, - promote: bool = False, -) -> tuple[list[Any], dict[str, Any]]: - """Build Hugging Face commit operations for the deployed dataset repo.""" - if config.dataset_repo is None: - raise ValueError("dataset_repo is required to publish datasets") - root = Path(artifact_dir).resolve() - manifest_path = root / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - resolved_run_id = resolve_bundle_run_id(root, run_id) - paths = dataset_repo_paths( - root, - run_id=resolved_run_id, - staging_prefix=config.dataset_staging_prefix, - promote=promote, - ) - dataset_path = resolve_manifest_artifact_path(root, manifest, "policyengine_dataset") - operations = [ - _commit_add(paths["policyengine_dataset"], dataset_path), - _commit_add(paths["manifest"], manifest_path), - ] - if promote: - operations.extend( - [ - _commit_add(paths["promoted_policyengine_dataset"], dataset_path), - _commit_add(paths["promoted_manifest"], manifest_path), - ] - ) - payload = { - "repo_id": config.dataset_repo, - "repo_type": config.repo_type, - "run_id": resolved_run_id, - "paths": paths, - "files": { - "policyengine_dataset": str(dataset_path), - "manifest": str(manifest_path), - 
}, - "promoted": bool(promote), - "operation_count": len(operations), - } - return operations, payload - - -def publish_microplex_artifact_to_hf( - artifact_dir: str | Path, - config: HuggingFacePublishConfig, - *, - run_id: str | None = None, - publish_diagnostics: bool = True, - publish_dataset: bool = False, - promote_dataset: bool = False, - dry_run: bool = False, - api: Any | None = None, - registry_loader: Callable[[HuggingFacePublishConfig], str] | None = None, -) -> dict[str, Any]: - """Publish a completed Microplex bundle to configured Hugging Face repos.""" - if not publish_diagnostics and not publish_dataset: - raise ValueError("At least one of publish_diagnostics or publish_dataset is required") - root = Path(artifact_dir).resolve() - resolved_run_id = resolve_bundle_run_id(root, run_id) - result: dict[str, Any] = { - "schema_version": 1, - "created_at": datetime.now(UTC).isoformat(), - "artifact_dir": str(root), - "run_id": resolved_run_id, - "dry_run": dry_run, - "diagnostics": None, - "dataset": None, - } - hf_api = None if dry_run else api or create_hf_api() - - if publish_diagnostics: - registry_text = ( - registry_loader(config) - if registry_loader is not None - else load_existing_registry_text(config) - ) - operations, diagnostics_payload = build_diagnostics_operations( - root, - config, - run_id=resolved_run_id, - registry_text=registry_text, - ) - result["diagnostics"] = diagnostics_payload - if not dry_run: - hf_api.create_commit( - repo_id=config.diagnostics_repo, - repo_type=config.repo_type, - operations=operations, - commit_message=f"Publish Microplex diagnostics {resolved_run_id}", - token=config.token, - ) - - if publish_dataset: - operations, dataset_payload = build_dataset_operations( - root, - config, - run_id=resolved_run_id, - promote=promote_dataset, - ) - result["dataset"] = dataset_payload - if not dry_run: - hf_api.create_commit( - repo_id=config.dataset_repo, - repo_type=config.repo_type, - operations=operations, - 
commit_message=f"Publish Microplex dataset {resolved_run_id}", - token=config.token, - ) - - result["status"] = "dry_run" if dry_run else "published" - _write_json(root / HF_PUBLISH_MANIFEST_FILENAME, result) - return result - - -def smoke_published_hf_artifact( - config: HuggingFacePublishConfig, - *, - run_id: str | None = None, - check_dataset: bool = True, - check_promoted_dataset: bool = True, - api: Any | None = None, - latest_loader: Callable[[HuggingFacePublishConfig], dict[str, Any]] | None = None, -) -> dict[str, Any]: - """Verify that a published Hugging Face artifact exposes expected files.""" - if config.diagnostics_repo is None: - raise ValueError("diagnostics_repo is required for smoke checks") - hf_api = api or create_hf_api() - latest = ( - latest_loader(config) - if latest_loader is not None - else load_hf_json(config, config.diagnostics_repo, "latest.json") - ) - resolved_run_id = run_id or latest.get("run_id") - if not isinstance(resolved_run_id, str) or not resolved_run_id: - raise ValueError("run_id is required when latest.json does not define run_id") - - diagnostics_paths = diagnostics_repo_paths( - ".", - run_id=resolved_run_id, - run_prefix=config.diagnostics_run_prefix, - ) - expected_diagnostics = set(diagnostics_paths.values()) - diagnostics_files = set( - hf_api.list_repo_files( - repo_id=config.diagnostics_repo, - repo_type=config.repo_type, - token=config.token, - ) - ) - - result: dict[str, Any] = { - "schema_version": 1, - "checked_at": datetime.now(UTC).isoformat(), - "run_id": resolved_run_id, - "diagnostics": { - "repo_id": config.diagnostics_repo, - "expected": sorted(expected_diagnostics), - "missing": sorted(expected_diagnostics - diagnostics_files), - }, - "dataset": None, - } - - if check_dataset: - if config.dataset_repo is None: - raise ValueError("dataset_repo is required when check_dataset is true") - dataset_paths = dataset_repo_paths( - ".", - run_id=resolved_run_id, - staging_prefix=config.dataset_staging_prefix, - 
promote=check_promoted_dataset, - ) - expected_dataset = set(dataset_paths.values()) - dataset_files = set( - hf_api.list_repo_files( - repo_id=config.dataset_repo, - repo_type=config.repo_type, - token=config.token, - ) - ) - result["dataset"] = { - "repo_id": config.dataset_repo, - "expected": sorted(expected_dataset), - "missing": sorted(expected_dataset - dataset_files), - } - - missing = list(result["diagnostics"]["missing"]) - if result["dataset"] is not None: - missing.extend(result["dataset"]["missing"]) - result["status"] = "passed" if not missing else "failed" - result["missing_count"] = len(missing) - return result - - -def create_hf_api() -> Any: - """Create a Hugging Face API client lazily.""" - try: - from huggingface_hub import HfApi - except ImportError as error: # pragma: no cover - exercised by CLI environment. - raise RuntimeError( - "huggingface_hub is required for Hugging Face uploads. Install the " - "optional extra with `uv sync --extra hf` or run through " - "`uv run --extra hf ...`." - ) from error - return HfApi() - - -def load_hf_json( - config: HuggingFacePublishConfig, - repo_id: str, - filename: str, -) -> dict[str, Any]: - """Download one JSON file from Hugging Face.""" - try: - from huggingface_hub import hf_hub_download - except ImportError as error: # pragma: no cover - raise RuntimeError( - "huggingface_hub is required for Hugging Face smoke checks." - ) from error - path = hf_hub_download( - repo_id=repo_id, - filename=filename, - repo_type=config.repo_type, - token=config.token, - ) - return json.loads(Path(path).read_text()) - - -def _commit_add(path_in_repo: str, local_path: Path) -> Any: - try: - from huggingface_hub import CommitOperationAdd - except ImportError as error: # pragma: no cover - raise RuntimeError( - "huggingface_hub is required for Hugging Face commit operations." 
- ) from error - return CommitOperationAdd( - path_in_repo=path_in_repo, - path_or_fileobj=str(local_path), - ) - - -def _commit_add_bytes(path_in_repo: str, payload: dict[str, Any]) -> Any: - return _commit_add_text( - path_in_repo, - json.dumps(payload, indent=2, sort_keys=True) + "\n", - ) - - -def _commit_add_text(path_in_repo: str, text: str) -> Any: - try: - from huggingface_hub import CommitOperationAdd - except ImportError as error: # pragma: no cover - raise RuntimeError( - "huggingface_hub is required for Hugging Face commit operations." - ) from error - return CommitOperationAdd( - path_in_repo=path_in_repo, - path_or_fileobj=text.encode("utf-8"), - ) - - -def load_existing_registry_text(config: HuggingFacePublishConfig) -> str: - """Download the existing diagnostics registry JSONL, or return empty text.""" - if config.diagnostics_repo is None: - return "" - try: - from huggingface_hub import hf_hub_download - except ImportError: - return "" - try: - path = hf_hub_download( - repo_id=config.diagnostics_repo, - filename="run_registry.jsonl", - repo_type=config.repo_type, - token=config.token, - ) - except Exception: - return "" - return Path(path).read_text() - - -def _append_registry_jsonl(existing_text: str, entry: dict[str, Any]) -> str: - lines = [line for line in existing_text.splitlines() if line.strip()] - lines = [ - line - for line in lines - if json.loads(line).get("run_id") != entry["run_id"] - ] - lines.append(json.dumps(entry, sort_keys=True)) - return "\n".join(lines) + "\n" - - -def _write_json(path: str | Path, payload: dict[str, Any]) -> None: - resolved = Path(path) - temp_path = resolved.with_suffix(resolved.suffix + ".tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - temp_path.replace(resolved) - - -def _build_config_from_args(args: argparse.Namespace) -> HuggingFacePublishConfig: - env_config = HuggingFacePublishConfig.from_env() - return HuggingFacePublishConfig( - 
diagnostics_repo=args.diagnostics_repo or env_config.diagnostics_repo, - dataset_repo=args.dataset_repo or env_config.dataset_repo, - repo_type=args.repo_type or env_config.repo_type, - token=args.token or env_config.token, - diagnostics_run_prefix=( - args.diagnostics_run_prefix or env_config.diagnostics_run_prefix - ), - dataset_staging_prefix=( - args.dataset_staging_prefix or env_config.dataset_staging_prefix - ), - ) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Publish a Microplex artifact bundle to Hugging Face." - ) - parser.add_argument("artifact_dir", type=Path) - parser.add_argument("--run-id", default=None) - parser.add_argument("--diagnostics-repo", default=None) - parser.add_argument("--dataset-repo", default=None) - parser.add_argument("--repo-type", default=None) - parser.add_argument("--token", default=None) - parser.add_argument("--diagnostics-run-prefix", default=None) - parser.add_argument("--dataset-staging-prefix", default=None) - parser.add_argument( - "--no-diagnostics", - action="store_true", - help="Do not publish diagnostics JSON files.", - ) - parser.add_argument( - "--publish-dataset", - action="store_true", - help="Also publish policyengine_us.h5 and manifest.json to the dataset repo.", - ) - parser.add_argument( - "--promote-dataset", - action="store_true", - help="Also write policyengine_us.h5 and manifest.json at the dataset repo root.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Write hf_publish_manifest.json without uploading.", - ) - args = parser.parse_args(argv) - try: - config = _build_config_from_args(args) - result = publish_microplex_artifact_to_hf( - args.artifact_dir, - config, - run_id=args.run_id, - publish_diagnostics=not args.no_diagnostics, - publish_dataset=args.publish_dataset or args.promote_dataset, - promote_dataset=args.promote_dataset, - dry_run=args.dry_run, - ) - except Exception as error: # noqa: BLE001 - CLI should 
report concise failure. - print(f"Hugging Face publish failed: {error}", file=sys.stderr) - return 1 - mode = "planned" if args.dry_run else "published" - print(f"Hugging Face artifact {mode}: {result['run_id']}") - print(args.artifact_dir / HF_PUBLISH_MANIFEST_FILENAME) - return 0 - - -def main_smoke(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Smoke-check a published Microplex Hugging Face artifact." - ) - parser.add_argument("--run-id", default=None) - parser.add_argument("--diagnostics-repo", default=None) - parser.add_argument("--dataset-repo", default=None) - parser.add_argument("--repo-type", default=None) - parser.add_argument("--token", default=None) - parser.add_argument("--diagnostics-run-prefix", default=None) - parser.add_argument("--dataset-staging-prefix", default=None) - parser.add_argument( - "--no-dataset", - action="store_true", - help="Only check diagnostics files.", - ) - parser.add_argument( - "--no-promoted-dataset", - action="store_true", - help="Do not require root policyengine_us.h5 and manifest.json.", - ) - parser.add_argument( - "--json", - action="store_true", - help="Print the full smoke-check payload as JSON.", - ) - args = parser.parse_args(argv) - try: - config = _build_config_from_args(args) - result = smoke_published_hf_artifact( - config, - run_id=args.run_id, - check_dataset=not args.no_dataset, - check_promoted_dataset=not args.no_promoted_dataset, - ) - except Exception as error: # noqa: BLE001 - CLI should report concise failure. 
- print(f"Hugging Face smoke check failed: {error}", file=sys.stderr) - return 1 - if args.json: - print(json.dumps(result, indent=2, sort_keys=True)) - elif result["status"] == "passed": - print(f"Hugging Face artifact smoke check passed: {result['run_id']}") - else: - print( - f"Hugging Face artifact smoke check failed: {result['run_id']} " - f"({result['missing_count']} missing files)", - file=sys.stderr, - ) - return 0 if result["status"] == "passed" else 1 - - -if __name__ == "__main__": # pragma: no cover - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/imputation_ablation.py b/src/microplex_us/pipelines/imputation_ablation.py deleted file mode 100644 index 2793f051..00000000 --- a/src/microplex_us/pipelines/imputation_ablation.py +++ /dev/null @@ -1,454 +0,0 @@ -"""Ablation scoring for donor-imputation conditioning hypotheses.""" - -from __future__ import annotations - -from collections.abc import Mapping, Sequence -from dataclasses import asdict, dataclass, field -from typing import Any - -import numpy as np -import pandas as pd - -from microplex_us.variables import PUF_IRS_TAX_PREFERRED_CONDITION_VARS - - -@dataclass(frozen=True) -class ImputationAblationVariant: - """Describe one imputation strategy under test.""" - - name: str - description: str - condition_selection: str - hard_gate_columns: tuple[str, ...] = () - primary_predictors: tuple[str, ...] = () - secondary_predictors: tuple[str, ...] = () - forbidden_predictors: tuple[str, ...] = () - support_mapping: str = "rank" - semantic_guards: bool = False - - -@dataclass(frozen=True) -class ImputationAblationSliceSpec: - """Joint slice used to test conditional imputation structure.""" - - name: str - columns: tuple[str, ...] 
- min_weight: float = 0.0 - - -@dataclass(frozen=True) -class ImputationTargetScore: - """Target-level observed-vs-imputed score for one variant.""" - - target: str - row_count: int - observed_positive_rate: float - imputed_positive_rate: float - support_precision: float - support_recall: float - support_f1: float - mean_absolute_error: float - weighted_mean_absolute_error: float - weighted_total_relative_error: float - - -@dataclass(frozen=True) -class ImputationSliceScore: - """Conditional distribution score for one target and joint slice.""" - - target: str - slice_name: str - columns: tuple[str, ...] - cell_count: int - total_js_divergence: float | None - support_js_divergence: float | None - mean_abs_positive_rate_delta: float | None - - -@dataclass(frozen=True) -class ImputationAblationVariantScore: - """All scores for one imputation ablation variant.""" - - variant: ImputationAblationVariant - target_scores: dict[str, ImputationTargetScore] - slice_scores: tuple[ImputationSliceScore, ...] - aggregate_metrics: dict[str, float | None] - post_calibration_metrics: dict[str, float] = field(default_factory=dict) - - -@dataclass(frozen=True) -class ImputationAblationReport: - """Comparable pre/post calibration imputation ablation scorecard.""" - - row_count: int - targets: tuple[str, ...] - slice_specs: tuple[ImputationAblationSliceSpec, ...] 
- variants: dict[str, ImputationAblationVariantScore] - - def to_dict(self) -> dict[str, Any]: - return asdict(self) - - -def default_imputation_ablation_variants() -> tuple[ImputationAblationVariant, ...]: - """Return the first hypothesis test variants for QRF conditioning.""" - pe_tax_predictors = tuple(PUF_IRS_TAX_PREFERRED_CONDITION_VARS) - structural_gates = ( - "age_group", - "tax_unit_is_joint", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ) - return ( - ImputationAblationVariant( - name="broad_common_qrf", - description="Current-style QRF using every compatible common predictor.", - condition_selection="all_shared", - support_mapping="rank", - semantic_guards=False, - ), - ImputationAblationVariant( - name="structured_pe_conditioning", - description=( - "PE-style structural gates, preferred tax-unit predictors, and " - "zero-inflated support mapping." - ), - condition_selection="pe_prespecified", - hard_gate_columns=structural_gates, - primary_predictors=pe_tax_predictors, - secondary_predictors=("state_fips", "employment_income", "income"), - forbidden_predictors=("policyengine_output", "post_calibration_weight"), - support_mapping="zero_inflated_positive", - semantic_guards=True, - ), - ImputationAblationVariant( - name="broad_common_with_guards", - description=( - "Broad common-predictor QRF with the same semantic guards, isolating " - "guard effects from conditioning effects." - ), - condition_selection="all_shared", - support_mapping="rank", - semantic_guards=True, - ), - ImputationAblationVariant( - name="rich_predictor_stress", - description=( - "Over-rich predictor set used to test whether more predictors alone " - "beat explicit structure." 
- ), - condition_selection="all_shared", - secondary_predictors=("state_fips", "education", "occupation", "survey_id"), - support_mapping="rank", - semantic_guards=False, - ), - ) - - -def score_imputation_ablation_variants( - *, - observed_frame: pd.DataFrame, - imputed_frames: Mapping[str, pd.DataFrame], - target_variables: Sequence[str], - slice_specs: Sequence[ImputationAblationSliceSpec] = (), - variants: Sequence[ImputationAblationVariant] | None = None, - weight_column: str | None = None, - post_calibration_metrics: Mapping[str, Mapping[str, float]] | None = None, -) -> ImputationAblationReport: - """Score imputed candidate frames against masked-observed truth proxies.""" - targets = tuple(dict.fromkeys(target_variables)) - if not targets: - raise ValueError("target_variables must not be empty") - _require_columns(observed_frame, targets) - if weight_column is not None: - _require_columns(observed_frame, (weight_column,)) - - variant_defs = { - variant.name: variant - for variant in ( - tuple(variants) - if variants is not None - else default_imputation_ablation_variants() - ) - } - missing_variant_defs = set(imputed_frames) - set(variant_defs) - if missing_variant_defs: - for name in sorted(missing_variant_defs): - variant_defs[name] = ImputationAblationVariant( - name=name, - description="Ad hoc imputation candidate supplied to the ablation scorer.", - condition_selection="unspecified", - ) - - weights = _weight_series(observed_frame, weight_column) - scores: dict[str, ImputationAblationVariantScore] = {} - for variant_name, imputed_frame in imputed_frames.items(): - _validate_frame_pair(observed_frame, imputed_frame, targets) - target_scores = { - target: _score_target( - observed_frame[target], - imputed_frame[target], - weights=weights, - target=target, - ) - for target in targets - } - slice_scores = tuple( - _score_slice( - observed_frame=observed_frame, - imputed_frame=imputed_frame, - weights=weights, - target=target, - slice_spec=slice_spec, - ) - 
for target in targets - for slice_spec in slice_specs - ) - scores[variant_name] = ImputationAblationVariantScore( - variant=variant_defs[variant_name], - target_scores=target_scores, - slice_scores=slice_scores, - aggregate_metrics=_aggregate_variant_metrics( - target_scores=target_scores, - slice_scores=slice_scores, - ), - post_calibration_metrics=dict( - (post_calibration_metrics or {}).get(variant_name, {}) - ), - ) - - return ImputationAblationReport( - row_count=len(observed_frame), - targets=targets, - slice_specs=tuple(slice_specs), - variants=scores, - ) - - -def _score_target( - observed: pd.Series, - imputed: pd.Series, - *, - weights: pd.Series, - target: str, -) -> ImputationTargetScore: - observed_numeric = _numeric_series(observed) - imputed_numeric = _numeric_series(imputed) - observed_positive = observed_numeric > 0.0 - imputed_positive = imputed_numeric > 0.0 - true_positive_weight = float(weights[observed_positive & imputed_positive].sum()) - imputed_positive_weight = float(weights[imputed_positive].sum()) - observed_positive_weight = float(weights[observed_positive].sum()) - precision = _safe_ratio(true_positive_weight, imputed_positive_weight) - recall = _safe_ratio(true_positive_weight, observed_positive_weight) - absolute_error = (imputed_numeric - observed_numeric).abs() - observed_total = float((observed_numeric * weights).sum()) - imputed_total = float((imputed_numeric * weights).sum()) - return ImputationTargetScore( - target=target, - row_count=len(observed_numeric), - observed_positive_rate=_safe_ratio( - observed_positive_weight, float(weights.sum()) - ), - imputed_positive_rate=_safe_ratio( - imputed_positive_weight, float(weights.sum()) - ), - support_precision=precision, - support_recall=recall, - support_f1=_safe_f1(precision, recall), - mean_absolute_error=float(absolute_error.mean()), - weighted_mean_absolute_error=_safe_ratio( - float((absolute_error * weights).sum()), - float(weights.sum()), - ), - 
weighted_total_relative_error=_relative_error(imputed_total, observed_total), - ) - - -def _score_slice( - *, - observed_frame: pd.DataFrame, - imputed_frame: pd.DataFrame, - weights: pd.Series, - target: str, - slice_spec: ImputationAblationSliceSpec, -) -> ImputationSliceScore: - missing = [ - column for column in slice_spec.columns if column not in observed_frame.columns - ] - if missing: - return ImputationSliceScore( - target=target, - slice_name=slice_spec.name, - columns=slice_spec.columns, - cell_count=0, - total_js_divergence=None, - support_js_divergence=None, - mean_abs_positive_rate_delta=None, - ) - - observed_numeric = _numeric_series(observed_frame[target]) - imputed_numeric = _numeric_series(imputed_frame[target]) - cell_keys = _cell_keys(observed_frame, slice_spec.columns) - cells = sorted(cell_keys.unique()) - observed_totals: list[float] = [] - imputed_totals: list[float] = [] - observed_support: list[float] = [] - imputed_support: list[float] = [] - positive_rate_deltas: list[float] = [] - for cell in cells: - mask = cell_keys == cell - cell_weight = float(weights[mask].sum()) - if cell_weight <= slice_spec.min_weight: - continue - observed_cell = observed_numeric[mask] - imputed_cell = imputed_numeric[mask] - cell_weights = weights[mask] - observed_totals.append(float((observed_cell * cell_weights).sum())) - imputed_totals.append(float((imputed_cell * cell_weights).sum())) - observed_support_weight = float(cell_weights[observed_cell > 0.0].sum()) - imputed_support_weight = float(cell_weights[imputed_cell > 0.0].sum()) - observed_support.append(observed_support_weight) - imputed_support.append(imputed_support_weight) - positive_rate_deltas.append( - abs( - _safe_ratio(imputed_support_weight, cell_weight) - - _safe_ratio(observed_support_weight, cell_weight) - ) - ) - - return ImputationSliceScore( - target=target, - slice_name=slice_spec.name, - columns=slice_spec.columns, - cell_count=len(observed_totals), - 
total_js_divergence=_jensen_shannon_divergence(observed_totals, imputed_totals), - support_js_divergence=_jensen_shannon_divergence( - observed_support, imputed_support - ), - mean_abs_positive_rate_delta=( - float(np.mean(positive_rate_deltas)) if positive_rate_deltas else None - ), - ) - - -def _aggregate_variant_metrics( - *, - target_scores: Mapping[str, ImputationTargetScore], - slice_scores: Sequence[ImputationSliceScore], -) -> dict[str, float | None]: - return { - "mean_weighted_mae": _mean_or_none( - [score.weighted_mean_absolute_error for score in target_scores.values()] - ), - "mean_total_relative_error": _mean_or_none( - [score.weighted_total_relative_error for score in target_scores.values()] - ), - "mean_support_f1": _mean_or_none( - [score.support_f1 for score in target_scores.values()] - ), - "mean_slice_total_js_divergence": _mean_or_none( - [score.total_js_divergence for score in slice_scores] - ), - "mean_slice_support_js_divergence": _mean_or_none( - [score.support_js_divergence for score in slice_scores] - ), - "mean_slice_positive_rate_delta": _mean_or_none( - [score.mean_abs_positive_rate_delta for score in slice_scores] - ), - } - - -def _validate_frame_pair( - observed_frame: pd.DataFrame, - imputed_frame: pd.DataFrame, - targets: Sequence[str], -) -> None: - if len(imputed_frame) != len(observed_frame): - raise ValueError("observed_frame and imputed_frames must have the same length") - if not imputed_frame.index.equals(observed_frame.index): - raise ValueError("observed_frame and imputed_frames must have matching indexes") - _require_columns(imputed_frame, targets) - - -def _require_columns(frame: pd.DataFrame, columns: Sequence[str]) -> None: - missing = [column for column in columns if column not in frame.columns] - if missing: - raise ValueError(f"Frame is missing required columns: {missing}") - - -def _numeric_series(series: pd.Series) -> pd.Series: - return ( - pd.to_numeric(series, errors="coerce") - .replace([np.inf, -np.inf], 
np.nan) - .fillna(0.0) - .astype(float) - ) - - -def _weight_series(frame: pd.DataFrame, weight_column: str | None) -> pd.Series: - if weight_column is None: - return pd.Series(1.0, index=frame.index, dtype=float) - return _numeric_series(frame[weight_column]).clip(lower=0.0) - - -def _cell_keys(frame: pd.DataFrame, columns: Sequence[str]) -> pd.Series: - keys = frame.loc[:, list(columns)].astype("string").fillna("__MISSING__") - return keys.agg("|".join, axis=1) - - -def _relative_error(candidate: float, baseline: float) -> float: - baseline_abs = abs(float(baseline)) - if baseline_abs <= 1e-9: - return 0.0 if abs(float(candidate)) <= 1e-9 else 1.0 - return float(abs(float(candidate) - float(baseline)) / baseline_abs) - - -def _safe_ratio(numerator: float, denominator: float) -> float: - if abs(float(denominator)) <= 1e-12: - return 0.0 - return float(numerator) / float(denominator) - - -def _safe_f1(precision: float, recall: float) -> float: - denominator = precision + recall - if denominator <= 1e-12: - return 0.0 - return float(2.0 * precision * recall / denominator) - - -def _mean_or_none(values: Sequence[float | None]) -> float | None: - numeric = [float(value) for value in values if value is not None] - if not numeric: - return None - return float(np.mean(numeric)) - - -def _jensen_shannon_divergence( - observed_values: Sequence[float], - imputed_values: Sequence[float], -) -> float | None: - observed = np.asarray(observed_values, dtype=float).clip(min=0.0) - imputed = np.asarray(imputed_values, dtype=float).clip(min=0.0) - if observed.size == 0 or imputed.size == 0: - return None - observed_total = observed.sum() - imputed_total = imputed.sum() - if observed_total <= 1e-12 and imputed_total <= 1e-12: - return 0.0 - if observed_total <= 1e-12 or imputed_total <= 1e-12: - return 1.0 - observed_prob = observed / observed_total - imputed_prob = imputed / imputed_total - midpoint = 0.5 * (observed_prob + imputed_prob) - return float( - 0.5 * 
_kl_divergence(observed_prob, midpoint) - + 0.5 * _kl_divergence(imputed_prob, midpoint) - ) - - -def _kl_divergence(probabilities: np.ndarray, reference: np.ndarray) -> float: - mask = probabilities > 0.0 - return float( - np.sum(probabilities[mask] * np.log2(probabilities[mask] / reference[mask])) - ) diff --git a/src/microplex_us/pipelines/index_db.py b/src/microplex_us/pipelines/index_db.py deleted file mode 100644 index d16a8735..00000000 --- a/src/microplex_us/pipelines/index_db.py +++ /dev/null @@ -1,635 +0,0 @@ -"""Derived DuckDB index for querying saved US microplex artifacts.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -import duckdb - -from microplex_us.pipelines.registry import ( - FrontierMetric, - USMicroplexRunRegistryEntry, - load_us_microplex_run_registry, -) - -RUN_INDEX_FILENAME = "run_index.duckdb" - - -def resolve_us_microplex_run_index_path(path: str | Path) -> Path: - """Resolve a root directory or explicit DuckDB path to the run-index file.""" - candidate_path = Path(path) - if candidate_path.suffix == ".duckdb": - return candidate_path - return candidate_path / RUN_INDEX_FILENAME - - -def append_us_microplex_run_index_entry( - path: str | Path, - entry: USMicroplexRunRegistryEntry, - *, - policyengine_harness_payload: dict[str, Any] | None = None, -) -> Path: - """Upsert one saved run and its harness detail into the derived DuckDB index.""" - index_path = resolve_us_microplex_run_index_path(path) - index_path.parent.mkdir(parents=True, exist_ok=True) - with duckdb.connect(str(index_path)) as conn: - _ensure_schema(conn) - _delete_artifact_rows(conn, entry.artifact_id) - conn.execute( - """ - INSERT INTO runs ( - artifact_id, - created_at, - artifact_dir, - manifest_path, - policyengine_harness_path, - config_hash, - synthesis_backend, - calibration_backend, - source_names_json, - rows_seed, - rows_synthetic, - rows_calibrated, - weights_nonzero, - weights_total, - 
full_oracle_capped_mean_abs_relative_error, - full_oracle_mean_abs_relative_error, - candidate_mean_abs_relative_error, - baseline_mean_abs_relative_error, - mean_abs_relative_error_delta, - candidate_composite_parity_loss, - baseline_composite_parity_loss, - composite_parity_loss_delta, - slice_win_rate, - target_win_rate, - supported_target_rate, - tag_summaries_json, - parity_scorecard_json, - baseline_dataset, - targets_db, - target_period, - target_variables_json, - target_domains_json, - target_geo_levels_json, - target_reform_id, - policyengine_us_runtime_version, - improved_candidate_frontier, - improved_delta_frontier, - improved_composite_frontier, - metadata_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - _run_row(entry), - ) - if policyengine_harness_payload is not None: - slice_rows = _slice_rows(entry.artifact_id, policyengine_harness_payload) - if slice_rows: - conn.executemany( - """ - INSERT INTO slice_metrics ( - artifact_id, - slice_name, - description, - tags_json, - query_json, - candidate_supported_target_count, - candidate_unsupported_target_count, - candidate_mean_abs_relative_error, - candidate_max_abs_relative_error, - baseline_supported_target_count, - baseline_unsupported_target_count, - baseline_mean_abs_relative_error, - baseline_max_abs_relative_error, - mean_abs_relative_error_delta, - candidate_beats_baseline - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - slice_rows, - ) - target_rows = _target_metric_rows(entry.artifact_id, policyengine_harness_payload) - if target_rows: - conn.executemany( - """ - INSERT INTO target_metrics ( - artifact_id, - slice_name, - target_key, - target_name, - entity, - period, - measure, - aggregation, - target_value, - tolerance, - source, - units, - description, - geo_level, - geographic_id, - domain_variable, - target_metadata_json, - filters_json, - candidate_actual_value, - candidate_absolute_error, - candidate_relative_error, - baseline_actual_value, - baseline_absolute_error, - baseline_relative_error, - candidate_beats_baseline - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - target_rows, - ) - return index_path - - -def rebuild_us_microplex_run_index( - path: str | Path, - *, - registry_path: str | Path, -) -> Path: - """Rebuild the derived DuckDB index from canonical artifacts and registry entries.""" - index_path = resolve_us_microplex_run_index_path(path) - index_path.parent.mkdir(parents=True, exist_ok=True) - with duckdb.connect(str(index_path)) as conn: - _ensure_schema(conn) - conn.execute("DELETE FROM target_metrics") - conn.execute("DELETE FROM slice_metrics") - conn.execute("DELETE FROM runs") - for entry in load_us_microplex_run_registry(registry_path): - append_us_microplex_run_index_entry( - index_path, - entry, - policyengine_harness_payload=_load_harness_payload(entry.policyengine_harness_path), - ) - return index_path - - -def select_us_microplex_frontier_index_row( - path: str | Path, - *, - metric: FrontierMetric = "candidate_composite_parity_loss", -) -> dict[str, Any] | None: - """Select the best indexed run by one frontier metric.""" - metric_column = { - "full_oracle_capped_mean_abs_relative_error": "full_oracle_capped_mean_abs_relative_error", - "full_oracle_mean_abs_relative_error": "full_oracle_mean_abs_relative_error", - "candidate_composite_parity_loss": "candidate_composite_parity_loss", - 
"candidate_mean_abs_relative_error": "candidate_mean_abs_relative_error", - "mean_abs_relative_error_delta": "mean_abs_relative_error_delta", - }[metric] - index_path = resolve_us_microplex_run_index_path(path) - if not index_path.exists(): - return None - with duckdb.connect(str(index_path), read_only=True) as conn: - row = conn.execute( - f""" - SELECT * - FROM runs - WHERE {metric_column} IS NOT NULL - ORDER BY {metric_column} ASC, created_at ASC - LIMIT 1 - """ - ).fetchone() - if row is None: - return None - columns = [column[0] for column in conn.description] - return dict(zip(columns, row, strict=True)) - - -def list_us_microplex_target_delta_rows( - path: str | Path, - *, - artifact_id: str | None = None, - slice_name: str | None = None, - limit: int | None = None, -) -> list[dict[str, Any]]: - """List indexed per-target deltas, ordered by candidate-vs-baseline improvement.""" - index_path = resolve_us_microplex_run_index_path(path) - if not index_path.exists(): - return [] - conditions: list[str] = [] - parameters: list[Any] = [] - if artifact_id is not None: - conditions.append("artifact_id = ?") - parameters.append(artifact_id) - if slice_name is not None: - conditions.append("slice_name = ?") - parameters.append(slice_name) - where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else "" - limit_clause = f"LIMIT {int(limit)}" if limit is not None else "" - with duckdb.connect(str(index_path), read_only=True) as conn: - rows = conn.execute( - f""" - SELECT - artifact_id, - slice_name, - target_name, - geo_level, - domain_variable, - target_value, - candidate_relative_error, - baseline_relative_error, - ABS(candidate_relative_error) - ABS(baseline_relative_error) AS abs_relative_error_delta_vs_baseline, - candidate_beats_baseline - FROM target_metrics - {where_clause} - ORDER BY abs_relative_error_delta_vs_baseline ASC NULLS LAST, target_name ASC - {limit_clause} - """, - parameters, - ).fetchall() - columns = [column[0] for column in 
conn.description] - return [dict(zip(columns, row, strict=True)) for row in rows] - - -def compare_us_microplex_target_delta_rows( - path: str | Path, - *, - artifact_id: str, - baseline_artifact_id: str, - slice_name: str | None = None, - limit: int | None = None, -) -> list[dict[str, Any]]: - """Compare per-target candidate error between two saved artifacts.""" - index_path = resolve_us_microplex_run_index_path(path) - if not index_path.exists(): - return [] - conditions = [ - "current.artifact_id = ?", - "baseline.artifact_id = ?", - "current.target_key = baseline.target_key", - "current.slice_name = baseline.slice_name", - ] - parameters: list[Any] = [artifact_id, baseline_artifact_id] - if slice_name is not None: - conditions.append("current.slice_name = ?") - parameters.append(slice_name) - where_clause = f"WHERE {' AND '.join(conditions)}" - limit_clause = f"LIMIT {int(limit)}" if limit is not None else "" - with duckdb.connect(str(index_path), read_only=True) as conn: - rows = conn.execute( - f""" - SELECT - current.artifact_id, - baseline.artifact_id AS baseline_artifact_id, - current.slice_name, - current.target_key, - current.target_name, - current.geo_level, - current.domain_variable, - current.target_value, - current.candidate_relative_error, - baseline.candidate_relative_error AS baseline_candidate_relative_error, - ABS(current.candidate_relative_error) - ABS(baseline.candidate_relative_error) - AS abs_relative_error_delta_vs_other, - current.candidate_beats_baseline, - baseline.candidate_beats_baseline AS baseline_candidate_beats_baseline - FROM target_metrics AS current - JOIN target_metrics AS baseline - ON current.target_key = baseline.target_key - AND current.slice_name = baseline.slice_name - {where_clause} - ORDER BY abs_relative_error_delta_vs_other ASC NULLS LAST, current.target_name ASC - {limit_clause} - """, - parameters, - ).fetchall() - columns = [column[0] for column in conn.description] - return [dict(zip(columns, row, strict=True)) 
for row in rows] - - -def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS runs ( - artifact_id TEXT PRIMARY KEY, - created_at TEXT, - artifact_dir TEXT, - manifest_path TEXT, - policyengine_harness_path TEXT, - config_hash TEXT, - synthesis_backend TEXT, - calibration_backend TEXT, - source_names_json TEXT, - rows_seed BIGINT, - rows_synthetic BIGINT, - rows_calibrated BIGINT, - weights_nonzero DOUBLE, - weights_total DOUBLE, - full_oracle_capped_mean_abs_relative_error DOUBLE, - full_oracle_mean_abs_relative_error DOUBLE, - candidate_mean_abs_relative_error DOUBLE, - baseline_mean_abs_relative_error DOUBLE, - mean_abs_relative_error_delta DOUBLE, - candidate_composite_parity_loss DOUBLE, - baseline_composite_parity_loss DOUBLE, - composite_parity_loss_delta DOUBLE, - slice_win_rate DOUBLE, - target_win_rate DOUBLE, - supported_target_rate DOUBLE, - tag_summaries_json TEXT, - parity_scorecard_json TEXT, - baseline_dataset TEXT, - targets_db TEXT, - target_period BIGINT, - target_variables_json TEXT, - target_domains_json TEXT, - target_geo_levels_json TEXT, - target_reform_id BIGINT, - policyengine_us_runtime_version TEXT, - improved_candidate_frontier BOOLEAN, - improved_delta_frontier BOOLEAN, - improved_composite_frontier BOOLEAN, - metadata_json TEXT - ) - """ - ) - _ensure_column( - conn, - "runs", - "full_oracle_capped_mean_abs_relative_error", - "DOUBLE", - ) - _ensure_column( - conn, - "runs", - "full_oracle_mean_abs_relative_error", - "DOUBLE", - ) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS slice_metrics ( - artifact_id TEXT, - slice_name TEXT, - description TEXT, - tags_json TEXT, - query_json TEXT, - candidate_supported_target_count BIGINT, - candidate_unsupported_target_count BIGINT, - candidate_mean_abs_relative_error DOUBLE, - candidate_max_abs_relative_error DOUBLE, - baseline_supported_target_count BIGINT, - baseline_unsupported_target_count BIGINT, - 
baseline_mean_abs_relative_error DOUBLE, - baseline_max_abs_relative_error DOUBLE, - mean_abs_relative_error_delta DOUBLE, - candidate_beats_baseline BOOLEAN - ) - """ - ) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS target_metrics ( - artifact_id TEXT, - slice_name TEXT, - target_key TEXT, - target_name TEXT, - entity TEXT, - period BIGINT, - measure TEXT, - aggregation TEXT, - target_value DOUBLE, - tolerance DOUBLE, - source TEXT, - units TEXT, - description TEXT, - geo_level TEXT, - geographic_id TEXT, - domain_variable TEXT, - target_metadata_json TEXT, - filters_json TEXT, - candidate_actual_value DOUBLE, - candidate_absolute_error DOUBLE, - candidate_relative_error DOUBLE, - baseline_actual_value DOUBLE, - baseline_absolute_error DOUBLE, - baseline_relative_error DOUBLE, - candidate_beats_baseline BOOLEAN - ) - """ - ) - - -def _delete_artifact_rows(conn: duckdb.DuckDBPyConnection, artifact_id: str) -> None: - conn.execute("DELETE FROM target_metrics WHERE artifact_id = ?", [artifact_id]) - conn.execute("DELETE FROM slice_metrics WHERE artifact_id = ?", [artifact_id]) - conn.execute("DELETE FROM runs WHERE artifact_id = ?", [artifact_id]) - - -def _run_row(entry: USMicroplexRunRegistryEntry) -> tuple[Any, ...]: - return ( - entry.artifact_id, - entry.created_at, - entry.artifact_dir, - entry.manifest_path, - entry.policyengine_harness_path, - entry.config_hash, - entry.synthesis_backend, - entry.calibration_backend, - _json_text(entry.source_names), - _int_or_none(entry.rows.get("seed")), - _int_or_none(entry.rows.get("synthetic")), - _int_or_none(entry.rows.get("calibrated")), - _float_or_none(entry.weights.get("nonzero")), - _float_or_none(entry.weights.get("total")), - _float_or_none(entry.full_oracle_capped_mean_abs_relative_error), - _float_or_none(entry.full_oracle_mean_abs_relative_error), - _float_or_none(entry.candidate_mean_abs_relative_error), - _float_or_none(entry.baseline_mean_abs_relative_error), - 
_float_or_none(entry.mean_abs_relative_error_delta), - _float_or_none(entry.candidate_composite_parity_loss), - _float_or_none(entry.baseline_composite_parity_loss), - _float_or_none(entry.composite_parity_loss_delta), - _float_or_none(entry.slice_win_rate), - _float_or_none(entry.target_win_rate), - _float_or_none(entry.supported_target_rate), - _json_text(entry.tag_summaries), - _json_text(entry.parity_scorecard), - entry.baseline_dataset, - entry.targets_db, - _int_or_none(entry.target_period), - _json_text(entry.target_variables), - _json_text(entry.target_domains), - _json_text(entry.target_geo_levels), - _int_or_none(entry.target_reform_id), - entry.policyengine_us_runtime_version, - entry.improved_candidate_frontier, - entry.improved_delta_frontier, - entry.improved_composite_frontier, - _json_text(entry.metadata), - ) - - -def _ensure_column( - conn: duckdb.DuckDBPyConnection, - table_name: str, - column_name: str, - column_type: str, -) -> None: - existing = { - str(row[1]) - for row in conn.execute(f"PRAGMA table_info('{table_name}')").fetchall() - } - if column_name in existing: - return - conn.execute( - f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_type}" - ) - - -def _slice_rows( - artifact_id: str, - harness_payload: dict[str, Any], -) -> list[tuple[Any, ...]]: - rows: list[tuple[Any, ...]] = [] - for slice_payload in harness_payload.get("slices", []): - summary = dict(slice_payload.get("summary", {})) - delta = summary.get("mean_abs_relative_error_delta") - rows.append( - ( - artifact_id, - slice_payload["name"], - slice_payload.get("description"), - _json_text(slice_payload.get("tags", [])), - _json_text(slice_payload.get("query", {})), - _int_or_none(summary.get("candidate_supported_target_count")), - _int_or_none(summary.get("candidate_unsupported_target_count")), - _float_or_none(summary.get("candidate_mean_abs_relative_error")), - _float_or_none(summary.get("candidate_max_abs_relative_error")), - 
_int_or_none(summary.get("baseline_supported_target_count")), - _int_or_none(summary.get("baseline_unsupported_target_count")), - _float_or_none(summary.get("baseline_mean_abs_relative_error")), - _float_or_none(summary.get("baseline_max_abs_relative_error")), - _float_or_none(delta), - (delta < 0.0) if delta is not None else None, - ) - ) - return rows - - -def _target_metric_rows( - artifact_id: str, - harness_payload: dict[str, Any], -) -> list[tuple[Any, ...]]: - rows: list[tuple[Any, ...]] = [] - for slice_payload in harness_payload.get("slices", []): - slice_name = slice_payload["name"] - candidate_payload = dict(slice_payload.get("candidate", {})) - baseline_payload = ( - dict(slice_payload.get("baseline", {})) - if slice_payload.get("baseline") is not None - else {} - ) - candidate_by_key = { - _target_key(item["target"]): item - for item in candidate_payload.get("evaluations", []) - } - baseline_by_key = { - _target_key(item["target"]): item - for item in baseline_payload.get("evaluations", []) - } - for target_key in sorted(set(candidate_by_key) | set(baseline_by_key)): - candidate_item = candidate_by_key.get(target_key) - baseline_item = baseline_by_key.get(target_key) - target_payload = ( - dict(candidate_item["target"]) - if candidate_item is not None - else dict(baseline_item["target"]) - ) - metadata = dict(target_payload.get("metadata", {})) - candidate_relative_error = _float_or_none( - candidate_item.get("relative_error") - if candidate_item is not None - else None - ) - baseline_relative_error = _float_or_none( - baseline_item.get("relative_error") - if baseline_item is not None - else None - ) - candidate_beats_baseline = None - if ( - candidate_relative_error is not None - and baseline_relative_error is not None - ): - candidate_beats_baseline = abs(candidate_relative_error) < abs( - baseline_relative_error - ) - rows.append( - ( - artifact_id, - slice_name, - target_key, - target_payload["name"], - target_payload["entity"], - 
_int_or_none(target_payload.get("period")), - target_payload.get("measure"), - target_payload.get("aggregation"), - _float_or_none(target_payload.get("value")), - _float_or_none(target_payload.get("tolerance")), - target_payload.get("source"), - target_payload.get("units"), - target_payload.get("description"), - metadata.get("geo_level"), - metadata.get("geographic_id"), - metadata.get("domain_variable"), - _json_text(metadata), - _json_text(target_payload.get("filters", [])), - _float_or_none( - candidate_item.get("actual_value") - if candidate_item is not None - else None - ), - _float_or_none( - candidate_item.get("absolute_error") - if candidate_item is not None - else None - ), - candidate_relative_error, - _float_or_none( - baseline_item.get("actual_value") - if baseline_item is not None - else None - ), - _float_or_none( - baseline_item.get("absolute_error") - if baseline_item is not None - else None - ), - baseline_relative_error, - candidate_beats_baseline, - ) - ) - return rows - - -def _load_harness_payload(path: str | None) -> dict[str, Any] | None: - if path is None: - return None - harness_path = Path(path) - if not harness_path.exists(): - return None - return json.loads(harness_path.read_text()) - - -def _target_key(target_payload: dict[str, Any]) -> str: - return json.dumps(target_payload, sort_keys=True, separators=(",", ":")) - - -def _json_text(value: Any) -> str: - return json.dumps(value, sort_keys=True) - - -def _int_or_none(value: Any) -> int | None: - if value is None: - return None - return int(value) - - -def _float_or_none(value: Any) -> float | None: - if value is None: - return None - return float(value) diff --git a/src/microplex_us/pipelines/local_reweighting.py b/src/microplex_us/pipelines/local_reweighting.py deleted file mode 100644 index 7bbd2e9e..00000000 --- a/src/microplex_us/pipelines/local_reweighting.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Thin US adapter for shared target-driven household reweighting.""" - -from __future__ 
import annotations - -from dataclasses import dataclass - -import pandas as pd -from microplex.core import EntityType -from microplex.targets import ( - EntityTableBinding, - EntityTableBundle, - TargetConstraintCompilationResult, - TargetReweightingDiagnostics, - TargetSpec, - reweight_entity_table_bundle_targets, -) - -from microplex_us.policyengine.us import PolicyEngineUSEntityTableBundle - - -@dataclass(frozen=True) -class USHouseholdTargetReweightingResult: - """Result of applying shared target reweighting to a US PE table bundle.""" - - tables: PolicyEngineUSEntityTableBundle - compilation: TargetConstraintCompilationResult - diagnostics: TargetReweightingDiagnostics - - -def reweight_us_household_targets( - tables: PolicyEngineUSEntityTableBundle, - *, - targets: list[TargetSpec], - max_iter: int = 8, - tol: float = 1e-4, - factor_bounds: tuple[float, float] = (0.5, 2.0), -) -> USHouseholdTargetReweightingResult: - """Reweight US household-aligned PE tables using the shared core module.""" - bundle_result = reweight_entity_table_bundle_targets( - _as_entity_table_bundle(tables), - targets=targets, - max_iter=max_iter, - tol=tol, - factor_bounds=factor_bounds, - ) - updated_tables = _policyengine_us_bundle_from_entity_table_bundle( - tables, - bundle_result.bundle, - ) - return USHouseholdTargetReweightingResult( - tables=updated_tables, - compilation=bundle_result.compilation, - diagnostics=bundle_result.diagnostics, - ) - - -def _as_entity_table_bundle(tables: PolicyEngineUSEntityTableBundle) -> EntityTableBundle: - bindings: dict[EntityType, EntityTableBinding] = { - EntityType.HOUSEHOLD: EntityTableBinding( - frame=tables.households, - id_column="household_id", - ), - } - for entity, frame, id_column, weight_link_column, synced_weight_column in ( - (EntityType.PERSON, tables.persons, "person_id", "household_id", "weight"), - ( - EntityType.TAX_UNIT, - tables.tax_units, - "tax_unit_id", - "household_id", - "household_weight", - ), - ( - 
EntityType.SPM_UNIT, - tables.spm_units, - "spm_unit_id", - "household_id", - "household_weight", - ), - ( - EntityType.FAMILY, - tables.families, - "family_id", - "household_id", - "household_weight", - ), - ): - if frame is None: - continue - resolved_link_column = ( - weight_link_column - if weight_link_column in frame.columns - else "person_household_id" - if "person_household_id" in frame.columns - else None - ) - bindings[entity] = EntityTableBinding( - frame=frame, - id_column=id_column, - weight_link_column=resolved_link_column, - synced_weight_column=synced_weight_column if resolved_link_column else None, - ) - return EntityTableBundle( - weight_entity=EntityType.HOUSEHOLD, - weight_column="household_weight", - bindings=bindings, - ) - - -def _policyengine_us_bundle_from_entity_table_bundle( - tables: PolicyEngineUSEntityTableBundle, - bundle: EntityTableBundle, -) -> PolicyEngineUSEntityTableBundle: - households = bundle.table_for(EntityType.HOUSEHOLD).copy() - household_weights = households.set_index("household_id")["household_weight"] - return PolicyEngineUSEntityTableBundle( - households=households, - persons=_table_or_synced_weights( - bundle, - tables.persons, - EntityType.PERSON, - household_weights, - weight_column="weight", - ), - tax_units=_table_or_synced_weights( - bundle, - tables.tax_units, - EntityType.TAX_UNIT, - household_weights, - weight_column="household_weight", - ), - spm_units=_table_or_synced_weights( - bundle, - tables.spm_units, - EntityType.SPM_UNIT, - household_weights, - weight_column="household_weight", - ), - families=_table_or_synced_weights( - bundle, - tables.families, - EntityType.FAMILY, - household_weights, - weight_column="household_weight", - ), - marital_units=_sync_entity_weights( - tables.marital_units, - household_weights, - weight_column="household_weight", - ), - ) - - -def _table_or_synced_weights( - bundle: EntityTableBundle, - fallback_frame: pd.DataFrame | None, - entity: EntityType, - household_weights: 
pd.Series, - *, - weight_column: str, -) -> pd.DataFrame | None: - if fallback_frame is None: - return None - try: - return bundle.table_for(entity).copy() - except KeyError: - return _sync_entity_weights( - fallback_frame, - household_weights, - weight_column=weight_column, - ) - - -def _sync_entity_weights( - frame: pd.DataFrame | None, - household_weights: pd.Series, - *, - weight_column: str, -) -> pd.DataFrame | None: - if frame is None: - return None - updated = frame.copy() - household_id_column = ( - "person_household_id" if "person_household_id" in updated.columns else "household_id" - ) - if household_id_column not in updated.columns: - return updated - updated[weight_column] = updated[household_id_column].map(household_weights) - return updated diff --git a/src/microplex_us/pipelines/mp300k_artifact_gates.py b/src/microplex_us/pipelines/mp300k_artifact_gates.py deleted file mode 100644 index e17fc34f..00000000 --- a/src/microplex_us/pipelines/mp300k_artifact_gates.py +++ /dev/null @@ -1,2568 +0,0 @@ -"""Persistent artifact quality gates for mp-300k candidates.""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import re -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Literal - -import h5py -import numpy as np - -from microplex_us.pipelines.mp_benchmark_manifest import ( - frozen_production_ecps_benchmark_manifest_descriptor, - frozen_production_pin_mismatches, - load_frozen_production_ecps_benchmark_manifest, -) - -GateStatus = Literal["pass", "fail", "unmeasured"] - -_ENTITY_ID_ARRAYS = { - "household": "household_id", - "person": "person_id", - "tax_unit": "tax_unit_id", - "spm_unit": "spm_unit_id", - "family": "family_id", - "marital_unit": "marital_unit_id", -} -_PERSON_LINK_ARRAYS = { - "household": "person_household_id", - "tax_unit": "person_tax_unit_id", - "spm_unit": "person_spm_unit_id", - "family": "person_family_id", - "marital_unit": "person_marital_unit_id", -} 
-_REQUIRED_PERIOD_ARRAYS = ( - "household_id", - "household_weight", - "person_id", - "person_household_id", - "tax_unit_id", - "person_tax_unit_id", - "spm_unit_id", - "person_spm_unit_id", - "family_id", - "person_family_id", - "marital_unit_id", - "person_marital_unit_id", -) -_DEFAULT_REQUIRED_GATES = ( - "candidate_artifact", - "compatibility", - "column_contract", - "export_support", - "export_lineage", - "artifact_size", - "runtime", - "source_weight_diagnostics", - "ecps_comparison", - "arch_target_coverage", - "benchmark_manifest", -) -_DEFAULT_ARCH_COVERAGE_PROFILE = "pe_native_broad_source_backed" -_PROTECTED_ECPS_TARGET_FAMILIES = ( - "ssi", - "snap", - "wages", - "self_employment_income", - "capital_gains", - "interest", - "dividends", - "retirement_income", - "disability", - "household_net_income", -) -_CORE_BENCHMARK_ECPS_TARGET_FAMILIES = ( - "state_agi_distribution", - "state_age_distribution", - "national_ssa", - "national_irs_other", - "state_aca_spending", -) -_FAMILY_FLOOR_ALIASES = { - "state_aca_spending": ("state_aca_enrollment",), -} -_PROTECTED_FAMILY_RELATIVE_LOSS_TOLERANCE = 0.05 -_PROTECTED_FAMILY_ABSOLUTE_LOSS_TOLERANCE = 0.005 -_DEFAULT_MAX_SUPPORT_WEIGHT_SHARE = 0.25 -_FORBIDDEN_SOURCE_DIAGNOSTIC_VARIABLES = frozenset( - { - "ssi_reported", - "ssi_amount_reported", - "ssdi_reported", - "snap_reported", - "tanf_reported", - "wic_reported", - "source_dataset", - "source_survey", - "donor_source", - "imputation_source", - } -) -_FORBIDDEN_SOURCE_DIAGNOSTIC_PREFIXES = ( - "diagnostic_", - "source_dataset_", - "source_survey_", - "donor_source_", - "imputation_source_", -) -_FORBIDDEN_SOURCE_DIAGNOSTIC_SUFFIXES = ( - "_diagnostic", - "_diagnostics", - "_source_dataset", - "_source_survey", - "_donor_source", - "_imputation_source", -) -_REQUIRED_BENCHMARK_MANIFEST_EVIDENCE = { - "certificate_type": ( - ("certificate_type",), - ("frozen_ecps_baseline_certificate", "certificate_type"), - ), - "period": ( - ("period",), - ("target_period",), 
- ), - "baseline_dataset.path": ( - ("baseline_dataset", "path"), - ("baseline_dataset_path",), - ("enhanced_cps", "path"), - ("enhanced_cps_path",), - ), - "baseline_dataset.sha256": ( - ("baseline_dataset", "sha256"), - ("baseline_dataset_sha256",), - ("enhanced_cps", "sha256"), - ("enhanced_cps_sha256",), - ), - "policyengine_us_data.commit": ( - ("policyengine_us_data", "commit"), - ("policyengine_us_data", "commit_sha"), - ("policyengine_us_data_commit",), - ("policyengine_us_data_commit_sha",), - ("us_data_commit",), - ), - "policyengine_us.version": ( - ("policyengine_us", "version"), - ("policyengine_us_version",), - ), - "target_surface.target_profile": ( - ("target_surface", "target_profile"), - ("target_profile",), - ), - "target_surface.target_scope": ( - ("target_surface", "target_scope"), - ("target_scope",), - ("target_scope_filter",), - ), - "target_surface.target_count": ( - ("target_surface", "target_count"), - ("target_count",), - ), - "target_surface.target_names_sha256": ( - ("target_surface", "target_names_sha256"), - ("target_names_sha256",), - ), - "scoring_config.sha256": ( - ("scoring_config", "sha256"), - ("scoring_config_sha256",), - ), - "baseline_metrics.baseline_enhanced_cps_native_loss": ( - ("baseline_metrics", "baseline_enhanced_cps_native_loss"), - ("baseline_enhanced_cps_native_loss",), - ), - "baseline_metrics.baseline_holdout_loss": ( - ("baseline_metrics", "baseline_holdout_loss"), - ("baseline_holdout_loss",), - ), - "baseline_metrics.baseline_unweighted_msre": ( - ("baseline_metrics", "baseline_unweighted_msre"), - ("baseline_unweighted_msre",), - ), - "target_db.path": ( - ("target_db", "path"), - ("target_db_path",), - ("targets_db", "path"), - ("policyengine_targets_db", "path"), - ), - "target_db.sha256": ( - ("target_db", "sha256"), - ("target_db_sha256",), - ("targets_db", "sha256"), - ("policyengine_targets_db", "sha256"), - ("policyengine_targets_db_sha256",), - ), -} -_HEX_RE = re.compile(r"^[0-9a-fA-F]+$") - - -def 
build_mp300k_artifact_gate_report( - artifact_dir: str | Path, - *, - candidate_dataset_path: str | Path | None = None, - baseline_dataset_path: str | Path | None = None, - ecps_comparison_payload: Any = None, - arch_coverage_payload: dict[str, Any] | None = None, - runtime_smoke_payload: dict[str, Any] | None = None, - source_weight_diagnostics_payload: dict[str, Any] | None = None, - source_weight_diagnostics_path: str | Path | None = None, - benchmark_manifest_path: str | Path | None = None, - period: int = 2024, - arch_coverage_profile: str = _DEFAULT_ARCH_COVERAGE_PROFILE, - artifact_size_ratio_threshold: float = 2.0, - runtime_ratio_threshold: float = 1.25, - max_support_weight_share: float = _DEFAULT_MAX_SUPPORT_WEIGHT_SHARE, - compute_native_scores: bool = True, - require_ecps_comparison: bool = True, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> dict[str, Any]: - """Build a CI-friendly artifact gate report for one mp-300k candidate. - - The report is evidence-driven. It can consume precomputed runtime and - eCPS-comparison payloads, or compute the PE-native broad score when a - baseline dataset is available. 
- """ - - artifact_root = Path(artifact_dir).expanduser() - manifest_path = artifact_root / "manifest.json" - manifest = _load_manifest(manifest_path) - candidate_dataset = _resolve_candidate_dataset_path( - artifact_root, - manifest, - candidate_dataset_path, - ) - baseline_dataset = ( - Path(baseline_dataset_path).expanduser() - if baseline_dataset_path is not None - else _manifest_baseline_dataset(artifact_root, manifest) - ) - - candidate_gate = _candidate_artifact_gate( - manifest_path=manifest_path, - candidate_dataset=candidate_dataset, - ) - compatibility_gate = _compatibility_gate(candidate_dataset, period=period) - column_contract_gate = _column_contract_gate( - candidate_dataset, - baseline_dataset=baseline_dataset, - period=period, - ) - export_support_gate = _export_support_gate( - candidate_dataset, - baseline_dataset=baseline_dataset, - period=period, - ) - export_lineage_gate = _export_lineage_gate( - baseline_dataset=baseline_dataset, - period=period, - ) - artifact_size_gate = _artifact_size_gate( - candidate_dataset, - baseline_dataset=baseline_dataset, - artifact_size_ratio_threshold=artifact_size_ratio_threshold, - ) - benchmark_gate, benchmark_descriptor = _benchmark_manifest_gate( - benchmark_manifest_path - ) - benchmark_evidence = ( - dict(benchmark_descriptor.get("pinned_evidence") or {}) - if isinstance(benchmark_descriptor, dict) - else {} - ) - resolved_ecps_comparison = _resolve_ecps_comparison_payload( - ecps_comparison_payload, - candidate_dataset=candidate_dataset, - baseline_dataset=baseline_dataset, - period=period, - compute_native_scores=compute_native_scores, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ecps_comparison_gate = _ecps_comparison_gate( - resolved_ecps_comparison, - benchmark_evidence=benchmark_evidence, - expected_period=period, - ) - arch_coverage_gate = _arch_target_coverage_gate( - arch_coverage_payload, - expected_period=period, - 
expected_profile=arch_coverage_profile, - ) - runtime_gate = _runtime_gate( - runtime_smoke_payload, - runtime_ratio_threshold=runtime_ratio_threshold, - ) - resolved_source_weight_diagnostics = _resolve_source_weight_diagnostics_payload( - artifact_root, - manifest, - source_weight_diagnostics_payload=source_weight_diagnostics_payload, - source_weight_diagnostics_path=source_weight_diagnostics_path, - ) - source_weight_diagnostics_gate = _source_weight_diagnostics_gate( - resolved_source_weight_diagnostics, - max_support_weight_share=max_support_weight_share, - ) - gates = { - "candidate_artifact": candidate_gate, - "compatibility": compatibility_gate, - "column_contract": column_contract_gate, - "export_support": export_support_gate, - "export_lineage": export_lineage_gate, - "artifact_size": artifact_size_gate, - "runtime": runtime_gate, - "source_weight_diagnostics": source_weight_diagnostics_gate, - "ecps_comparison": ecps_comparison_gate, - "arch_target_coverage": arch_coverage_gate, - "benchmark_manifest": benchmark_gate, - } - required_gates = _required_gate_names( - require_ecps_comparison=require_ecps_comparison, - ) - summary = _summarize_gates(gates, required_gates=required_gates) - return { - "schema_version": 1, - "generated_at": datetime.now(UTC).isoformat(), - "product": manifest.get("product") or "mp-300k", - "gate_set": "artifact_ci", - "artifact_id": artifact_root.name, - "artifact_dir": str(artifact_root.resolve()), - "period": int(period), - "required_gates": required_gates, - "summary": summary, - "manifest": _file_descriptor(manifest_path), - "candidate_dataset": _optional_file_descriptor(candidate_dataset), - "baseline_dataset": ( - _optional_file_descriptor(baseline_dataset) - if baseline_dataset is not None - else None - ), - "gates": gates, - "ecps_comparison_payload": resolved_ecps_comparison, - "arch_coverage": arch_coverage_payload, - "runtime_smoke": runtime_smoke_payload, - "source_weight_diagnostics": 
resolved_source_weight_diagnostics, - "benchmark_manifest": benchmark_descriptor, - } - - -def write_mp300k_artifact_gate_report( - artifact_dir: str | Path, - *, - output_path: str | Path | None = None, - update_manifest: bool = True, - **kwargs: Any, -) -> Path: - """Write ``mp300k_artifact_gates.json`` and reference it in manifest.""" - - artifact_root = Path(artifact_dir).expanduser() - report_path = ( - Path(output_path).expanduser() - if output_path is not None - else artifact_root / "mp300k_artifact_gates.json" - ) - report = build_mp300k_artifact_gate_report(artifact_root, **kwargs) - _write_json_atomically(report_path, report) - if update_manifest: - manifest_path = artifact_root / "manifest.json" - manifest = _load_manifest(manifest_path) - artifacts = dict(manifest.get("artifacts", {})) - artifacts["mp300k_artifact_gates"] = _relative_or_absolute( - report_path, - base_dir=artifact_root, - ) - manifest["artifacts"] = artifacts - manifest["mp300k_artifact_gates"] = { - "status": report["summary"]["status"], - "passing_required_gate_count": report["summary"][ - "passing_required_gate_count" - ], - "failed_required_gate_count": report["summary"][ - "failed_required_gate_count" - ], - "unmeasured_required_gate_count": report["summary"][ - "unmeasured_required_gate_count" - ], - } - _write_json_atomically(manifest_path, manifest) - return report_path - - -def _resolve_candidate_dataset_path( - artifact_root: Path, - manifest: dict[str, Any], - explicit_path: str | Path | None, -) -> Path: - if explicit_path is not None: - return Path(explicit_path).expanduser() - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - if not isinstance(dataset_name, str) or not dataset_name: - raise ValueError( - "manifest.artifacts.policyengine_dataset is required when " - "candidate_dataset_path is not supplied" - ) - dataset_path = Path(dataset_name).expanduser() - if not dataset_path.is_absolute(): - dataset_path = 
artifact_root / dataset_path - return dataset_path - - -def _manifest_baseline_dataset( - artifact_root: Path, manifest: dict[str, Any] -) -> Path | None: - config = dict(manifest.get("config", {})) - value = config.get("policyengine_baseline_dataset") - if value is None: - return None - baseline_path = Path(value).expanduser() - if not baseline_path.is_absolute(): - baseline_path = artifact_root / baseline_path - return baseline_path - - -def _candidate_artifact_gate( - *, - manifest_path: Path, - candidate_dataset: Path, -) -> dict[str, Any]: - missing = [ - str(path) for path in (manifest_path, candidate_dataset) if not path.exists() - ] - if missing: - return _gate( - "fail", - "required candidate artifact files are missing", - details={"missing": missing}, - ) - return _gate( - "pass", - "manifest and candidate H5 exist", - metrics={ - "manifest_size_bytes": manifest_path.stat().st_size, - "candidate_size_bytes": candidate_dataset.stat().st_size, - }, - ) - - -def _artifact_size_gate( - candidate_dataset: Path, - *, - baseline_dataset: Path | None, - artifact_size_ratio_threshold: float, -) -> dict[str, Any]: - threshold = float(artifact_size_ratio_threshold) - if baseline_dataset is None: - return _gate( - "unmeasured", - "baseline H5 has not been attached for artifact-size comparison", - metrics={"artifact_size_ratio_threshold": threshold}, - ) - if not candidate_dataset.exists() or not baseline_dataset.exists(): - missing = [ - str(path) - for path in (candidate_dataset, baseline_dataset) - if not path.exists() - ] - return _gate( - "fail", - "artifact-size comparison files are missing", - details={"missing": missing}, - metrics={"artifact_size_ratio_threshold": threshold}, - ) - candidate_size = candidate_dataset.stat().st_size - baseline_size = baseline_dataset.stat().st_size - if baseline_size <= 0: - return _gate( - "fail", - "baseline H5 size is nonpositive", - metrics={ - "candidate_size_bytes": candidate_size, - "baseline_size_bytes": baseline_size, 
- "artifact_size_ratio_threshold": threshold, - }, - ) - ratio = candidate_size / baseline_size - return _gate( - "pass" if ratio <= threshold else "fail", - ( - "candidate H5 size is inside the artifact-size threshold" - if ratio <= threshold - else "candidate H5 size exceeds the artifact-size threshold" - ), - metrics={ - "candidate_size_bytes": candidate_size, - "baseline_size_bytes": baseline_size, - "artifact_size_ratio": ratio, - "artifact_size_ratio_threshold": threshold, - }, - ) - - -def _column_contract_gate( - candidate_dataset: Path, - *, - baseline_dataset: Path | None, - period: int, -) -> dict[str, Any]: - if not candidate_dataset.exists(): - return _gate( - "fail", - "column-contract comparison file is missing", - details={"missing": [str(candidate_dataset)]}, - ) - - from microplex_us.pipelines.check_export_columns import ( - DEFAULT_CONTRACT_PATH, - compute_column_diff, - compute_spec_variable_manifest_diff, - load_contract, - ) - - contract = load_contract(DEFAULT_CONTRACT_PATH) - required = set(contract["required"]) - forbidden = set(contract["forbidden"]) - optional = set(contract.get("ecps_internal_optional", [])) - excluded = set(contract.get("formula_owned_excluded", [])) - candidate_column_set = _h5_top_level_columns(candidate_dataset) - diff = compute_column_diff( - candidate_column_set, - required=required, - forbidden=forbidden, - optional=optional, - excluded=excluded, - ) - spec_diff = compute_spec_variable_manifest_diff(contract=contract) - satisfied_count = len(required) - len(diff.missing_required) - contract_share = float(satisfied_count / len(required)) if required else None - metrics = { - "period": int(period), - "candidate_column_count": len(candidate_column_set), - "required_contract_column_count": len(required), - "forbidden_contract_column_count": len(forbidden), - "optional_contract_column_count": len(optional), - "excluded_contract_column_count": len(excluded), - "contract_column_count": len(required), - 
"candidate_contract_column_count": satisfied_count, - "missing_contract_column_count": len(diff.missing_required), - "forbidden_present_column_count": len(diff.forbidden_present), - "extra_unknown_column_count": len(diff.extra_unknown), - # Kept for compatibility with existing reports. Unknown columns are - # informational, matching check_export_columns. - "extra_candidate_column_count": len(diff.extra_unknown), - "column_contract_share": contract_share, - "spec_variable_manifest_count": spec_diff.variable_manifest_count, - "spec_required_contract_column_count": spec_diff.required_contract_count, - "spec_declared_imputation_variable_count": spec_diff.declared_imputation_count, - "spec_missing_required_column_count": len(spec_diff.missing_required), - "spec_missing_declared_imputation_count": len( - spec_diff.missing_declared_imputation - ), - "spec_extra_variable_count": len(spec_diff.extra_variables), - } - details = { - "missing_contract_columns": diff.missing_required, - "forbidden_present_columns": diff.forbidden_present, - "extra_unknown_columns": diff.extra_unknown, - "extra_candidate_columns": diff.extra_unknown, - "spec_variable_manifest": { - "spec_path": spec_diff.spec_path, - "missing_required": spec_diff.missing_required, - "missing_declared_imputation": spec_diff.missing_declared_imputation, - "extra_variables": spec_diff.extra_variables, - }, - } - if diff.missing_required or diff.forbidden_present or not spec_diff.ok: - return _gate( - "fail", - "candidate H5 leaf-input column set or spec manifest violates the frozen eCPS contract", - metrics=metrics, - details=details, - ) - return _gate( - "pass", - "candidate H5 leaf-input column set and spec manifest satisfy the frozen eCPS contract", - metrics=metrics, - details=details, - ) - - -def _export_support_gate( - candidate_dataset: Path, - *, - baseline_dataset: Path | None, - period: int, -) -> dict[str, Any]: - if baseline_dataset is None: - return _gate( - "unmeasured", - "pinned eCPS baseline H5 
has not been attached for export-support comparison", - ) - if not candidate_dataset.exists() or not baseline_dataset.exists(): - missing = [ - str(path) - for path in (candidate_dataset, baseline_dataset) - if not path.exists() - ] - return _gate( - "fail", - "export-support comparison files are missing", - details={"missing": missing}, - ) - - from microplex_us.pipelines.check_export_columns import ( - DEFAULT_CONTRACT_PATH, - compute_support_diff, - load_contract, - support_diff_to_dict, - ) - - contract = load_contract(DEFAULT_CONTRACT_PATH) - required_columns = set(contract["required"]) - support_diff = compute_support_diff( - candidate_dataset, - baseline_h5=baseline_dataset, - period=period, - required_columns=required_columns, - ) - metrics = { - "period": int(period), - "checked_export_column_count": len(support_diff.checked_columns), - "ecps_populated_export_column_count": len( - support_diff.baseline_populated_columns - ), - "ecps_filler_export_column_count": len(support_diff.baseline_filler_columns), - "unsupported_populated_export_column_count": len(support_diff.issues), - "required_contract_column_count": len(required_columns), - } - details = support_diff_to_dict(support_diff) - if support_diff.issues: - return _gate( - "fail", - "candidate export columns lack support for eCPS-populated columns", - metrics=metrics, - details=details, - ) - return _gate( - "pass", - "candidate export columns have support for every eCPS-populated export", - metrics=metrics, - details=details, - ) - - -def _export_lineage_gate( - *, - baseline_dataset: Path | None, - period: int, -) -> dict[str, Any]: - if baseline_dataset is None: - return _gate( - "unmeasured", - "pinned eCPS baseline H5 has not been attached for export-lineage comparison", - ) - if not baseline_dataset.exists(): - return _gate( - "fail", - "export-lineage comparison file is missing", - details={"missing": [str(baseline_dataset)]}, - ) - - from microplex_us.pipelines.export_lineage_manifest import ( - 
build_export_lineage_manifest, - ) - - manifest = build_export_lineage_manifest( - support_baseline=baseline_dataset, - period=period, - ) - summary = dict(manifest["summary"]) - columns = list(manifest["columns"]) - default_only_columns = [ - column["column"] - for column in columns - if column["export_path_status"] == "default_only" - ] - details = { - "issues": manifest["issues"], - "default_only_columns": default_only_columns, - } - if manifest["issues"]: - return _gate( - "fail", - "required eCPS-populated exports lack source/code lineage", - metrics=summary, - details=details, - ) - return _gate( - "pass", - "every eCPS-populated required export has source/code lineage", - metrics=summary, - details=details, - ) - - -def _h5_top_level_columns(candidate_dataset: Path) -> set[str]: - """Return base column names at the top level of an exported H5. - - Accepts both shapes a column can take: a group ``/`` - (the eCPS export layout) or a flat dataset ````. Names are - collapsed to the base column via ``split("/")[0]`` so the two are - comparable. Shared by the fast column-parity CLI - (``check_export_columns``) so it reads columns the same way the - artifact path does. 
- """ - with h5py.File(candidate_dataset, "r") as handle: - return {name.split("/")[0] for name in handle.keys()} - - -def _compatibility_gate(candidate_dataset: Path, *, period: int) -> dict[str, Any]: - try: - inspection = _inspect_h5_contract(candidate_dataset, period=period) - if inspection["failures"]: - return _gate( - "fail", - "candidate H5 violates the PolicyEngine table contract", - metrics=inspection["metrics"], - details=inspection["details"], - ) - from microplex_us.policyengine.us import load_policyengine_us_entity_tables - - tables = load_policyengine_us_entity_tables( - candidate_dataset, - period=period, - variables=(), - ) - household_weight_sum = float(tables.households["household_weight"].sum()) - if household_weight_sum <= 0: - return _gate( - "fail", - "candidate H5 has nonpositive household weight sum", - metrics={"household_weight_sum": household_weight_sum}, - ) - return _gate( - "pass", - "candidate H5 satisfies the structural PolicyEngine table contract", - metrics={ - **inspection["metrics"], - "household_count": int(len(tables.households)), - "person_count": int(len(tables.persons)) - if tables.persons is not None - else 0, - "household_weight_sum": household_weight_sum, - }, - ) - except Exception as exc: # noqa: BLE001 - this is a gate report boundary. 
- return _gate( - "fail", - "candidate H5 failed the structural PolicyEngine table contract", - details={"error": str(exc)}, - ) - - -def _inspect_h5_contract( - candidate_dataset: Path, *, period: int -) -> dict[str, dict[str, Any] | list[str]]: - period_key = str(int(period)) - failures: list[str] = [] - details: dict[str, Any] = {} - metrics: dict[str, Any] = {"period": int(period)} - with h5py.File(candidate_dataset, "r") as handle: - period_groups = sorted( - name for name, value in handle.items() if isinstance(value, h5py.Group) - ) - metrics["variable_count"] = len(period_groups) - metrics["required_structural_array_count"] = len(_REQUIRED_PERIOD_ARRAYS) - missing = [ - variable - for variable in _REQUIRED_PERIOD_ARRAYS - if variable not in handle or period_key not in handle[variable] - ] - if missing: - failures.append("missing_required_period_arrays") - details["missing_arrays"] = missing - - variables_missing_period = [ - variable for variable in period_groups if period_key not in handle[variable] - ] - if variables_missing_period: - failures.append("variables_missing_requested_period") - details["variables_missing_period"] = variables_missing_period - - forbidden_variables = [ - variable - for variable in period_groups - if _is_forbidden_source_diagnostic_variable(variable) - ] - if forbidden_variables: - failures.append("source_diagnostic_variables_exported") - details["forbidden_source_diagnostic_variables"] = forbidden_variables - - arrays = { - variable: np.asarray(handle[variable][period_key]) - for variable in _REQUIRED_PERIOD_ARRAYS - if variable in handle and period_key in handle[variable] - } - _inspect_structural_dtypes(arrays, failures=failures, details=details) - _inspect_structural_lengths(arrays, failures=failures, details=details) - _inspect_entity_ids(arrays, failures=failures, details=details) - _inspect_person_links(arrays, failures=failures, details=details) - _inspect_household_weights(arrays, failures=failures, details=details) - 
nonfinite = _nonfinite_numeric_period_arrays( - handle, - period_key=period_key, - ) - if nonfinite: - failures.append("nonfinite_numeric_period_arrays") - details["nonfinite_numeric_arrays"] = nonfinite - metrics["nonfinite_numeric_array_count"] = len(nonfinite) - else: - metrics["nonfinite_numeric_array_count"] = 0 - - metrics.update(_structural_count_metrics(arrays)) - return {"failures": failures, "details": details, "metrics": metrics} - - -def _is_forbidden_source_diagnostic_variable(variable: str) -> bool: - return ( - variable in _FORBIDDEN_SOURCE_DIAGNOSTIC_VARIABLES - or variable.startswith(_FORBIDDEN_SOURCE_DIAGNOSTIC_PREFIXES) - or variable.endswith(_FORBIDDEN_SOURCE_DIAGNOSTIC_SUFFIXES) - ) - - -def _inspect_structural_dtypes( - arrays: dict[str, np.ndarray], - *, - failures: list[str], - details: dict[str, Any], -) -> None: - invalid_id_dtypes = { - variable: str(values.dtype) - for variable, values in arrays.items() - if variable != "household_weight" and not _is_valid_id_dtype(values.dtype) - } - if invalid_id_dtypes: - failures.append("invalid_structural_id_dtypes") - details["invalid_structural_id_dtypes"] = invalid_id_dtypes - - household_weight = arrays.get("household_weight") - if household_weight is not None and not _is_valid_weight_dtype( - household_weight.dtype - ): - failures.append("invalid_household_weight_dtype") - details["invalid_household_weight_dtype"] = str(household_weight.dtype) - - -def _is_valid_id_dtype(dtype: np.dtype[Any]) -> bool: - return np.issubdtype(dtype, np.integer) or dtype.kind in {"S", "U", "O"} - - -def _is_valid_weight_dtype(dtype: np.dtype[Any]) -> bool: - return np.issubdtype(dtype, np.number) and dtype.kind != "b" - - -def _inspect_structural_lengths( - arrays: dict[str, np.ndarray], - *, - failures: list[str], - details: dict[str, Any], -) -> None: - household_count = _array_length(arrays.get("household_id")) - person_count = _array_length(arrays.get("person_id")) - length_mismatches: dict[str, dict[str, int 
| None]] = {} - if household_count is not None: - _record_length_mismatch( - length_mismatches, - "household_weight", - actual=_array_length(arrays.get("household_weight")), - expected=household_count, - ) - if person_count is not None: - for variable in _PERSON_LINK_ARRAYS.values(): - _record_length_mismatch( - length_mismatches, - variable, - actual=_array_length(arrays.get(variable)), - expected=person_count, - ) - if length_mismatches: - failures.append("structural_array_length_mismatches") - details["structural_array_length_mismatches"] = length_mismatches - - -def _array_length(values: np.ndarray | None) -> int | None: - if values is None: - return None - return int(values.shape[0]) if values.ndim else 1 - - -def _record_length_mismatch( - length_mismatches: dict[str, dict[str, int | None]], - variable: str, - *, - actual: int | None, - expected: int, -) -> None: - if actual != expected: - length_mismatches[variable] = { - "actual": actual, - "expected": expected, - } - - -def _inspect_entity_ids( - arrays: dict[str, np.ndarray], - *, - failures: list[str], - details: dict[str, Any], -) -> None: - duplicate_ids: dict[str, int] = {} - empty_entities: list[str] = [] - for variable in _ENTITY_ID_ARRAYS.values(): - values = arrays.get(variable) - if values is None: - continue - if _array_length(values) == 0: - empty_entities.append(variable) - continue - unique_count = len(np.unique(values)) - if unique_count != len(values): - duplicate_ids[variable] = int(len(values) - unique_count) - if empty_entities: - failures.append("empty_entity_id_arrays") - details["empty_entity_id_arrays"] = empty_entities - if duplicate_ids: - failures.append("duplicate_entity_ids") - details["duplicate_entity_ids"] = duplicate_ids - - -def _inspect_person_links( - arrays: dict[str, np.ndarray], - *, - failures: list[str], - details: dict[str, Any], -) -> None: - invalid_links: dict[str, list[Any]] = {} - for entity, link_variable in _PERSON_LINK_ARRAYS.items(): - id_variable = 
_ENTITY_ID_ARRAYS[entity] - link_values = arrays.get(link_variable) - id_values = arrays.get(id_variable) - if link_values is None or id_values is None: - continue - missing_values = link_values[~np.isin(link_values, id_values)] - if missing_values.size: - invalid_links[link_variable] = _jsonable_sample(missing_values) - if invalid_links: - failures.append("invalid_person_entity_links") - details["invalid_person_entity_links"] = invalid_links - - -def _inspect_household_weights( - arrays: dict[str, np.ndarray], - *, - failures: list[str], - details: dict[str, Any], -) -> None: - household_weight = arrays.get("household_weight") - if household_weight is None: - return - values = np.asarray(household_weight, dtype=np.float64) - if not np.isfinite(values).all(): - failures.append("nonfinite_household_weights") - details["nonfinite_household_weight_count"] = int( - np.size(values) - np.isfinite(values).sum() - ) - negative_count = int((values < 0).sum()) - if negative_count: - failures.append("negative_household_weights") - details["negative_household_weight_count"] = negative_count - - -def _nonfinite_numeric_period_arrays( - handle: h5py.File, - *, - period_key: str, -) -> dict[str, int]: - nonfinite: dict[str, int] = {} - for variable, group in handle.items(): - if not isinstance(group, h5py.Group) or period_key not in group: - continue - dataset = group[period_key] - if not np.issubdtype(dataset.dtype, np.floating): - continue - values = np.asarray(dataset) - finite = np.isfinite(values) - if not finite.all(): - nonfinite[variable] = int(np.size(values) - finite.sum()) - return nonfinite - - -def _structural_count_metrics(arrays: dict[str, np.ndarray]) -> dict[str, int]: - metrics: dict[str, int] = {} - for entity, variable in _ENTITY_ID_ARRAYS.items(): - count = _array_length(arrays.get(variable)) - if count is not None: - metrics[f"{entity}_count"] = count - return metrics - - -def _jsonable_sample(values: np.ndarray, *, limit: int = 5) -> list[Any]: - sample = 
np.unique(values)[:limit].tolist() - result: list[Any] = [] - for value in sample: - if hasattr(value, "item"): - value = value.item() - if isinstance(value, bytes): - value = value.decode("utf-8", errors="replace") - result.append(value) - return result - - -def _resolve_ecps_comparison_payload( - ecps_comparison_payload: Any, - *, - candidate_dataset: Path, - baseline_dataset: Path | None, - period: int, - compute_native_scores: bool, - policyengine_us_data_repo: str | Path | None, - policyengine_us_data_python: str | Path | None, -) -> Any: - if ecps_comparison_payload is not None: - return ecps_comparison_payload - if not compute_native_scores or baseline_dataset is None: - return None - from microplex_us.pipelines.pe_native_scores import compute_us_pe_native_scores - - return compute_us_pe_native_scores( - candidate_dataset_path=candidate_dataset, - baseline_dataset_path=baseline_dataset, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - - -def _ecps_comparison_gate( - ecps_comparison_payload: dict[str, Any] | None, - *, - benchmark_evidence: dict[str, Any] | None = None, - expected_period: int, -) -> dict[str, Any]: - if ecps_comparison_payload is None: - return _gate( - "unmeasured", - "PE-native eCPS comparison has not been attached", - ) - summary = _ecps_comparison_summary(ecps_comparison_payload) - candidate_loss = summary.get("candidate_enhanced_cps_native_loss") - baseline_loss = summary.get("baseline_enhanced_cps_native_loss") - loss_delta = summary.get("enhanced_cps_native_loss_delta") - reported_candidate_beats = summary.get("candidate_beats_baseline") - details: dict[str, Any] = {} - if candidate_loss is not None and baseline_loss is not None: - computed_loss_delta = float(candidate_loss) - float(baseline_loss) - if ( - loss_delta is not None - and abs(float(loss_delta) - computed_loss_delta) > 1e-12 - ): - details["reported_loss_delta"] = loss_delta - 
details["computed_loss_delta"] = computed_loss_delta - loss_delta = computed_loss_delta - candidate_beats = None - if loss_delta is not None: - candidate_beats = float(loss_delta) < 0.0 - if ( - reported_candidate_beats is not None - and candidate_beats is not None - and bool(reported_candidate_beats) != candidate_beats - ): - details["reported_candidate_beats_baseline"] = reported_candidate_beats - details["computed_candidate_beats_baseline"] = candidate_beats - contract = _ecps_comparison_contract_summary( - ecps_comparison_payload, - summary, - benchmark_evidence=benchmark_evidence, - expected_period=expected_period, - ) - details.update(contract["details"]) - missing_requirements = list(contract["missing_requirements"]) - status: GateStatus - if candidate_beats is None: - status = "unmeasured" - elif missing_requirements: - status = "fail" - else: - status = "pass" if bool(candidate_beats) else "fail" - return _gate( - status, - ( - "candidate beats pinned eCPS under the release comparison contract" - if status == "pass" - else ( - ( - "eCPS comparison is missing release-contract evidence: " - + ", ".join(missing_requirements) - ) - if missing_requirements - else "candidate does not beat pinned eCPS on PE-native broad loss" - if status == "fail" - else "PE-native eCPS comparison payload is incomplete" - ) - ), - metrics={ - "candidate_enhanced_cps_native_loss": candidate_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": loss_delta, - "n_targets_kept": summary.get("n_targets_kept"), - "matched_household_count": contract["matched_household_count"], - "holdout_target_fraction": contract["holdout_target_fraction"], - "candidate_holdout_loss": contract["candidate_holdout_loss"], - "baseline_holdout_loss": contract["baseline_holdout_loss"], - "candidate_unweighted_msre": contract["candidate_unweighted_msre"], - "baseline_unweighted_msre": contract["baseline_unweighted_msre"], - }, - details=details, - ) - - -def 
_ecps_comparison_summary(payload: Any) -> dict[str, Any]: - if isinstance(payload, list): - for item in payload: - summary = _ecps_comparison_summary(item) - if summary: - return summary - return {} - if not isinstance(payload, dict): - return {} - for key in ("summary", "broad_loss", "loss_summary"): - value = payload.get(key) - if isinstance(value, dict): - return dict(value) - if "candidate_enhanced_cps_native_loss" in payload: - return dict(payload) - if ( - "best_variant_loss" in payload - and "baseline_enhanced_cps_native_loss" in payload - ): - best_variant_loss = payload.get("best_variant_loss") - baseline_loss = payload.get("baseline_enhanced_cps_native_loss") - loss_delta = None - candidate_beats = None - if best_variant_loss is not None and baseline_loss is not None: - loss_delta = float(best_variant_loss) - float(baseline_loss) - candidate_beats = loss_delta < 0.0 - return { - "candidate_enhanced_cps_native_loss": best_variant_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": loss_delta, - "candidate_beats_baseline": candidate_beats, - "best_variant_label": payload.get("best_variant_label"), - } - return {} - - -def _ecps_comparison_contract_summary( - payload: Any, - summary: dict[str, Any], - *, - benchmark_evidence: dict[str, Any] | None = None, - expected_period: int, -) -> dict[str, Any]: - candidate_households = _first_nested_present( - payload, - summary, - "candidate_household_count", - "candidate_households", - "candidate_n_households", - "candidate_record_count", - ) - baseline_households = _first_nested_present( - payload, - summary, - "baseline_household_count", - "baseline_households", - "baseline_n_households", - "baseline_record_count", - ) - matched_household_count = None - if candidate_households is not None and baseline_households is not None: - matched_household_count = int(candidate_households) == int(baseline_households) - else: - matched_household_count = _first_nested_present( - 
payload, - summary, - "matched_household_count", - "matched_n", - "matched_record_count", - ) - if matched_household_count is not None: - matched_household_count = bool(matched_household_count) - - symmetric_refit = _first_nested_present( - payload, - summary, - "symmetric_refit", - "symmetric_reweight", - "refit_both", - ) - candidate_refit_config = _first_nested_present( - payload, - summary, - "candidate_refit_config", - "candidate_fit_config", - ) - baseline_refit_config = _first_nested_present( - payload, - summary, - "baseline_refit_config", - "baseline_fit_config", - ) - if symmetric_refit is None and ( - isinstance(candidate_refit_config, dict) - and isinstance(baseline_refit_config, dict) - ): - symmetric_refit = candidate_refit_config == baseline_refit_config - if symmetric_refit is not None: - symmetric_refit = bool(symmetric_refit) - - score_candidate_only = _first_nested_present( - payload, - summary, - "score_candidate_only", - ) - if score_candidate_only is not None and bool(score_candidate_only): - symmetric_refit = False - - objective_identity = _first_nested_present( - payload, - summary, - "refit_objective_matches_scoring", - "objective_identity_recovery_passed", - ) - if objective_identity is not None: - objective_identity = bool(objective_identity) - - ecps_refit_recovery = _first_nested_present( - payload, - summary, - "ecps_refit_recovery_passed", - "baseline_refit_recovery_passed", - ) - if ecps_refit_recovery is not None: - ecps_refit_recovery = bool(ecps_refit_recovery) - ecps_refit_effective = _first_nested_present( - payload, - summary, - "ecps_refit_effective_passed", - "baseline_refit_effective_passed", - ) - if ecps_refit_effective is not None: - ecps_refit_effective = bool(ecps_refit_effective) - - holdout_target_fraction = _first_nested_present( - payload, - summary, - "holdout_target_fraction", - ) - holdout_targets = _first_nested_present( - payload, - summary, - "holdout_targets", - "n_holdout_targets", - ) - has_holdout_targets 
= False - if holdout_target_fraction is not None: - has_holdout_targets = float(holdout_target_fraction) > 0.0 - elif holdout_targets is not None: - has_holdout_targets = int(holdout_targets) > 0 - - candidate_holdout_loss = _first_nested_present( - payload, - summary, - "candidate_holdout_loss", - ) - baseline_holdout_loss = _first_nested_present( - payload, - summary, - "baseline_holdout_loss", - ) - holdout_loss_beats_baseline = _loss_strictly_beats( - candidate_holdout_loss, baseline_holdout_loss - ) - - candidate_unweighted_msre = _first_nested_present( - payload, - summary, - "candidate_unweighted_msre", - "candidate_msre", - "candidate_mean_unweighted_msre", - ) - baseline_unweighted_msre = _first_nested_present( - payload, - summary, - "baseline_unweighted_msre", - "baseline_msre", - "baseline_mean_unweighted_msre", - ) - unweighted_msre_beats_baseline = _loss_strictly_beats( - candidate_unweighted_msre, baseline_unweighted_msre - ) - - protected_summary = _protected_family_floor_summary(payload, summary) - core_benchmark_summary = _core_benchmark_family_floor_summary(payload, summary) - frozen_baseline_summary = _frozen_baseline_certificate_summary( - payload, - summary, - benchmark_evidence=benchmark_evidence, - expected_period=expected_period, - ) - - requirements = { - "matched_household_count": matched_household_count is True, - "symmetric_refit": symmetric_refit is True, - "refit_objective_matches_scoring": objective_identity is True, - "ecps_refit_effective": ecps_refit_effective is True, - "frozen_ecps_baseline_certificate": frozen_baseline_summary["passed"] is True, - "holdout_target_split": has_holdout_targets, - "holdout_loss_beats_baseline": holdout_loss_beats_baseline is True, - "unweighted_msre_beats_baseline": (unweighted_msre_beats_baseline is True), - "protected_family_floors": protected_summary["passed"] is True, - "core_benchmark_family_floors": core_benchmark_summary["passed"] is True, - } - return { - "matched_household_count": 
matched_household_count, - "holdout_target_fraction": holdout_target_fraction, - "candidate_holdout_loss": candidate_holdout_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "candidate_unweighted_msre": candidate_unweighted_msre, - "baseline_unweighted_msre": baseline_unweighted_msre, - "missing_requirements": [ - key for key, passed in requirements.items() if not passed - ], - "details": { - "candidate_household_count": candidate_households, - "baseline_household_count": baseline_households, - "symmetric_refit": symmetric_refit, - "score_candidate_only": score_candidate_only, - "refit_objective_matches_scoring": objective_identity, - "ecps_refit_recovery_passed": ecps_refit_recovery, - "ecps_refit_effective_passed": ecps_refit_effective, - "frozen_ecps_baseline_certificate": frozen_baseline_summary, - "holdout_targets": holdout_targets, - "candidate_holdout_loss": candidate_holdout_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "holdout_loss_beats_baseline": holdout_loss_beats_baseline, - "candidate_unweighted_msre": candidate_unweighted_msre, - "baseline_unweighted_msre": baseline_unweighted_msre, - "unweighted_msre_beats_baseline": unweighted_msre_beats_baseline, - "protected_family_floor": protected_summary, - "core_benchmark_family_floor": core_benchmark_summary, - }, - } - - -def _frozen_baseline_certificate_summary( - payload: Any, - summary: dict[str, Any], - *, - benchmark_evidence: dict[str, Any] | None, - expected_period: int, -) -> dict[str, Any]: - certificate = _find_frozen_baseline_certificate(payload) - if not isinstance(certificate, dict): - return { - "passed": False, - "missing_evidence": ["frozen_ecps_baseline_certificate"], - "mismatches": [], - } - - missing: list[str] = [] - mismatches: list[dict[str, Any]] = [] - schema_version = certificate.get("schema_version") - if schema_version != 1: - mismatches.append( - { - "field": "schema_version", - "expected": 1, - "actual": schema_version, - } - ) - certificate_type = 
certificate.get("certificate_type") - if certificate_type != "frozen_production_ecps_baseline": - mismatches.append( - { - "field": "certificate_type", - "expected": "frozen_production_ecps_baseline", - "actual": certificate_type, - } - ) - certificate_period = certificate.get("period") - try: - certificate_period_int = int(certificate_period) - except (TypeError, ValueError): - certificate_period_int = None - if certificate_period_int != int(expected_period): - mismatches.append( - { - "field": "period", - "expected": int(expected_period), - "actual": certificate_period, - } - ) - - evidence_values = { - "certificate_type": certificate_type, - "period": certificate_period, - "baseline_dataset.sha256": _first_nested_path_value( - certificate, - ( - ("baseline_dataset", "sha256"), - ("enhanced_cps", "sha256"), - ("baseline_dataset_sha256",), - ("enhanced_cps_sha256",), - ), - ), - "target_db.sha256": _first_nested_path_value( - certificate, - ( - ("target_db", "sha256"), - ("targets_db", "sha256"), - ("policyengine_targets_db", "sha256"), - ("target_db_sha256",), - ("policyengine_targets_db_sha256",), - ), - ), - "policyengine_us_data.commit": _first_nested_path_value( - certificate, - ( - ("policyengine_us_data", "commit"), - ("policyengine_us_data", "commit_sha"), - ("policyengine_us_data_commit",), - ("policyengine_us_data_commit_sha",), - ), - ), - "policyengine_us.version": _first_nested_path_value( - certificate, - ( - ("policyengine_us", "version"), - ("policyengine_us_version",), - ), - ), - "target_surface.target_profile": _first_nested_path_value( - certificate, - ( - ("target_surface", "target_profile"), - ("target_profile",), - ), - ), - "target_surface.target_scope": _first_nested_path_value( - certificate, - ( - ("target_surface", "target_scope"), - ("target_scope",), - ("target_scope_filter",), - ), - ), - "scoring_config.sha256": _first_nested_path_value( - certificate, - ( - ("scoring_config", "sha256"), - ("scoring_config_sha256",), - ), - ), - 
"target_surface.target_names_sha256": _first_nested_path_value( - certificate, - ( - ("target_surface", "target_names_sha256"), - ("target_names_sha256",), - ), - ), - "target_surface.target_count": _first_nested_path_value( - certificate, - ( - ("target_surface", "target_count"), - ("target_count",), - ), - ), - "baseline_metrics.baseline_enhanced_cps_native_loss": ( - _certificate_metric( - certificate, - "baseline_enhanced_cps_native_loss", - ) - ), - "baseline_metrics.baseline_holdout_loss": ( - _certificate_metric(certificate, "baseline_holdout_loss") - ), - "baseline_metrics.baseline_unweighted_msre": ( - _certificate_metric(certificate, "baseline_unweighted_msre") - ), - } - for evidence_name, value in evidence_values.items(): - if not _valid_certificate_evidence_value(evidence_name, value): - missing.append(evidence_name) - - for metric_name in ( - "baseline_enhanced_cps_native_loss", - "baseline_holdout_loss", - "baseline_unweighted_msre", - ): - summary_value = summary.get(metric_name) - certificate_value = _certificate_metric(certificate, metric_name) - if summary_value is None or certificate_value is None: - continue - if not _float_equal(summary_value, certificate_value): - mismatches.append( - { - "field": f"baseline_metrics.{metric_name}", - "summary_value": summary_value, - "certificate_value": certificate_value, - } - ) - - for evidence_name in ( - "certificate_type", - "period", - "baseline_dataset.sha256", - "target_db.sha256", - "policyengine_us_data.commit", - "policyengine_us.version", - "target_surface.target_profile", - "target_surface.target_scope", - "target_surface.target_count", - "target_surface.target_names_sha256", - "scoring_config.sha256", - "baseline_metrics.baseline_enhanced_cps_native_loss", - "baseline_metrics.baseline_holdout_loss", - "baseline_metrics.baseline_unweighted_msre", - ): - expected_value = (benchmark_evidence or {}).get(evidence_name) - certificate_value = evidence_values.get(evidence_name) - if expected_value is 
None or certificate_value is None: - continue - if str(expected_value) != str(certificate_value): - mismatches.append( - { - "field": evidence_name, - "benchmark_manifest_value": expected_value, - "certificate_value": certificate_value, - } - ) - - mismatches.extend(frozen_production_pin_mismatches(evidence_values)) - - return { - "passed": not missing and not mismatches, - "certificate_type": certificate.get("certificate_type"), - "period": certificate.get("period"), - "missing_evidence": missing, - "mismatches": mismatches, - "baseline_dataset_sha256": evidence_values.get("baseline_dataset.sha256"), - "target_db_sha256": evidence_values.get("target_db.sha256"), - "policyengine_us_data_commit": evidence_values.get( - "policyengine_us_data.commit" - ), - "policyengine_us_version": evidence_values.get("policyengine_us.version"), - "scoring_config_sha256": evidence_values.get("scoring_config.sha256"), - "target_profile": evidence_values.get("target_surface.target_profile"), - "target_scope": evidence_values.get("target_surface.target_scope"), - "target_names_sha256": evidence_values.get( - "target_surface.target_names_sha256" - ), - "target_count": evidence_values.get("target_surface.target_count"), - } - - -def _find_frozen_baseline_certificate(payload: Any) -> Any: - if not isinstance(payload, dict): - return None - for key in ( - "frozen_ecps_baseline_certificate", - "baseline_certificate", - "certified_baseline", - ): - value = payload.get(key) - if isinstance(value, dict): - return value - metadata = payload.get("metadata") - if isinstance(metadata, dict): - value = metadata.get("frozen_ecps_baseline_certificate") - if isinstance(value, dict): - return value - return None - - -def _certificate_metric(certificate: dict[str, Any], metric_name: str) -> Any: - return _first_nested_path_value( - certificate, - ( - ("baseline_metrics", metric_name), - (metric_name,), - ), - ) - - -def _valid_certificate_evidence_value(name: str, value: Any) -> bool: - if name in 
{"period", "target_surface.target_count"}: - try: - return int(value) > 0 - except (TypeError, ValueError): - return False - if name == "certificate_type": - return value == "frozen_production_ecps_baseline" - if name.endswith(".version") or name in { - "target_surface.target_profile", - "target_surface.target_scope", - }: - return isinstance(value, str) and bool(value) - if name.endswith(".sha256"): - return ( - isinstance(value, str) - and len(value) == 64 - and bool(_HEX_RE.fullmatch(value)) - ) - if name.endswith(".commit"): - return ( - isinstance(value, str) - and 7 <= len(value) <= 40 - and bool(_HEX_RE.fullmatch(value)) - ) - if name.startswith("baseline_metrics."): - try: - return np.isfinite(float(value)) - except (TypeError, ValueError): - return False - return value is not None - - -def _float_equal(left: Any, right: Any, *, tolerance: float = 1e-12) -> bool: - return abs(float(left) - float(right)) <= tolerance - - -def _loss_strictly_beats(candidate: Any, baseline: Any) -> bool | None: - if candidate is None or baseline is None: - return None - try: - candidate_value = float(candidate) - baseline_value = float(baseline) - except (TypeError, ValueError): - return None - if not np.isfinite(candidate_value) or not np.isfinite(baseline_value): - return None - return candidate_value < baseline_value - - -def _first_nested_present( - payload: Any, - summary: dict[str, Any], - *keys: str, -) -> Any: - candidates: list[dict[str, Any]] = [summary] - if isinstance(payload, dict): - candidates.append(payload) - for nested_key in ( - "comparison_contract", - "ecps_comparison_contract", - "release_contract", - "validation", - "metadata", - ): - nested = payload.get(nested_key) - if isinstance(nested, dict): - candidates.append(nested) - for candidate in candidates: - value = _first_present(candidate, *keys) - if value is not None: - return value - return None - - -def _protected_family_floor_summary( - payload: Any, - summary: dict[str, Any], -) -> dict[str, Any]: 
- return _family_floor_summary( - payload, - summary, - families=_PROTECTED_ECPS_TARGET_FAMILIES, - explicit_pass_keys=( - "protected_family_floors_passed", - "protected_family_floor_passed", - ), - row_keys=( - "protected_family_losses", - "protected_family_floor_results", - "family_loss_comparison", - "family_breakdown", - ), - ) - - -def _core_benchmark_family_floor_summary( - payload: Any, - summary: dict[str, Any], -) -> dict[str, Any]: - return _family_floor_summary( - payload, - summary, - families=_CORE_BENCHMARK_ECPS_TARGET_FAMILIES, - explicit_pass_keys=( - "core_benchmark_family_floors_passed", - "core_benchmark_family_floor_passed", - ), - row_keys=( - "core_benchmark_family_losses", - "core_benchmark_family_floor_results", - "family_loss_comparison", - "family_breakdown", - ), - ) - - -def _family_floor_summary( - payload: Any, - summary: dict[str, Any], - *, - families: tuple[str, ...], - explicit_pass_keys: tuple[str, ...], - row_keys: tuple[str, ...], -) -> dict[str, Any]: - explicit = _first_nested_present( - payload, - summary, - *explicit_pass_keys, - ) - family_rows = _first_nested_present(payload, summary, *row_keys) - if family_rows is None and isinstance(payload, dict): - score_payload = payload.get("score") - if isinstance(score_payload, dict): - family_rows = _first_present(score_payload, *row_keys) - if family_rows is None: - score_summary = score_payload.get("summary") - if isinstance(score_summary, dict): - family_rows = _first_present(score_summary, *row_keys) - if family_rows is None: - return { - "passed": None, - "reported_passed": explicit, - "missing_families": list(families), - "regressions": [], - } - - rows_by_family = _family_loss_rows_by_name(family_rows) - missing: list[str] = [] - regressions: list[dict[str, Any]] = [] - for family in families: - row = rows_by_family.get(family) - matched_family = family - if row is None: - for alias in _FAMILY_FLOOR_ALIASES.get(family, ()): - row = rows_by_family.get(alias) - if row is not 
None: - matched_family = alias - break - if row is None: - missing.append(family) - continue - candidate_loss = _first_present( - row, - "candidate_loss", - "candidate_family_loss", - "candidate_loss_contribution", - ) - baseline_loss = _first_present( - row, - "baseline_loss", - "baseline_family_loss", - "baseline_loss_contribution", - ) - if candidate_loss is None or baseline_loss is None: - missing.append(family) - continue - delta = float(candidate_loss) - float(baseline_loss) - tolerance = max( - _PROTECTED_FAMILY_ABSOLUTE_LOSS_TOLERANCE, - _PROTECTED_FAMILY_RELATIVE_LOSS_TOLERANCE * abs(float(baseline_loss)), - ) - if delta > tolerance: - regression = { - "family": family, - "candidate_loss": float(candidate_loss), - "baseline_loss": float(baseline_loss), - "loss_delta": delta, - "allowed_delta": tolerance, - } - if matched_family != family: - regression["matched_family"] = matched_family - regressions.append(regression) - passed = not missing and not regressions - if explicit is not None: - passed = passed and bool(explicit) - return { - "passed": passed, - "missing_families": sorted(set(missing)), - "regressions": regressions, - "relative_tolerance": _PROTECTED_FAMILY_RELATIVE_LOSS_TOLERANCE, - "absolute_tolerance": _PROTECTED_FAMILY_ABSOLUTE_LOSS_TOLERANCE, - } - - -def _family_loss_rows_by_name(family_rows: Any) -> dict[str, dict[str, Any]]: - if isinstance(family_rows, dict): - return { - str(family): dict(row) - for family, row in family_rows.items() - if isinstance(row, dict) - } - if isinstance(family_rows, list): - rows: dict[str, dict[str, Any]] = {} - for row in family_rows: - if not isinstance(row, dict): - continue - family = _first_present(row, "family", "target_family", "name") - if family is not None: - rows[str(family)] = dict(row) - return rows - return {} - - -def _runtime_gate( - runtime_smoke_payload: dict[str, Any] | None, - *, - runtime_ratio_threshold: float, -) -> dict[str, Any]: - if runtime_smoke_payload is None: - return 
_gate("unmeasured", "runtime smoke benchmark has not been attached") - payload = dict(runtime_smoke_payload) - threshold = float(runtime_ratio_threshold) - ratio = _first_present(payload, "runtime_ratio", "median_runtime_ratio") - candidate_seconds = _first_present(payload, "candidate_seconds") - baseline_seconds = _first_present(payload, "baseline_seconds") - candidate_payload = payload.get("candidate") - baseline_payload = payload.get("baseline") - if isinstance(candidate_payload, dict): - if candidate_seconds is None: - candidate_seconds = _first_present( - candidate_payload, - "median_elapsed_seconds", - "elapsed_seconds", - ) - if isinstance(baseline_payload, dict): - if baseline_seconds is None: - baseline_seconds = _first_present( - baseline_payload, - "median_elapsed_seconds", - "elapsed_seconds", - ) - if ratio is None and candidate_seconds is not None and baseline_seconds: - ratio = float(candidate_seconds) / float(baseline_seconds) - passes = _first_present( - payload, - "passes_runtime_gate", - "passes_runtime_ratio_1_25x", - ) - details: dict[str, Any] = {} - reported_threshold = payload.get("runtime_ratio_threshold") - reported_threshold_matches = False - try: - reported_threshold_matches = float(reported_threshold) == threshold - except (TypeError, ValueError): - pass - if reported_threshold is not None and not reported_threshold_matches: - details["reported_runtime_ratio_threshold"] = reported_threshold - details["enforced_runtime_ratio_threshold"] = threshold - if ratio is None: - return _gate( - "unmeasured", - "runtime smoke payload is missing ratio or candidate/baseline seconds", - metrics={ - "candidate_seconds": candidate_seconds, - "baseline_seconds": baseline_seconds, - "runtime_ratio": ratio, - "runtime_ratio_threshold": threshold, - }, - ) - derived_passes = float(ratio) <= threshold - if passes is not None and bool(passes) != derived_passes: - details["reported_passes_runtime_gate"] = passes - details["computed_passes_runtime_gate"] = 
derived_passes - return _gate( - "pass" if derived_passes else "fail", - ( - "candidate runtime is inside the smoke benchmark threshold" - if derived_passes - else "candidate runtime exceeds the smoke benchmark threshold" - ), - metrics={ - "candidate_seconds": candidate_seconds, - "baseline_seconds": baseline_seconds, - "runtime_ratio": ratio, - "runtime_ratio_threshold": threshold, - }, - details=details, - ) - - -def _resolve_source_weight_diagnostics_payload( - artifact_root: Path, - manifest: dict[str, Any], - *, - source_weight_diagnostics_payload: dict[str, Any] | None, - source_weight_diagnostics_path: str | Path | None, -) -> dict[str, Any] | None: - if source_weight_diagnostics_payload is not None: - return source_weight_diagnostics_payload - path = _resolve_optional_manifest_artifact_path( - artifact_root, - manifest, - source_weight_diagnostics_path, - "source_weight_diagnostics", - "source_diagnostics", - ) - if path is None: - return None - return _load_json_file(path) - - -def _resolve_optional_manifest_artifact_path( - artifact_root: Path, - manifest: dict[str, Any], - explicit_path: str | Path | None, - *artifact_keys: str, -) -> Path | None: - if explicit_path is not None: - return Path(explicit_path).expanduser() - artifacts = dict(manifest.get("artifacts", {})) - for key in artifact_keys: - value = artifacts.get(key) - if isinstance(value, str) and value: - path = Path(value).expanduser() - return path if path.is_absolute() else artifact_root / path - return None - - -def _source_weight_diagnostics_gate( - payload: dict[str, Any] | None, - *, - max_support_weight_share: float, -) -> dict[str, Any]: - threshold = float(max_support_weight_share) - if payload is None: - return _gate( - "unmeasured", - "source-weight diagnostics sidecar has not been attached", - metrics={"max_support_weight_share": threshold}, - ) - if not isinstance(payload, dict): - return _gate( - "fail", - "source-weight diagnostics sidecar must be a JSON object", - 
metrics={"max_support_weight_share": threshold}, - ) - - entries = _source_weight_entries(payload) - summary = payload.get("summary") if isinstance(payload.get("summary"), dict) else {} - support_share = _first_present( - payload, - "support_household_weight_share", - "support_weight_share", - "puf_support_household_weight_share", - "puf_clone_household_weight_share", - "clone_household_weight_share", - ) - puf_support_share = _first_present( - payload, - "puf_support_household_weight_share", - "puf_clone_household_weight_share", - ) - max_source_share = _first_present( - payload, - "max_source_household_weight_share", - "largest_source_household_weight_share", - ) - if isinstance(summary, dict): - support_share = ( - support_share - if support_share is not None - else _first_present( - summary, - "support_household_weight_share", - "support_weight_share", - "puf_support_household_weight_share", - "puf_clone_household_weight_share", - "clone_household_weight_share", - ) - ) - puf_support_share = ( - puf_support_share - if puf_support_share is not None - else _first_present( - summary, - "puf_support_household_weight_share", - "puf_clone_household_weight_share", - ) - ) - max_source_share = ( - max_source_share - if max_source_share is not None - else _first_present( - summary, - "max_source_household_weight_share", - "largest_source_household_weight_share", - ) - ) - - if entries: - entry_shares = [_entry_household_weight_share(entry) for entry in entries] - numeric_entry_shares = [ - share for share in entry_shares if share is not None and share >= 0 - ] - if max_source_share is None and numeric_entry_shares: - max_source_share = max(numeric_entry_shares) - if support_share is None: - support_share = sum( - share - for entry, share in zip(entries, entry_shares, strict=True) - if share is not None and _is_support_source_entry(entry) - ) - if puf_support_share is None: - puf_support_share = sum( - share - for entry, share in zip(entries, entry_shares, strict=True) - 
if share is not None - and _is_support_source_entry(entry) - and _is_puf_source_entry(entry) - ) - - support_share_float = _optional_float(support_share) - puf_support_share_float = _optional_float(puf_support_share) - max_source_share_float = _optional_float(max_source_share) - failures: list[str] = [] - if not entries and support_share_float is None and puf_support_share_float is None: - failures.append("source_breakdown") - if support_share_float is not None and support_share_float > threshold: - failures.append("support_household_weight_share") - if puf_support_share_float is not None and puf_support_share_float > threshold: - failures.append("puf_support_household_weight_share") - - status: GateStatus = "pass" if not failures else "fail" - return _gate( - status, - ( - "source/support weight diagnostics are within dominance thresholds" - if status == "pass" - else "source/support weight diagnostics are missing or exceed dominance thresholds" - ), - metrics={ - "source_entry_count": len(entries), - "support_household_weight_share": support_share_float, - "puf_support_household_weight_share": puf_support_share_float, - "max_source_household_weight_share": max_source_share_float, - "max_support_weight_share": threshold, - }, - details={"failures": failures} if failures else None, - ) - - -def _source_weight_entries(payload: dict[str, Any]) -> list[dict[str, Any]]: - for key in ("sources", "source_classes", "source_weight_shares"): - value = payload.get(key) - if isinstance(value, list): - return [item for item in value if isinstance(item, dict)] - if isinstance(value, dict): - entries: list[dict[str, Any]] = [] - for name, item in value.items(): - if isinstance(item, dict): - entries.append({"source_name": name, **item}) - return entries - return [] - - -def _entry_household_weight_share(entry: dict[str, Any]) -> float | None: - return _optional_float( - _first_present( - entry, - "household_weight_share", - "weight_share", - "share", - ) - ) - - -def 
_is_support_source_entry(entry: dict[str, Any]) -> bool: - source_class = str( - _first_present(entry, "source_class", "class", "kind", "category") or "" - ).lower() - source_name = str(_first_present(entry, "source_name", "name") or "").lower() - if "fixed" in source_class or "forbes" in source_name: - return False - support_tokens = ("support", "clone", "donor_replay") - return any(token in source_class for token in support_tokens) or any( - token in source_name for token in support_tokens - ) - - -def _is_puf_source_entry(entry: dict[str, Any]) -> bool: - source_name = str(_first_present(entry, "source_name", "name") or "").lower() - source_class = str( - _first_present(entry, "source_class", "class", "kind", "category") or "" - ).lower() - return "puf" in source_name or "irs_soi" in source_name or "puf" in source_class - - -def _optional_float(value: Any) -> float | None: - if value is None: - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def _arch_target_coverage_gate( - arch_coverage_payload: dict[str, Any] | None, - *, - expected_period: int, - expected_profile: str, -) -> dict[str, Any]: - if arch_coverage_payload is None: - return _gate( - "unmeasured", - "Arch target coverage report has not been attached", - metrics={ - "expected_profile": expected_profile, - "expected_period": int(expected_period), - }, - ) - if not isinstance(arch_coverage_payload, dict): - return _gate( - "fail", - "Arch target coverage report must be a JSON object", - metrics={ - "expected_profile": expected_profile, - "expected_period": int(expected_period), - }, - ) - payload = dict(arch_coverage_payload) - profile_name = payload.get("profile_name") - period = payload.get("period") - target_cell_count = payload.get("target_cell_count") - covered_cell_count = payload.get("covered_cell_count") - uncovered_cell_count = payload.get("uncovered_cell_count") - coverage_rate = payload.get("coverage_rate") - failures: list[str] = [] - if 
profile_name != expected_profile: - failures.append("profile_name") - if period is None or int(period) != int(expected_period): - failures.append("period") - if target_cell_count is None or int(target_cell_count) <= 0: - failures.append("target_cell_count") - if uncovered_cell_count is None or int(uncovered_cell_count) != 0: - failures.append("uncovered_cell_count") - if ( - covered_cell_count is not None - and target_cell_count is not None - and int(covered_cell_count) != int(target_cell_count) - ): - failures.append("covered_cell_count") - if coverage_rate is None or float(coverage_rate) < 1.0: - failures.append("coverage_rate") - status: GateStatus = "pass" if not failures else "fail" - return _gate( - status, - ( - "Arch source-backed target coverage is complete" - if status == "pass" - else "Arch source-backed target coverage is incomplete or mismatched" - ), - metrics={ - "profile_name": profile_name, - "expected_profile": expected_profile, - "period": period, - "expected_period": int(expected_period), - "target_cell_count": target_cell_count, - "covered_cell_count": covered_cell_count, - "uncovered_cell_count": uncovered_cell_count, - "coverage_rate": coverage_rate, - }, - details={"failures": failures} if failures else None, - ) - - -def _first_present(payload: dict[str, Any], *keys: str) -> Any: - for key in keys: - if key in payload: - return payload[key] - return None - - -def _benchmark_manifest_gate( - benchmark_manifest_path: str | Path | None, -) -> tuple[dict[str, Any], dict[str, Any] | None]: - if benchmark_manifest_path is None: - descriptor = frozen_production_ecps_benchmark_manifest_descriptor() - payload = load_frozen_production_ecps_benchmark_manifest() - else: - manifest_path = Path(benchmark_manifest_path).expanduser() - if not manifest_path.exists(): - return ( - _gate( - "fail", - "frozen microsimulation benchmark manifest path does not exist", - details={"path": str(manifest_path)}, - ), - None, - ) - descriptor = 
_file_descriptor(manifest_path) - try: - payload = json.loads(manifest_path.read_text()) - except json.JSONDecodeError as exc: - return ( - _gate( - "fail", - "frozen microsimulation benchmark manifest is not valid JSON", - details={"path": str(manifest_path), "error": str(exc)}, - ), - descriptor, - ) - evidence = _benchmark_manifest_evidence(payload) - descriptor = { - **descriptor, - "pinned_evidence": evidence["present"], - "missing_evidence": evidence["missing"], - } - if evidence["missing"]: - return ( - _gate( - "fail", - "frozen microsimulation benchmark manifest is missing pinned evidence", - metrics={ - "required_evidence_count": len( - _REQUIRED_BENCHMARK_MANIFEST_EVIDENCE - ), - "present_evidence_count": len(evidence["present"]), - }, - details={ - **descriptor, - "missing_evidence": evidence["missing"], - "present_evidence": evidence["present"], - }, - ), - descriptor, - ) - pin_mismatches = frozen_production_pin_mismatches(evidence["present"]) - if pin_mismatches: - return ( - _gate( - "fail", - ( - "frozen microsimulation benchmark manifest does not use " - "the release-pinned production eCPS baseline and all-target " - "surface" - ), - metrics={ - "required_evidence_count": len( - _REQUIRED_BENCHMARK_MANIFEST_EVIDENCE - ), - "present_evidence_count": len(evidence["present"]), - "production_pin_mismatch_count": len(pin_mismatches), - }, - details={ - **descriptor, - "present_evidence": evidence["present"], - "production_pin_mismatches": pin_mismatches, - }, - ), - { - **descriptor, - "production_pin_mismatches": pin_mismatches, - }, - ) - return ( - _gate( - "pass", - "frozen microsimulation benchmark manifest pins baseline, target, and package evidence", - metrics={ - "required_evidence_count": len(_REQUIRED_BENCHMARK_MANIFEST_EVIDENCE), - "present_evidence_count": len(evidence["present"]), - }, - details={**descriptor, "present_evidence": evidence["present"]}, - ), - descriptor, - ) - - -def _benchmark_manifest_evidence(payload: Any) -> dict[str, 
Any]: - if not isinstance(payload, dict): - return { - "present": {}, - "missing": list(_REQUIRED_BENCHMARK_MANIFEST_EVIDENCE), - } - present: dict[str, Any] = {} - missing: list[str] = [] - for evidence_name, paths in _REQUIRED_BENCHMARK_MANIFEST_EVIDENCE.items(): - value = _first_nested_path_value(payload, paths) - if not _valid_benchmark_evidence_value(evidence_name, value): - missing.append(evidence_name) - continue - present[evidence_name] = value - if _first_nested_path_value(payload, (("policyengine_us_data", "dirty"),)) is True: - missing.append("policyengine_us_data.clean") - return {"present": present, "missing": missing} - - -def _first_nested_path_value( - payload: dict[str, Any], - paths: tuple[tuple[str, ...], ...], -) -> Any: - for path in paths: - current: Any = payload - for part in path: - if not isinstance(current, dict) or part not in current: - current = None - break - current = current[part] - if current is not None: - return current - return None - - -def _valid_benchmark_evidence_value(name: str, value: Any) -> bool: - if name in {"period", "target_surface.target_count"}: - try: - return int(value) > 0 - except (TypeError, ValueError): - return False - if name == "certificate_type": - return value == "frozen_production_ecps_baseline" - if name.startswith("baseline_metrics."): - try: - return np.isfinite(float(value)) - except (TypeError, ValueError): - return False - if not isinstance(value, str) or not value: - return False - if name.endswith(".sha256"): - return len(value) == 64 and bool(_HEX_RE.fullmatch(value)) - if name.endswith(".commit"): - return 7 <= len(value) <= 40 and bool(_HEX_RE.fullmatch(value)) - return True - - -def _required_gate_names(*, require_ecps_comparison: bool) -> list[str]: - required = list(_DEFAULT_REQUIRED_GATES) - if not require_ecps_comparison: - required.remove("ecps_comparison") - return required - - -def _summarize_gates( - gates: dict[str, dict[str, Any]], *, required_gates: list[str] -) -> dict[str, Any]: 
- statuses = {name: gate["status"] for name, gate in gates.items()} - required = set(required_gates) - failed_required = [ - name - for name in required_gates - if statuses.get(name) == "fail" or statuses.get(name) is None - ] - unmeasured_required = [ - name - for name in required_gates - if statuses.get(name) == "unmeasured" and name not in failed_required - ] - passing_required = [name for name in required_gates if statuses.get(name) == "pass"] - failed_optional = [ - name - for name, status in statuses.items() - if name not in required and status == "fail" - ] - unmeasured_optional = [ - name - for name, status in statuses.items() - if name not in required and status == "unmeasured" - ] - if failed_required: - overall_status = "failed" - elif not unmeasured_required: - overall_status = "passed" - else: - overall_status = "incomplete" - return { - "status": overall_status, - "passing_required_gates": passing_required, - "failed_required_gates": failed_required, - "unmeasured_required_gates": unmeasured_required, - "failed_optional_gates": failed_optional, - "unmeasured_optional_gates": unmeasured_optional, - "passing_required_gate_count": len(passing_required), - "failed_required_gate_count": len(failed_required), - "unmeasured_required_gate_count": len(unmeasured_required), - } - - -def _gate( - status: GateStatus, - summary: str, - *, - metrics: dict[str, Any] | None = None, - details: dict[str, Any] | None = None, -) -> dict[str, Any]: - payload: dict[str, Any] = {"status": status, "summary": summary} - if metrics: - payload["metrics"] = metrics - if details: - payload["details"] = details - return payload - - -def _load_manifest(path: Path) -> dict[str, Any]: - if not path.exists(): - raise FileNotFoundError(f"manifest not found: {path}") - return json.loads(path.read_text()) - - -def _load_json_file(path: str | Path | None) -> dict[str, Any] | None: - if path is None: - return None - return json.loads(Path(path).expanduser().read_text()) - - -def 
_file_descriptor(path: Path) -> dict[str, Any]: - return { - "path": str(path.resolve()), - "size_bytes": path.stat().st_size, - "sha256": _sha256_file(path), - } - - -def _optional_file_descriptor(path: Path) -> dict[str, Any]: - if path.exists(): - return _file_descriptor(path) - return {"path": str(path.resolve()), "exists": False} - - -def _sha256_file(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def _relative_or_absolute(path: Path, *, base_dir: Path) -> str: - try: - return str(path.resolve().relative_to(base_dir.resolve())) - except ValueError: - return str(path.resolve()) - - -def _write_json_atomically(path: Path, payload: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - temp_path = path.with_name(f".{path.name}.tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temp_path.replace(path) - - -def main(argv: list[str] | None = None) -> int: - """CLI entry point for enforcing mp-300k artifact gates.""" - - parser = argparse.ArgumentParser( - description="Run persistent mp-300k artifact gates against an artifact bundle." 
- ) - parser.add_argument("--artifact-dir", required=True) - parser.add_argument("--candidate-dataset") - parser.add_argument("--baseline-dataset") - parser.add_argument( - "--ecps-comparison-json", - "--native-scores-json", - dest="ecps_comparison_json", - ) - parser.add_argument("--runtime-smoke-json") - parser.add_argument("--source-weight-diagnostics-json") - parser.add_argument("--arch-coverage-json") - parser.add_argument("--benchmark-manifest") - parser.add_argument("--output-json") - parser.add_argument("--target-period", type=int, default=2024) - parser.add_argument( - "--arch-coverage-profile", - default=_DEFAULT_ARCH_COVERAGE_PROFILE, - ) - parser.add_argument("--artifact-size-ratio-threshold", type=float, default=2.0) - parser.add_argument("--runtime-ratio-threshold", type=float, default=1.25) - parser.add_argument( - "--max-support-weight-share", - type=float, - default=_DEFAULT_MAX_SUPPORT_WEIGHT_SHARE, - ) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument( - "--skip-ecps-computation", - "--skip-native-score", - dest="skip_ecps_computation", - action="store_true", - help=( - "Do not compute PE-native scores when --ecps-comparison-json is absent. " - "The eCPS comparison gate will remain unmeasured." - ), - ) - parser.add_argument( - "--no-require-ecps-comparison", - action="store_true", - help=( - "Keep reporting the eCPS comparison gate, but do not make it block " - "the overall status. This is the intended deprecation path once eCPS " - "is no longer the comparator." 
- ), - ) - parser.add_argument( - "--allow-incomplete", - action="store_true", - help="Return exit code 0 when required gates are unmeasured but not failed.", - ) - parser.add_argument( - "--no-update-manifest", - action="store_true", - help="Write the gate report without adding it to manifest.json.", - ) - args = parser.parse_args(argv) - - report_path = write_mp300k_artifact_gate_report( - args.artifact_dir, - output_path=args.output_json, - update_manifest=not args.no_update_manifest, - candidate_dataset_path=args.candidate_dataset, - baseline_dataset_path=args.baseline_dataset, - ecps_comparison_payload=_load_json_file(args.ecps_comparison_json), - arch_coverage_payload=_load_json_file(args.arch_coverage_json), - runtime_smoke_payload=_load_json_file(args.runtime_smoke_json), - source_weight_diagnostics_payload=_load_json_file( - args.source_weight_diagnostics_json - ), - benchmark_manifest_path=args.benchmark_manifest, - period=args.target_period, - arch_coverage_profile=args.arch_coverage_profile, - artifact_size_ratio_threshold=args.artifact_size_ratio_threshold, - runtime_ratio_threshold=args.runtime_ratio_threshold, - max_support_weight_share=args.max_support_weight_share, - compute_native_scores=not args.skip_ecps_computation, - require_ecps_comparison=not args.no_require_ecps_comparison, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - ) - print(report_path) - report = json.loads(report_path.read_text()) - status = report["summary"]["status"] - if status == "passed" or (status == "incomplete" and args.allow_incomplete): - return 0 - return 1 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/mp300k_gate_inputs.py b/src/microplex_us/pipelines/mp300k_gate_inputs.py deleted file mode 100644 index ee35e4e9..00000000 --- a/src/microplex_us/pipelines/mp300k_gate_inputs.py +++ /dev/null @@ -1,380 +0,0 @@ -"""Package mp-300k artifact-gate 
inputs for CI handoff.""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import shutil -import tarfile -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - - -def package_mp300k_gate_inputs( - artifact_dir: str | Path, - output_dir: str | Path, - *, - candidate_dataset_path: str | Path | None = None, - baseline_dataset_path: str | Path | None = None, - ecps_comparison_path: str | Path | None = None, - arch_coverage_path: str | Path | None = None, - runtime_smoke_path: str | Path | None = None, - benchmark_manifest_path: str | Path | None = None, - archive_name: str = "artifact.tar.gz", -) -> dict[str, Any]: - """Package an artifact archive plus gate evidence for GitHub Actions. - - The output directory is intended to be uploaded as a single Actions artifact - and consumed by ``mp300k-artifact-gates.yml`` through ``gate_inputs_artifact``. - """ - - artifact_root = Path(artifact_dir).expanduser() - output_root = Path(output_dir).expanduser() - manifest_path = artifact_root / "manifest.json" - manifest = _load_manifest(manifest_path) - candidate_dataset = _resolve_candidate_dataset_path( - artifact_root, - manifest, - candidate_dataset_path, - ) - if not candidate_dataset.exists(): - raise FileNotFoundError(f"candidate dataset not found: {candidate_dataset}") - baseline_dataset = _resolve_baseline_dataset_path( - artifact_root, - manifest, - baseline_dataset_path, - ) - - output_root.mkdir(parents=True, exist_ok=True) - archive_path = output_root / archive_name - stage_parent = output_root / ".staging" - if stage_parent.exists(): - shutil.rmtree(stage_parent) - stage_root = stage_parent / artifact_root.name - stage_root.mkdir(parents=True) - - candidate_relpath = _candidate_archive_relpath( - manifest, - candidate_dataset=candidate_dataset, - explicit_candidate_path=candidate_dataset_path, - ) - staged_candidate = stage_root / candidate_relpath - staged_candidate.parent.mkdir(parents=True, 
exist_ok=True) - shutil.copy2(candidate_dataset, staged_candidate) - baseline_relpath = None - if baseline_dataset is not None: - if not baseline_dataset.exists(): - raise FileNotFoundError(f"baseline dataset not found: {baseline_dataset}") - baseline_relpath = _baseline_archive_relpath( - manifest, - baseline_dataset=baseline_dataset, - explicit_baseline_path=baseline_dataset_path, - ) - staged_baseline = stage_root / baseline_relpath - staged_baseline.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(baseline_dataset, staged_baseline) - source_weight_diagnostics = _resolve_manifest_artifact_path( - artifact_root, - manifest, - "source_weight_diagnostics", - ) - source_weight_diagnostics_relpath = None - if source_weight_diagnostics is not None: - if not source_weight_diagnostics.exists(): - raise FileNotFoundError( - "source weight diagnostics not found: " - f"{source_weight_diagnostics}" - ) - source_weight_diagnostics_relpath = _manifest_artifact_archive_relpath( - manifest, - artifact_key="source_weight_diagnostics", - fallback=Path("source_weight_diagnostics.json"), - ) - staged_source_weight_diagnostics = ( - stage_root / source_weight_diagnostics_relpath - ) - staged_source_weight_diagnostics.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(source_weight_diagnostics, staged_source_weight_diagnostics) - - staged_manifest = _manifest_for_archive( - manifest, - source_artifact_dir=artifact_root, - source_candidate_dataset=candidate_dataset, - candidate_relpath=candidate_relpath, - source_baseline_dataset=baseline_dataset, - baseline_relpath=baseline_relpath, - source_weight_diagnostics=source_weight_diagnostics, - source_weight_diagnostics_relpath=source_weight_diagnostics_relpath, - ) - _write_json(stage_root / "manifest.json", staged_manifest) - _write_archive(archive_path, stage_root) - - evidence = { - "ecps_comparison": _copy_optional_evidence( - ecps_comparison_path, - output_root / "ecps_comparison.json", - ), - "arch_coverage": 
_copy_optional_evidence( - arch_coverage_path, - output_root / "arch_coverage.json", - ), - "runtime_smoke": _copy_optional_evidence( - runtime_smoke_path, - output_root / "runtime_smoke.json", - ), - "benchmark_manifest": _copy_optional_evidence( - benchmark_manifest_path, - output_root / "benchmark_manifest.json", - ), - } - metadata = { - "schema_version": 1, - "generated_at": datetime.now(UTC).isoformat(), - "source_artifact_dir": str(artifact_root.resolve()), - "source_manifest": _file_descriptor(manifest_path), - "source_candidate_dataset": _file_descriptor(candidate_dataset), - "source_baseline_dataset": ( - _file_descriptor(baseline_dataset) if baseline_dataset is not None else None - ), - "source_weight_diagnostics": ( - _file_descriptor(source_weight_diagnostics) - if source_weight_diagnostics is not None - else None - ), - "artifact_archive": _file_descriptor(archive_path), - "evidence": evidence, - "workflow_call": { - "uses": "./.github/workflows/mp300k-artifact-gates.yml", - "with": {"gate_inputs_artifact": output_root.name}, - }, - } - _write_json(output_root / "gate_inputs.json", metadata) - shutil.rmtree(stage_parent) - return metadata - - -def _load_manifest(path: Path) -> dict[str, Any]: - if not path.exists(): - raise FileNotFoundError(f"manifest not found: {path}") - return json.loads(path.read_text()) - - -def _resolve_candidate_dataset_path( - artifact_root: Path, - manifest: dict[str, Any], - explicit_path: str | Path | None, -) -> Path: - if explicit_path is not None: - return Path(explicit_path).expanduser() - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - if not isinstance(dataset_name, str) or not dataset_name: - raise ValueError( - "manifest.artifacts.policyengine_dataset is required when " - "candidate_dataset_path is not supplied" - ) - dataset_path = Path(dataset_name).expanduser() - if not dataset_path.is_absolute(): - dataset_path = artifact_root / dataset_path - return 
dataset_path - - -def _resolve_baseline_dataset_path( - artifact_root: Path, - manifest: dict[str, Any], - explicit_path: str | Path | None, -) -> Path | None: - if explicit_path is not None: - return Path(explicit_path).expanduser() - value = dict(manifest.get("config", {})).get("policyengine_baseline_dataset") - if value is None: - return None - if not isinstance(value, str) or not value: - raise ValueError("config.policyengine_baseline_dataset must be a path string") - baseline_path = Path(value).expanduser() - if not baseline_path.is_absolute(): - baseline_path = artifact_root / baseline_path - return baseline_path - - -def _safe_archive_relpath(candidate: Path, *, fallback: Path) -> Path: - if candidate.is_absolute() or ".." in candidate.parts: - return fallback - normalized_parts = [part for part in candidate.parts if part not in ("", ".")] - if not normalized_parts: - return fallback - return Path(*normalized_parts) - - -def _candidate_archive_relpath( - manifest: dict[str, Any], - *, - candidate_dataset: Path, - explicit_candidate_path: str | Path | None, -) -> Path: - if explicit_candidate_path is not None: - return Path(candidate_dataset.name) - dataset_name = dict(manifest.get("artifacts", {})).get("policyengine_dataset") - if isinstance(dataset_name, str) and dataset_name: - return _safe_archive_relpath( - Path(dataset_name), - fallback=Path(candidate_dataset.name), - ) - return Path(candidate_dataset.name) - - -def _baseline_archive_relpath( - manifest: dict[str, Any], - *, - baseline_dataset: Path, - explicit_baseline_path: str | Path | None, -) -> Path: - if explicit_baseline_path is not None: - return Path("baseline") / baseline_dataset.name - value = dict(manifest.get("config", {})).get("policyengine_baseline_dataset") - if isinstance(value, str) and value: - return _safe_archive_relpath( - Path(value), - fallback=Path("baseline") / baseline_dataset.name, - ) - return Path("baseline") / baseline_dataset.name - - -def 
_resolve_manifest_artifact_path( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, -) -> Path | None: - value = dict(manifest.get("artifacts", {})).get(artifact_key) - if value is None: - return None - if not isinstance(value, str) or not value: - raise ValueError(f"manifest.artifacts.{artifact_key} must be a path string") - path = Path(value).expanduser() - return path if path.is_absolute() else artifact_root / path - - -def _manifest_artifact_archive_relpath( - manifest: dict[str, Any], - *, - artifact_key: str, - fallback: Path, -) -> Path: - value = dict(manifest.get("artifacts", {})).get(artifact_key) - if isinstance(value, str) and value: - return _safe_archive_relpath(Path(value), fallback=fallback) - return fallback - - -def _manifest_for_archive( - manifest: dict[str, Any], - *, - source_artifact_dir: Path, - source_candidate_dataset: Path, - candidate_relpath: Path, - source_baseline_dataset: Path | None, - baseline_relpath: Path | None, - source_weight_diagnostics: Path | None, - source_weight_diagnostics_relpath: Path | None, -) -> dict[str, Any]: - updated = dict(manifest) - artifacts = dict(updated.get("artifacts", {})) - artifacts["policyengine_dataset"] = str(candidate_relpath) - if source_weight_diagnostics_relpath is not None: - artifacts["source_weight_diagnostics"] = str(source_weight_diagnostics_relpath) - updated["artifacts"] = artifacts - config = dict(updated.get("config", {})) - if baseline_relpath is not None: - config["policyengine_baseline_dataset"] = str(baseline_relpath) - updated["config"] = config - updated["mp300k_gate_inputs"] = { - "packaged_at": datetime.now(UTC).isoformat(), - "source_artifact_dir": str(source_artifact_dir.resolve()), - "source_candidate_dataset": str(source_candidate_dataset.resolve()), - "source_baseline_dataset": ( - str(source_baseline_dataset.resolve()) - if source_baseline_dataset is not None - else None - ), - "source_weight_diagnostics": ( - str(source_weight_diagnostics.resolve()) 
- if source_weight_diagnostics is not None - else None - ), - } - return updated - - -def _copy_optional_evidence( - source_path: str | Path | None, - destination_path: Path, -) -> dict[str, Any] | None: - if source_path is None: - return None - source = Path(source_path).expanduser() - if not source.exists(): - raise FileNotFoundError(f"evidence file not found: {source}") - shutil.copy2(source, destination_path) - return _file_descriptor(destination_path) - - -def _write_archive(archive_path: Path, stage_root: Path) -> None: - with tarfile.open(archive_path, "w:gz") as archive: - archive.add(stage_root, arcname=stage_root.name, recursive=True) - - -def _write_json(path: Path, payload: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - - -def _file_descriptor(path: Path) -> dict[str, Any]: - return { - "path": str(path.resolve()), - "size_bytes": path.stat().st_size, - "sha256": _sha256_file(path), - } - - -def _sha256_file(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Package mp-300k artifact-gate inputs for CI." 
- ) - parser.add_argument("--artifact-dir", required=True) - parser.add_argument("--output-dir", required=True) - parser.add_argument("--candidate-dataset") - parser.add_argument("--baseline-dataset") - parser.add_argument("--ecps-comparison-json") - parser.add_argument("--arch-coverage-json") - parser.add_argument("--runtime-smoke-json") - parser.add_argument("--benchmark-manifest") - parser.add_argument("--archive-name", default="artifact.tar.gz") - args = parser.parse_args(argv) - - package_mp300k_gate_inputs( - args.artifact_dir, - args.output_dir, - candidate_dataset_path=args.candidate_dataset, - baseline_dataset_path=args.baseline_dataset, - ecps_comparison_path=args.ecps_comparison_json, - arch_coverage_path=args.arch_coverage_json, - runtime_smoke_path=args.runtime_smoke_json, - benchmark_manifest_path=args.benchmark_manifest, - archive_name=args.archive_name, - ) - print(Path(args.output_dir).expanduser() / "gate_inputs.json") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/mp_benchmark_manifest.py b/src/microplex_us/pipelines/mp_benchmark_manifest.py deleted file mode 100644 index c99dc1e6..00000000 --- a/src/microplex_us/pipelines/mp_benchmark_manifest.py +++ /dev/null @@ -1,379 +0,0 @@ -"""Pinned benchmark manifests for Microplex replacement artifacts.""" - -from __future__ import annotations - -import argparse -import hashlib -import importlib.metadata -import importlib.resources -import json -import subprocess -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -_DEFAULT_PE_US_DATA_REPO = Path.home() / "PolicyEngine" / "policyengine-us-data" -FROZEN_PRODUCTION_ECPS_CERTIFICATE_TYPE = "frozen_production_ecps_baseline" -FROZEN_PRODUCTION_ECPS_PERIOD = 2024 -FROZEN_PRODUCTION_ECPS_BASELINE_SHA256 = ( - "7af7026224f84cb6a91743fd8fa7ac506bad8c78e011fa58b6901894db4b4290" -) -FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256 = ( - 
"5d14671156c36cd7fff680d5c4d77ec7fb2026ea866b1e12378d9e9c9fb803dc" -) -FROZEN_PRODUCTION_ECPS_TARGET_PROFILE = "pe_native_broad" -FROZEN_PRODUCTION_ECPS_TARGET_SCOPE = "all" -FROZEN_PRODUCTION_ECPS_TARGET_COUNT = 3701 -FROZEN_PRODUCTION_ECPS_TARGET_NAMES_SHA256 = ( - "a49a85a021ef65d5cd5b26d6d605c726ea5ca191ec98d9b5d9cc8b7d5665c25f" -) -FROZEN_PRODUCTION_ECPS_SCORING_CONFIG_SHA256 = ( - "3e67b0ca1f869e4c68f7eba513517b7d4c8dd9aaa195b98c51c100fe65dbabde" -) -FROZEN_PRODUCTION_ECPS_BASELINE_ENHANCED_CPS_NATIVE_LOSS = ( - 0.0558541199034061 -) -FROZEN_PRODUCTION_ECPS_BASELINE_HOLDOUT_LOSS = 0.01266396784689227 -FROZEN_PRODUCTION_ECPS_BASELINE_UNWEIGHTED_MSRE = 3.4642345028776615 -FROZEN_PRODUCTION_ECPS_RESOURCE_NAME = ( - "frozen_production_ecps_2024_benchmark_manifest.json" -) -FROZEN_PRODUCTION_ECPS_REQUIRED_EVIDENCE = { - "certificate_type": FROZEN_PRODUCTION_ECPS_CERTIFICATE_TYPE, - "period": FROZEN_PRODUCTION_ECPS_PERIOD, - "baseline_dataset.sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - "target_db.sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - "target_surface.target_profile": FROZEN_PRODUCTION_ECPS_TARGET_PROFILE, - "target_surface.target_scope": FROZEN_PRODUCTION_ECPS_TARGET_SCOPE, -} - - -def load_frozen_production_ecps_benchmark_manifest() -> dict[str, Any]: - """Load the source-controlled 2024 production eCPS benchmark manifest.""" - - payload = json.loads(_frozen_production_ecps_resource_bytes().decode()) - _assert_manifest_uses_frozen_production_pins(payload) - return payload - - -def frozen_production_ecps_benchmark_manifest_descriptor() -> dict[str, Any]: - """Return file-like evidence for the packaged production eCPS manifest.""" - - payload = _frozen_production_ecps_resource_bytes() - return { - "path": ( - "package:microplex_us.pipelines/" - f"{FROZEN_PRODUCTION_ECPS_RESOURCE_NAME}" - ), - "size_bytes": len(payload), - "sha256": hashlib.sha256(payload).hexdigest(), - "packaged_default": True, - } - - -def 
_frozen_production_ecps_resource_bytes() -> bytes: - return ( - importlib.resources.files(__package__) - .joinpath(FROZEN_PRODUCTION_ECPS_RESOURCE_NAME) - .read_bytes() - ) - - -def build_mp_benchmark_manifest( - *, - baseline_dataset_path: str | Path, - target_db_path: str | Path, - period: int = 2024, - target_profile: str = "pe_native_broad", - target_scope: str = "all", - target_count: int, - target_names_sha256: str, - scoring_config_sha256: str, - baseline_enhanced_cps_native_loss: float | None = None, - baseline_holdout_loss: float | None = None, - baseline_unweighted_msre: float | None = None, - certificate_type: str = "frozen_production_ecps_baseline", - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_commit: str | None = None, - policyengine_us_version: str | None = None, - allow_dirty_policyengine_us_data: bool = False, - enforce_production_pins: bool = True, -) -> dict[str, Any]: - """Build the frozen comparison manifest required by MP release gates.""" - - baseline_dataset = _file_descriptor(Path(baseline_dataset_path).expanduser()) - target_db = _file_descriptor(Path(target_db_path).expanduser()) - repo_path = ( - Path(policyengine_us_data_repo).expanduser() - if policyengine_us_data_repo is not None - else _DEFAULT_PE_US_DATA_REPO - ) - repo_descriptor = _policyengine_us_data_descriptor( - repo_path, - explicit_commit=policyengine_us_data_commit, - allow_dirty=allow_dirty_policyengine_us_data, - ) - version = policyengine_us_version or _installed_policyengine_us_version() - baseline_metrics = _baseline_metrics_descriptor( - certificate_type=certificate_type, - baseline_enhanced_cps_native_loss=baseline_enhanced_cps_native_loss, - baseline_holdout_loss=baseline_holdout_loss, - baseline_unweighted_msre=baseline_unweighted_msre, - ) - manifest = { - "schema_version": 1, - "certificate_type": str(certificate_type), - "generated_at": datetime.now(UTC).isoformat(), - "period": int(period), - "target_profile": 
str(target_profile), - "target_scope": str(target_scope), - "target_surface": { - "target_profile": str(target_profile), - "target_scope": str(target_scope), - "target_count": int(target_count), - "target_names_sha256": str(target_names_sha256), - }, - "scoring_config": {"sha256": str(scoring_config_sha256)}, - "baseline_metrics": baseline_metrics, - "baseline_dataset": baseline_dataset, - "policyengine_us_data": repo_descriptor, - "policyengine_us": {"version": version}, - "target_db": target_db, - } - if enforce_production_pins: - _assert_manifest_uses_frozen_production_pins(manifest) - return manifest - - -def _baseline_metrics_descriptor( - *, - certificate_type: str, - baseline_enhanced_cps_native_loss: float | None, - baseline_holdout_loss: float | None, - baseline_unweighted_msre: float | None, -) -> dict[str, float]: - metric_values = { - "baseline_enhanced_cps_native_loss": baseline_enhanced_cps_native_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "baseline_unweighted_msre": baseline_unweighted_msre, - } - missing = [ - name - for name, value in metric_values.items() - if value is None - ] - if missing and certificate_type == FROZEN_PRODUCTION_ECPS_CERTIFICATE_TYPE: - raise ValueError( - "frozen production eCPS benchmark manifests must pin baseline " - "metrics: " + ", ".join(missing) - ) - return { - name: float(value) - for name, value in metric_values.items() - if value is not None - } - - -def write_mp_benchmark_manifest( - output_path: str | Path, - **kwargs: Any, -) -> Path: - """Write a pinned benchmark manifest JSON file.""" - - path = Path(output_path).expanduser() - payload = build_mp_benchmark_manifest(**kwargs) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return path - - -def _policyengine_us_data_descriptor( - repo_path: Path, - *, - explicit_commit: str | None, - allow_dirty: bool, -) -> dict[str, Any]: - repo = repo_path.resolve() - commit = explicit_commit or 
_git_output(repo, "rev-parse", "HEAD") - dirty = None if explicit_commit is not None else _git_dirty(repo) - if dirty and not allow_dirty: - raise ValueError( - "policyengine-us-data repo has uncommitted changes; commit or pass " - "--allow-dirty-policyengine-us-data to make the dirty state explicit" - ) - descriptor: dict[str, Any] = { - "repo": str(repo), - "commit": commit, - } - if dirty is not None: - descriptor["dirty"] = dirty - return descriptor - - -def _installed_policyengine_us_version() -> str: - try: - return importlib.metadata.version("policyengine-us") - except importlib.metadata.PackageNotFoundError as exc: - raise ValueError( - "policyengine-us is not installed; pass --policyengine-us-version" - ) from exc - - -def _file_descriptor(path: Path) -> dict[str, Any]: - resolved = path.resolve() - if not resolved.exists(): - raise FileNotFoundError(f"benchmark file not found: {resolved}") - return { - "path": str(resolved), - "size_bytes": resolved.stat().st_size, - "sha256": _sha256_file(resolved), - } - - -def _sha256_file(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def frozen_production_pin_mismatches( - evidence: dict[str, Any], -) -> list[dict[str, Any]]: - """Return mismatches against the hard-pinned production eCPS surface.""" - - mismatches: list[dict[str, Any]] = [] - certificate_type = evidence.get("certificate_type") - if certificate_type != FROZEN_PRODUCTION_ECPS_CERTIFICATE_TYPE: - return mismatches - for field, expected in FROZEN_PRODUCTION_ECPS_REQUIRED_EVIDENCE.items(): - actual = evidence.get(field) - if actual is None: - continue - if str(actual) != str(expected): - mismatches.append( - { - "field": field, - "expected_production_pin": expected, - "actual": actual, - } - ) - return mismatches - - -def _assert_manifest_uses_frozen_production_pins( - manifest: dict[str, Any], -) -> None: 
- evidence = { - "certificate_type": manifest.get("certificate_type"), - "period": manifest.get("period"), - "baseline_dataset.sha256": (manifest.get("baseline_dataset") or {}).get( - "sha256" - ), - "target_db.sha256": (manifest.get("target_db") or {}).get("sha256"), - "target_surface.target_profile": ( - manifest.get("target_surface") or {} - ).get("target_profile"), - "target_surface.target_scope": ( - manifest.get("target_surface") or {} - ).get("target_scope"), - } - mismatches = frozen_production_pin_mismatches(evidence) - if mismatches: - details = ", ".join( - f"{item['field']}={item['actual']!r} " - f"(expected {item['expected_production_pin']!r})" - for item in mismatches - ) - raise ValueError( - "frozen production eCPS benchmark manifest does not use the " - f"release-pinned baseline/target surface: {details}" - ) - - -def _git_output(repo_path: Path, *args: str) -> str: - completed = subprocess.run( - ["git", "-C", str(repo_path), *args], - check=False, - capture_output=True, - text=True, - ) - if completed.returncode != 0: - detail = completed.stderr.strip() or completed.stdout.strip() - raise ValueError(f"git {' '.join(args)} failed for {repo_path}: {detail}") - return completed.stdout.strip() - - -def _git_dirty(repo_path: Path) -> bool: - return bool(_git_output(repo_path, "status", "--porcelain")) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Write a pinned benchmark manifest for MP replacement gates." 
- ) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--target-db", required=True) - parser.add_argument("--output-json", required=True) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--target-profile", default="pe_native_broad") - parser.add_argument("--target-scope", default="all") - parser.add_argument("--target-count", type=int, required=True) - parser.add_argument("--target-names-sha256", required=True) - parser.add_argument("--scoring-config-sha256", required=True) - parser.add_argument("--baseline-enhanced-cps-native-loss", type=float, required=True) - parser.add_argument("--baseline-holdout-loss", type=float, required=True) - parser.add_argument("--baseline-unweighted-msre", type=float, required=True) - parser.add_argument( - "--certificate-type", - default="frozen_production_ecps_baseline", - ) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-commit") - parser.add_argument("--policyengine-us-version") - parser.add_argument( - "--allow-dirty-policyengine-us-data", - action="store_true", - help=( - "Allow a dirty policyengine-us-data repo and record that dirty state " - "in the manifest." - ), - ) - parser.add_argument( - "--allow-noncanonical-production-pins", - action="store_true", - help=( - "Allow writing an experimental manifest whose frozen-production " - "fields do not match the canonical production eCPS baseline, target " - "DB, and all-target surface. Release gates still reject it." 
- ), - ) - args = parser.parse_args(argv) - - written = write_mp_benchmark_manifest( - args.output_json, - baseline_dataset_path=args.baseline_dataset, - target_db_path=args.target_db, - period=args.period, - target_profile=args.target_profile, - target_scope=args.target_scope, - target_count=args.target_count, - target_names_sha256=args.target_names_sha256, - scoring_config_sha256=args.scoring_config_sha256, - baseline_enhanced_cps_native_loss=args.baseline_enhanced_cps_native_loss, - baseline_holdout_loss=args.baseline_holdout_loss, - baseline_unweighted_msre=args.baseline_unweighted_msre, - certificate_type=args.certificate_type, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_commit=args.policyengine_us_data_commit, - policyengine_us_version=args.policyengine_us_version, - allow_dirty_policyengine_us_data=args.allow_dirty_policyengine_us_data, - enforce_production_pins=not args.allow_noncanonical_production_pins, - ) - print(written) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/pe_focus_targets.py b/src/microplex_us/pipelines/pe_focus_targets.py deleted file mode 100644 index 1ea720b5..00000000 --- a/src/microplex_us/pipelines/pe_focus_targets.py +++ /dev/null @@ -1,327 +0,0 @@ -"""Targeted PE-native focus checks for ACA spending/enrollment and state AGI bins.""" - -from __future__ import annotations - -import json -import subprocess -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.pe_native_scores import ( - build_policyengine_us_data_subprocess_env, - resolve_policyengine_us_data_repo_root, -) - -_PE_FOCUS_TARGETS_SCRIPT = r""" -import json -import sys -from pathlib import Path -from typing import Any - -import sqlite3 -import numpy as np -from policyengine_core.data import Dataset -from policyengine_us import Microsimulation -from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.utils.census import 
STATE_ABBREV_TO_FIPS - -PERIOD = int(sys.argv[1]) -BASELINE_PATH = sys.argv[2] -CANDIDATE_PATH = sys.argv[3] - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def get_agi_band_label(lower: float, upper: float) -> str: - if lower == -np.inf: - return f"-inf_{int(upper)}" - if upper == np.inf: - return f"{int(lower)}_inf" - return f"{int(lower)}_{int(upper)}" - - -def _load_focus_targets_from_db(period: int) -> list[dict[str, Any]]: - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - try: - rows = conn.execute( - ''' - SELECT - t.target_id, - t.variable, - t.value, - t.period, - t.stratum_id, - sc.constraint_variable, - sc.operation, - sc.value AS constraint_value - FROM targets t - JOIN strata s ON t.stratum_id = s.stratum_id - LEFT JOIN stratum_constraints sc - ON s.stratum_id = sc.stratum_id - WHERE t.active = 1 - AND t.reform_id = 0 - AND t.period <= ? 
- AND t.variable IN ('aca_ptc', 'person_count', 'adjusted_gross_income', 'tax_unit_count') - ''', - (period,), - ).fetchall() - finally: - conn.close() - - targets_by_id: dict[int, dict[str, Any]] = {} - for row in rows: - target_id = int(row["target_id"]) - target = targets_by_id.setdefault( - target_id, - { - "target_id": target_id, - "variable": row["variable"], - "value": float(row["value"]), - "period": int(row["period"]), - "stratum_id": int(row["stratum_id"]), - "constraints": [], - }, - ) - if row["constraint_variable"] is not None: - target["constraints"].append( - { - "variable": row["constraint_variable"], - "operation": row["operation"], - "value": row["constraint_value"], - } - ) - - best_targets: dict[tuple[int, str], dict[str, Any]] = {} - for target in targets_by_id.values(): - key = (target["stratum_id"], target["variable"]) - existing = best_targets.get(key) - if existing is None or target["period"] > existing["period"]: - best_targets[key] = target - - fips_to_state = {v: k for k, v in STATE_ABBREV_TO_FIPS.items()} - - focus_targets: list[dict[str, Any]] = [] - for target in best_targets.values(): - constraints = target["constraints"] - if not constraints: - continue - - constraint_vars = {c["variable"] for c in constraints} - if "state_fips" not in constraint_vars: - continue - if "congressional_district_geoid" in constraint_vars: - continue - - state_value = next( - (c["value"] for c in constraints if c["variable"] == "state_fips"), - None, - ) - if state_value is None: - continue - state_fips = f"{int(state_value):02d}" - state_abbrev = fips_to_state.get(state_fips) - if state_abbrev is None: - continue - - if target["variable"] == "aca_ptc": - focus_targets.append( - { - "target_name": f"nation/irs/aca_spending/{state_abbrev.lower()}", - "target": target["value"], - "state_abbrev": state_abbrev, - "kind": "aca_spending", - } - ) - continue - - if target["variable"] == "person_count" and ( - "aca_ptc" in constraint_vars or 
"is_aca_ptc_eligible" in constraint_vars - ): - focus_targets.append( - { - "target_name": f"state/irs/aca_enrollment/{state_abbrev.lower()}", - "target": target["value"], - "state_abbrev": state_abbrev, - "kind": "aca_enrollment", - } - ) - continue - - if target["variable"] not in {"adjusted_gross_income", "tax_unit_count"}: - continue - - if "adjusted_gross_income" not in constraint_vars: - continue - - lower = float("-inf") - upper = float("inf") - for c in constraints: - if c["variable"] != "adjusted_gross_income": - continue - value = float(c["value"]) - if c["operation"] in (">=", ">"): - lower = max(lower, value) - elif c["operation"] in ("<=", "<"): - upper = min(upper, value) - - band = get_agi_band_label(lower, upper) - focus_targets.append( - { - "target_name": f"state/{state_abbrev}/{target['variable']}/{band}", - "target": target["value"], - "state_abbrev": state_abbrev, - "kind": "agi", - "agi_lower": lower, - "agi_upper": upper, - "is_count": target["variable"] == "tax_unit_count", - } - ) - - return focus_targets - - -def compute_focus(dataset_path: str) -> list[dict[str, float | str]]: - dataset_cls = dataset_from_path(dataset_path, Path(dataset_path).stem) - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, - ).values.astype(np.float64) - - rows: list[dict[str, float | str]] = [] - - focus_targets = _load_focus_targets_from_db(PERIOD) - aca_value = sim.calculate("aca_ptc", map_to="household", period=2025).values - state_household = sim.calculate("state_code", map_to="household").values - state_person = sim.calculate("state_code", map_to="person").values - in_tax_unit_with_aca = ( - sim.calculate("aca_ptc", map_to="person", period=2025).values > 0 - ) - is_aca_eligible = sim.calculate( - "is_aca_ptc_eligible", map_to="person", period=2025 - ).values - is_enrolled = in_tax_unit_with_aca & is_aca_eligible - agi = 
sim.calculate("adjusted_gross_income").values - state_tax_unit = sim.map_result( - state_person, "person", "tax_unit", how="value_from_first_person" - ) - - for target in focus_targets: - kind = target["kind"] - state_abbrev = target["state_abbrev"] - - if kind == "aca_spending": - in_state = state_household == state_abbrev - metric = aca_value * in_state - elif kind == "aca_enrollment": - in_state = state_person == state_abbrev - metric = sim.map_result(in_state & is_enrolled, "person", "household") - else: - lower = float(target.get("agi_lower", float("-inf"))) - upper = float(target.get("agi_upper", float("inf"))) - in_state = state_tax_unit == state_abbrev - in_band = (agi > lower) & (agi <= upper) - if target.get("is_count"): - metric = (in_state & in_band & (agi > 0)).astype(float) - else: - metric = np.where(in_state & in_band, agi, 0.0) - metric = sim.map_result(metric, "tax_unit", "household") - - estimate = float(np.sum(metric * weights)) - rows.append( - { - "target_name": target["target_name"], - "target": float(target["target"]), - "estimate": estimate, - } - ) - - return rows - - -baseline_rows = compute_focus(BASELINE_PATH) -candidate_rows = compute_focus(CANDIDATE_PATH) - -baseline_map = {row["target_name"]: row for row in baseline_rows} -candidate_map = {row["target_name"]: row for row in candidate_rows} - -rows = [] -for name in sorted(set(baseline_map) | set(candidate_map)): - base = baseline_map.get(name, {}) - cand = candidate_map.get(name, {}) - target = base.get("target", cand.get("target")) - baseline_est = base.get("estimate") - candidate_est = cand.get("estimate") - rows.append( - { - "target_name": name, - "target": target, - "baseline_estimate": baseline_est, - "candidate_estimate": candidate_est, - "candidate_over_target": ( - candidate_est / target if target not in (None, 0) else None - ), - "candidate_over_baseline": ( - candidate_est / baseline_est - if baseline_est not in (None, 0) - else None - ), - } - ) - 
-print(json.dumps({"rows": rows}, sort_keys=True)) -""".strip() - - -def compare_us_pe_focus_targets( - *, - baseline_dataset_path: str | Path, - candidate_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> dict[str, Any]: - """Compare ACA/AGI focus targets between a baseline and candidate dataset.""" - - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_FOCUS_TARGETS_SCRIPT, - str(int(period)), - str(Path(baseline_dataset_path).expanduser().resolve()), - str(Path(candidate_dataset_path).expanduser().resolve()), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE focus target comparison failed: {detail}") - return json.loads(completed.stdout) - - -__all__ = ["compare_us_pe_focus_targets"] diff --git a/src/microplex_us/pipelines/pe_l0.py b/src/microplex_us/pipelines/pe_l0.py deleted file mode 100644 index 761de42f..00000000 --- a/src/microplex_us/pipelines/pe_l0.py +++ /dev/null @@ -1,460 +0,0 @@ -"""Adapters for PolicyEngine US calibration backends.""" - -from __future__ import annotations - -import inspect -import os -import sys -from collections.abc import Callable -from os import PathLike -from pathlib import Path -from typing import Any, Self - -import numpy as np -import pandas as pd -from microplex.calibration import ( - LinearConstraint, - _build_sparse_constraint_system, - 
_validate_calibration_inputs, -) -from scipy import sparse as sp - -_PE_US_DATA_REPO_ENV = "MICROPLEX_US_POLICYENGINE_US_DATA_REPO" - - -def make_policyengine_us_data_fit_l0_weights_fn( - repo_root: str | PathLike[str] | None = None, -) -> Callable[..., np.ndarray]: - """Return a lazy wrapper around PE-US-data's L0 weight optimizer. - - Microplex passes adapter-specific diagnostics such as ``target_names`` and - ``initial_weights``. The incumbent PE-US-data function accepts a narrower - signature, so this wrapper keeps the public hook stable while delegating - only supported arguments. - """ - - def _fit_l0_weights(**kwargs: Any) -> np.ndarray: - fit_l0_weights = _load_policyengine_us_data_fit_l0_weights(repo_root) - accepted_parameters = set(inspect.signature(fit_l0_weights).parameters) - call_kwargs = { - key: value for key, value in kwargs.items() if key in accepted_parameters - } - return np.asarray(fit_l0_weights(**call_kwargs), dtype=float) - - return _fit_l0_weights - - -def _load_policyengine_us_data_fit_l0_weights( - repo_root: str | PathLike[str] | None = None, -) -> Callable[..., np.ndarray]: - resolved_repo = _resolve_policyengine_us_data_repo_root(repo_root) - inserted_path: str | None = None - if resolved_repo is not None: - inserted_path = str(resolved_repo) - if inserted_path not in sys.path: - sys.path.insert(0, inserted_path) - try: - from policyengine_us_data.calibration.unified_calibration import ( - fit_l0_weights, - ) - except ImportError as exc: - location = ( - f" at {resolved_repo}" - if resolved_repo is not None - else " from the active Python environment" - ) - raise RuntimeError( - "The pe_l0 backend requires policyengine-us-data's " - f"fit_l0_weights{location}. Set " - f"{_PE_US_DATA_REPO_ENV} or install policyengine-us-data." 
- ) from exc - finally: - if inserted_path is not None and sys.path[0] == inserted_path: - sys.path.pop(0) - return fit_l0_weights - - -def _resolve_policyengine_us_data_repo_root( - repo_root: str | PathLike[str] | None = None, -) -> Path | None: - candidate = repo_root or os.environ.get(_PE_US_DATA_REPO_ENV) - if candidate is None: - return None - resolved = Path(candidate).expanduser().resolve() - if not (resolved / "policyengine_us_data").exists(): - raise RuntimeError( - "policyengine-us-data repo root does not contain " - f"policyengine_us_data/: {resolved}" - ) - return resolved - - -class PolicyEngineL0Calibrator: - """Legacy L0 adapter for explicit experiments behind the Microplex interface.""" - - def __init__( - self, - *, - lambda_l0: float = 1e-4, - lambda_l2: float = 1e-12, - beta: float = 0.35, - learning_rate: float = 0.15, - epochs: int = 100, - tol: float = 1e-6, - device: str = "cpu", - verbose_freq: int | None = None, - policyengine_us_data_repo_root: str | PathLike[str] | None = None, - policyengine_us_data_python: str | PathLike[str] | None = None, - fit_l0_weights_fn: Callable[..., np.ndarray] | None = None, - ) -> None: - self.lambda_l0 = float(lambda_l0) - self.lambda_l2 = float(lambda_l2) - self.beta = float(beta) - self.learning_rate = float(learning_rate) - self.epochs = int(epochs) - self.tol = float(tol) - self.device = str(device) - self.verbose_freq = verbose_freq - self.policyengine_us_data_repo_root = policyengine_us_data_repo_root - self.policyengine_us_data_python = policyengine_us_data_python - self.fit_l0_weights_fn = fit_l0_weights_fn - - self.weights_: np.ndarray | None = None - self.is_fitted_: bool = False - self.n_records_: int | None = None - self.marginal_targets_: dict[str, dict[str, float]] | None = None - self.continuous_targets_: dict[str, float] | None = None - self.linear_constraints_: tuple[LinearConstraint, ...] 
= () - self.target_names_: list[str] = [] - self.calibration_error_: float = 0.0 - self.max_error_: float = 0.0 - self.converged_: bool = False - self.n_iterations_: int = 0 - self.effective_backend_: str = "policyengine_l0" - self.loss_history_: list[dict[str, float | int]] = [] - - def fit( - self, - data: pd.DataFrame, - marginal_targets: dict[str, dict[str, float]], - continuous_targets: dict[str, float] | None = None, - weight_col: str = "weight", - linear_constraints: tuple[LinearConstraint, ...] - | list[LinearConstraint] - | None = None, - ) -> Self: - self.n_records_ = len(data) - self.marginal_targets_ = marginal_targets - self.continuous_targets_ = continuous_targets or {} - self.linear_constraints_ = tuple(linear_constraints or ()) - - _validate_calibration_inputs( - data, - marginal_targets, - continuous_targets, - self.linear_constraints_, - ) - - # Build the calibration matrix directly in CSR form to avoid the - # ~24 GB dense intermediate that OOM'd v7 at 1.5M records x - # ~4k constraints. See microplex.calibration._build_sparse_constraint_system. 
- X_sparse_built, b, names, _ = _build_sparse_constraint_system( - data, - marginal_targets, - continuous_targets, - self.linear_constraints_, - ) - self.target_names_ = names - - if X_sparse_built.shape[0] == 0: - if weight_col in data.columns: - self.weights_ = data[weight_col].to_numpy(dtype=float, copy=True) - else: - self.weights_ = np.ones(len(data), dtype=float) - self.calibration_error_ = 0.0 - self.max_error_ = 0.0 - self.converged_ = True - self.n_iterations_ = 0 - self.is_fitted_ = True - return self - - if weight_col in data.columns: - initial_weights = data[weight_col].to_numpy(dtype=float, copy=True) - else: - initial_weights = np.ones(len(data), dtype=float) - initial_weights = np.maximum(initial_weights, 1e-12) - - X_sparse = X_sparse_built - weights = self._fit_weights( - X_sparse=X_sparse, - targets=b.astype(np.float64), - initial_weights=initial_weights, - target_names=names, - ) - weights = np.maximum(np.asarray(weights, dtype=float), 0.0) - - residual = X_sparse @ weights - b - rel_errors = np.abs(residual) / np.maximum(np.abs(b), 1e-10) - self.weights_ = weights - self.calibration_error_ = float(np.sqrt(np.mean(rel_errors**2))) - self.max_error_ = float(rel_errors.max()) if len(rel_errors) else 0.0 - if self.effective_backend_ == "dense_projected_gradient": - self.converged_ = bool(self.converged_ or self.max_error_ < self.tol) - else: - self.converged_ = bool(self.max_error_ < self.tol) - self.n_iterations_ = self.epochs - self.is_fitted_ = True - return self - - def _fit_weights( - self, - *, - X_sparse, - targets: np.ndarray, - initial_weights: np.ndarray, - target_names: list[str], - ) -> np.ndarray: - if self.lambda_l0 <= 0.0: - self.effective_backend_ = "dense_projected_gradient" - return self._fit_dense_no_l0_weights( - X_sparse=X_sparse, - targets=targets, - initial_weights=initial_weights, - ) - self.effective_backend_ = "policyengine_l0" - if self.fit_l0_weights_fn is not None: - achievable = 
np.asarray(X_sparse.sum(axis=1)).reshape(-1) > 0 - return self.fit_l0_weights_fn( - X_sparse=X_sparse, - targets=targets, - lambda_l0=self.lambda_l0, - epochs=self.epochs, - device=self.device, - verbose_freq=self.verbose_freq, - beta=self.beta, - lambda_l2=self.lambda_l2, - learning_rate=self.learning_rate, - target_names=target_names, - initial_weights=initial_weights, - achievable=achievable, - ) - raise RuntimeError( - "The pe_l0 backend is legacy/experimental and no longer loads " - "policyengine-us-data implicitly. Pass an explicit fit_l0_weights_fn " - "for an experiment, or use the production entropy/dense calibration " - "path for MP eCPS replacement builds." - ) - - def _fit_dense_no_l0_weights( - self, - *, - X_sparse, - targets: np.ndarray, - initial_weights: np.ndarray, - ) -> np.ndarray: - matrix = X_sparse.tocsr() if sp.issparse(X_sparse) else sp.csr_matrix(X_sparse) - target = np.asarray(targets, dtype=np.float64) - weights = np.maximum(np.asarray(initial_weights, dtype=np.float64), 0.0) - initial_reference = weights.copy() - scale = 1.0 / np.maximum(np.abs(target), 1.0) - - def objective(candidate: np.ndarray) -> float: - residual = (matrix @ candidate - target) * scale - loss = float(np.dot(residual, residual)) - if self.lambda_l2 > 0.0: - delta = candidate - initial_reference - loss += float(self.lambda_l2 * np.dot(delta, delta)) - return loss - - def gradient(candidate: np.ndarray) -> np.ndarray: - residual = (matrix @ candidate - target) * scale - grad = 2.0 * np.asarray(matrix.T @ (residual * scale)).reshape(-1) - if self.lambda_l2 > 0.0: - grad += 2.0 * self.lambda_l2 * (candidate - initial_reference) - return grad - - step_size = 1.0 / _estimate_sparse_quadratic_lipschitz( - matrix, - scale, - self.lambda_l2, - ) - current_loss = objective(weights) - self.loss_history_ = [ - { - "iteration": 0, - "objective_loss": float(current_loss), - "weight_sum": float(weights.sum()), - "positive_record_count": int((weights > 1e-9).sum()), - } - ] - 
self.converged_ = False - completed_iter = 0 - for iteration in range(1, self.epochs + 1): - completed_iter = iteration - grad = gradient(weights) - accepted = False - iteration_step_size = step_size - candidate = weights - candidate_loss = current_loss - for _ in range(30): - trial = np.maximum(weights - iteration_step_size * grad, 0.0) - trial_loss = objective(trial) - if trial_loss <= current_loss: - candidate = trial - candidate_loss = trial_loss - accepted = True - break - iteration_step_size *= 0.5 - if not accepted: - self.converged_ = True - break - improvement = current_loss - candidate_loss - weights = candidate - current_loss = candidate_loss - self.loss_history_.append( - { - "iteration": int(iteration), - "objective_loss": float(current_loss), - "weight_sum": float(weights.sum()), - "positive_record_count": int((weights > 1e-9).sum()), - } - ) - if improvement < self.tol * max(1.0, current_loss): - self.converged_ = True - break - self.n_iterations_ = completed_iter - return weights - - def transform( - self, - data: pd.DataFrame, - weight_col: str = "weight", - ) -> pd.DataFrame: - if not self.is_fitted_: - raise ValueError("Not fitted.") - if len(data) != self.n_records_: - raise ValueError( - f"Data length ({len(data)}) doesn't match fitted ({self.n_records_})" - ) - result = data.copy() - result[weight_col] = self.weights_ - return result - - def fit_transform( - self, - data: pd.DataFrame, - marginal_targets: dict[str, dict[str, float]], - continuous_targets: dict[str, float] | None = None, - weight_col: str = "weight", - linear_constraints: tuple[LinearConstraint, ...] 
- | list[LinearConstraint] - | None = None, - ) -> pd.DataFrame: - self.fit( - data, - marginal_targets, - continuous_targets, - weight_col=weight_col, - linear_constraints=linear_constraints, - ) - return self.transform(data, weight_col=weight_col) - - def get_sparsity(self) -> float: - if not self.is_fitted_: - raise ValueError("Not fitted.") - return float((self.weights_ < 1e-9).sum() / self.n_records_) - - def validate(self, data: pd.DataFrame) -> dict[str, Any]: - if not self.is_fitted_: - raise ValueError("Not fitted.") - - weights = self.weights_ - results = { - "backend": self.effective_backend_, - "uses_gates": self.effective_backend_ == "policyengine_l0", - "targets": {}, - "marginal_errors": {}, - "continuous_errors": {}, - "linear_errors": {}, - "sparsity": self.get_sparsity(), - "converged": self.converged_, - "iterations": self.n_iterations_, - "loss_history": self.loss_history_, - } - - if self.marginal_targets_: - for var, var_targets in self.marginal_targets_.items(): - results["marginal_errors"][var] = {} - for category, target in var_targets.items(): - mask = data[var] == category - actual = weights[mask].sum() - rel_error = abs(actual - target) / target if target > 0 else 0.0 - info = { - "actual": actual, - "target": target, - "relative_error": rel_error, - } - results["marginal_errors"][var][category] = info - results["targets"][f"{var}={category}"] = { - **info, - "error": rel_error, - } - - if self.continuous_targets_: - for var, target in self.continuous_targets_.items(): - actual = float((weights * data[var].to_numpy(dtype=float)).sum()) - rel_error = abs(actual - target) / abs(target) if target != 0 else 0.0 - info = { - "actual": actual, - "target": target, - "relative_error": rel_error, - } - results["continuous_errors"][var] = info - results["targets"][var] = { - **info, - "error": rel_error, - } - - for constraint in self.linear_constraints_: - actual = float(weights @ constraint.coefficients) - target = float(constraint.target) - 
rel_error = abs(actual - target) / abs(target) if target != 0 else 0.0 - results["linear_errors"][constraint.name] = { - "actual": actual, - "target": target, - "relative_error": rel_error, - } - - errors = [t["error"] for t in results["targets"].values()] - errors.extend( - item["relative_error"] for item in results["linear_errors"].values() - ) - results["max_error"] = max(errors) if errors else 0.0 - results["mean_error"] = float(np.mean(errors)) if errors else 0.0 - results["rmse"] = self.calibration_error_ - return results - - -def _estimate_sparse_quadratic_lipschitz( - matrix, - row_scale: np.ndarray, - l2_penalty: float, -) -> float: - if matrix.shape[0] == 0 or matrix.shape[1] == 0: - return max(2.0 * l2_penalty, 1.0) - scale_squared = np.square(np.asarray(row_scale, dtype=np.float64)) - vector = np.ones(matrix.shape[1], dtype=np.float64) - vector /= np.linalg.norm(vector) - for _ in range(25): - transformed = np.asarray( - matrix.T @ (scale_squared * np.asarray(matrix @ vector).reshape(-1)) - ).reshape(-1) - norm = np.linalg.norm(transformed) - if norm < 1e-12: - return max(2.0 * l2_penalty, 1.0) - vector = transformed / norm - transformed = np.asarray( - matrix.T @ (scale_squared * np.asarray(matrix @ vector).reshape(-1)) - ).reshape(-1) - eigenvalue = float(np.dot(vector, transformed)) - return max(2.0 * eigenvalue + 2.0 * l2_penalty, 1e-6) diff --git a/src/microplex_us/pipelines/pe_native_calibration_benchmark.py b/src/microplex_us/pipelines/pe_native_calibration_benchmark.py deleted file mode 100644 index 4290ea2a..00000000 --- a/src/microplex_us/pipelines/pe_native_calibration_benchmark.py +++ /dev/null @@ -1,709 +0,0 @@ -"""Benchmark PE-native calibration strategies on a common target surface.""" - -from __future__ import annotations - -import argparse -import json -import re -import subprocess -import sys -from collections.abc import Mapping, Sequence -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path 
-from tempfile import TemporaryDirectory -from time import perf_counter -from typing import Any - -import h5py -import numpy as np - -from microplex_us.pipelines.pe_native_loss import loss_arrays_from_inputs -from microplex_us.pipelines.pe_native_optimization import ( - _PE_NATIVE_BROAD_MATRIX_SCRIPT, - optimize_pe_native_loss_weights, - rewrite_policyengine_us_dataset_weights, -) -from microplex_us.pipelines.pe_native_scores import ( - _ENHANCED_CPS_BAD_TARGETS, - build_policyengine_us_data_subprocess_env, - compute_batch_us_pe_native_scores, - resolve_policyengine_us_data_repo_root, -) - -_DEFAULT_PE_NATIVE_BASELINE_CACHE_DIR = ( - Path.home() / ".cache" / "microplex-us" / "pe-native-baseline" -) - - -@dataclass(frozen=True) -class CalibrationBenchmarkVariant: - """One dataset variant to score in a PE-native calibration benchmark.""" - - label: str - method: str - dataset_path: str - generated: bool = False - optimization: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - return { - "label": self.label, - "method": self.method, - "dataset_path": self.dataset_path, - "generated": self.generated, - "optimization": dict(self.optimization), - } - - -def _household_weights( - dataset_path: str | Path, - *, - period: int, -) -> tuple[np.ndarray, np.ndarray]: - path = Path(dataset_path).expanduser().resolve() - period_key = str(period) - with h5py.File(path, "r") as handle: - if "household_id" not in handle or period_key not in handle["household_id"]: - raise ValueError(f"{path} is missing household_id/{period_key}") - if ( - "household_weight" not in handle - or period_key not in handle["household_weight"] - ): - raise ValueError(f"{path} is missing household_weight/{period_key}") - household_ids = np.asarray(handle["household_id"][period_key], dtype=np.int64) - weights = np.asarray( - handle["household_weight"][period_key], - dtype=np.float64, - ) - if household_ids.shape[0] != weights.shape[0]: - raise ValueError(f"{path} 
household_id and household_weight lengths differ") - return household_ids, weights - - -def _reference_aligned_weights( - household_ids: np.ndarray, - reference_dataset_path: str | Path, - *, - period: int, -) -> tuple[str, np.ndarray | None]: - reference_ids, reference_weights = _household_weights( - reference_dataset_path, - period=period, - ) - if household_ids.shape == reference_ids.shape and np.array_equal( - household_ids, - reference_ids, - ): - return "same_order", reference_weights - if len(np.unique(reference_ids)) != len(reference_ids): - return "reference_duplicate_household_ids", None - reference_by_id = { - int(household_id): float(weight) - for household_id, weight in zip(reference_ids, reference_weights, strict=True) - } - if all(int(household_id) in reference_by_id for household_id in household_ids): - return ( - "matched_by_household_id", - np.asarray( - [reference_by_id[int(household_id)] for household_id in household_ids], - dtype=np.float64, - ), - ) - return "not_comparable", None - - -def compute_household_weight_diagnostics( - dataset_path: str | Path, - *, - period: int = 2024, - reference_dataset_path: str | Path | None = None, -) -> dict[str, Any]: - """Summarize household weight quality and optional distance from a reference.""" - - resolved = Path(dataset_path).expanduser().resolve() - household_ids, weights = _household_weights(resolved, period=period) - n_households = int(len(weights)) - positive = weights > 0.0 - weight_sum = float(weights.sum()) - square_sum = float(np.dot(weights, weights)) - effective_sample_size = ( - weight_sum * weight_sum / square_sum if square_sum > 0.0 else 0.0 - ) - diagnostics: dict[str, Any] = { - "dataset_path": str(resolved), - "period": int(period), - "household_count": n_households, - "positive_household_count": int(positive.sum()), - "zero_household_count": int((weights == 0.0).sum()), - "negative_household_count": int((weights < 0.0).sum()), - "weight_sum": weight_sum, - "weight_mean": 
float(weights.mean()) if n_households else 0.0, - "weight_median": float(np.median(weights)) if n_households else 0.0, - "weight_min": float(weights.min()) if n_households else 0.0, - "weight_max": float(weights.max()) if n_households else 0.0, - "weight_p95": float(np.quantile(weights, 0.95)) if n_households else 0.0, - "weight_p99": float(np.quantile(weights, 0.99)) if n_households else 0.0, - "max_to_mean_weight_ratio": ( - float(weights.max() / weights.mean()) - if n_households and weights.mean() > 0.0 - else None - ), - "effective_sample_size": float(effective_sample_size), - "effective_sample_size_share": ( - float(effective_sample_size / n_households) if n_households else None - ), - } - - if reference_dataset_path is None: - return diagnostics - - alignment, reference_weights = _reference_aligned_weights( - household_ids, - reference_dataset_path, - period=period, - ) - diagnostics["reference_dataset_path"] = str( - Path(reference_dataset_path).expanduser().resolve() - ) - diagnostics["reference_alignment"] = alignment - if reference_weights is None: - return diagnostics - - delta = weights - reference_weights - reference_sum = float(reference_weights.sum()) - diagnostics.update( - { - "reference_weight_sum": reference_sum, - "weight_sum_delta": float(weight_sum - reference_sum), - "l1_delta_as_share_of_reference_sum": ( - float(np.abs(delta).sum() / abs(reference_sum)) - if reference_sum != 0.0 - else None - ), - "mean_abs_weight_delta": float(np.abs(delta).mean()), - "rms_weight_delta": float(np.sqrt(np.mean(delta * delta))), - "max_abs_weight_delta": float(np.abs(delta).max()) if len(delta) else 0.0, - "changed_household_count": int((np.abs(delta) > 1e-9).sum()), - "changed_household_share": ( - float((np.abs(delta) > 1e-9).mean()) if len(delta) else None - ), - } - ) - return diagnostics - - -def _slugify_label(label: str) -> str: - slug = re.sub(r"[^A-Za-z0-9_.-]+", "-", label.strip()).strip("-") - return slug or "variant" - - -def _log(message: str) 
-> None: - timestamp = datetime.now().isoformat(timespec="seconds") - print(f"[{timestamp}] {message}", file=sys.stderr, flush=True) - - -def _penalty_label(value: float) -> str: - if value == 0.0: - return "pe_native_unconstrained" - return f"pe_native_l2_{value:g}".replace("+", "") - - -def _parse_existing_candidates(values: Sequence[str] | None) -> dict[str, Path]: - candidates: dict[str, Path] = {} - for value in values or (): - if "=" not in value: - raise ValueError( - "--existing-candidate must be formatted as label=/path/to/file.h5" - ) - label, path = value.split("=", 1) - label = label.strip() - if not label: - raise ValueError("--existing-candidate label cannot be empty") - candidates[label] = Path(path).expanduser() - return candidates - - -def _parse_float_list(value: str | None) -> tuple[float, ...]: - if value is None: - return () - stripped = value.strip() - if not stripped: - return () - return tuple(float(item.strip()) for item in stripped.split(",") if item.strip()) - - -def _resolve_target_total_weight( - *, - input_dataset_path: str | Path, - baseline_dataset_path: str | Path, - period: int, - target_total_weight: float | None, - target_total_weight_source: str, -) -> tuple[float | None, str]: - if target_total_weight is not None: - return float(target_total_weight), "explicit" - if target_total_weight_source == "preserve-input": - return None, "preserve-input" - if target_total_weight_source == "input": - _, input_weights = _household_weights(input_dataset_path, period=period) - return float(input_weights.sum()), "input" - if target_total_weight_source == "baseline": - _, baseline_weights = _household_weights(baseline_dataset_path, period=period) - return float(baseline_weights.sum()), "baseline" - raise ValueError( - "target_total_weight_source must be one of preserve-input, input, baseline" - ) - - -def _extract_pe_native_loss_inputs( - *, - input_dataset_path: str | Path, - period: int, - policyengine_us_data_repo: str | Path | None, - 
policyengine_us_data_python: str | Path | None, - skip_tax_expenditure_targets: bool, - target_scope_filter: str | None, -) -> dict[str, Any]: - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - _log("extracting PE-native loss matrix") - with TemporaryDirectory(prefix="microplex-us-pe-native-benchmark-") as temp_dir: - prefix = Path(temp_dir) / "pe_native_matrix" - started_at = perf_counter() - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_BROAD_MATRIX_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(input_dataset_path).expanduser().resolve()), - "1" if skip_tax_expenditure_targets else "0", - str(prefix), - target_scope_filter or "", - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - detail = ( - completed.stderr.strip() - or completed.stdout.strip() - or str(completed.returncode) - ) - raise RuntimeError(f"PE-native loss-matrix extraction failed: {detail}") - _log(f"extracted PE-native loss matrix in {perf_counter() - started_at:.1f}s") - return { - "scaled_matrix": np.load(prefix.with_suffix(".matrix.npy")), - "scaled_target": np.load(prefix.with_suffix(".target.npy")), - "initial_weights": np.load(prefix.with_suffix(".weights.npy")), - "unscaled_target": np.load(prefix.with_suffix(".target_unscaled.npy")), - "loss_denominator": np.load(prefix.with_suffix(".loss_denominator.npy")), - "loss_target_weight": np.load( - prefix.with_suffix(".loss_target_weight.npy") - ), - "loss_bucket": np.load( - prefix.with_suffix(".loss_bucket.npy"), allow_pickle=True - ), - "loss_unit": np.load( - prefix.with_suffix(".loss_unit.npy"), allow_pickle=True 
- ), - "loss_scope": np.load( - prefix.with_suffix(".loss_scope.npy"), allow_pickle=True - ), - "loss_family": np.load( - prefix.with_suffix(".loss_family.npy"), allow_pickle=True - ), - "loss_epsilon": np.load(prefix.with_suffix(".loss_epsilon.npy")), - "metadata": json.loads(prefix.with_suffix(".meta.json").read_text()), - } - - -def build_policyengine_us_native_calibration_benchmark( - *, - input_dataset_path: str | Path, - baseline_dataset_path: str | Path, - output_dir: str | Path, - period: int = 2024, - l2_penalties: Sequence[float] = (0.0, 1e-12, 1e-10, 1e-8), - max_iter: int = 200, - tol: float = 1e-8, - budget: int | None = None, - target_total_weight: float | None = None, - target_total_weight_source: str = "preserve-input", - existing_candidates: Mapping[str, str | Path] | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - batch_households: int | None = None, - baseline_cache_dir: str | Path | None = _DEFAULT_PE_NATIVE_BASELINE_CACHE_DIR, - skip_tax_expenditure_targets: bool = False, - target_scope_filter: str | None = None, - force: bool = False, -) -> dict[str, Any]: - """Run and score PE-native calibration variants against one baseline.""" - - started_at = perf_counter() - input_path = Path(input_dataset_path).expanduser().resolve() - baseline_path = Path(baseline_dataset_path).expanduser().resolve() - destination = Path(output_dir).expanduser().resolve() - destination.mkdir(parents=True, exist_ok=True) - - resolved_target_total_weight, target_total_weight_resolved_from = ( - _resolve_target_total_weight( - input_dataset_path=input_path, - baseline_dataset_path=baseline_path, - period=period, - target_total_weight=target_total_weight, - target_total_weight_source=target_total_weight_source, - ) - ) - - variants: list[CalibrationBenchmarkVariant] = [ - CalibrationBenchmarkVariant( - label="input", - method="existing_input", - dataset_path=str(input_path), - ) - ] - for label, 
path in (existing_candidates or {}).items(): - variants.append( - CalibrationBenchmarkVariant( - label=label, - method="existing_candidate", - dataset_path=str(Path(path).expanduser().resolve()), - ) - ) - - loss_inputs = ( - _extract_pe_native_loss_inputs( - input_dataset_path=input_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - skip_tax_expenditure_targets=skip_tax_expenditure_targets, - target_scope_filter=target_scope_filter, - ) - if l2_penalties - else None - ) - - for penalty in l2_penalties: - penalty = float(penalty) - label = _penalty_label(penalty) - if resolved_target_total_weight is not None: - label = f"{label}_{target_total_weight_resolved_from}_total" - output_path = destination / f"{_slugify_label(label)}.h5" - optimization_path = output_path.with_suffix(".optimization.json") - if force or not output_path.exists(): - if loss_inputs is None: - raise RuntimeError("PE-native loss inputs were not extracted") - _log(f"optimizing {label} with l2_penalty={penalty:g}") - optimization_started_at = perf_counter() - optimized_weights, summary = optimize_pe_native_loss_weights( - scaled_matrix=loss_inputs["scaled_matrix"], - scaled_target=loss_inputs["scaled_target"], - initial_weights=loss_inputs["initial_weights"], - loss_arrays=loss_arrays_from_inputs(loss_inputs), - budget=budget, - max_iter=max_iter, - l2_penalty=penalty, - tol=tol, - target_total_weight=resolved_target_total_weight, - ) - _log( - f"optimized {label} in " - f"{perf_counter() - optimization_started_at:.1f}s; " - f"loss {summary['initial_loss']:.6g} -> " - f"{summary['optimized_loss']:.6g}" - ) - _log(f"rewriting weights for {label}") - rewritten = rewrite_policyengine_us_dataset_weights( - input_dataset_path=input_path, - output_dataset_path=output_path, - household_weights=optimized_weights, - period=period, - ) - optimization = { - "metric": "enhanced_cps_native_loss_weight_optimization", - 
"period": int(period), - "input_dataset": str(input_path), - "output_dataset": str(rewritten), - "initial_loss": float(summary["initial_loss"]), - "optimized_loss": float(summary["optimized_loss"]), - "loss_delta": float(summary["loss_delta"]), - "initial_weight_sum": float(summary["initial_weight_sum"]), - "optimized_weight_sum": float(summary["optimized_weight_sum"]), - "household_count": int(summary["household_count"]), - "positive_household_count": int(summary["positive_household_count"]), - "budget": summary["budget"], - "converged": bool(summary["converged"]), - "iterations": int(summary["iterations"]), - "target_names": list(loss_inputs["metadata"]["target_names"]), - "skip_tax_expenditure_targets": bool( - loss_inputs["metadata"].get( - "skip_tax_expenditure_targets", - skip_tax_expenditure_targets, - ) - ), - "l2_penalty": penalty, - "target_total_weight": resolved_target_total_weight, - "target_total_weight_resolved_from": target_total_weight_resolved_from, - "optimizer_method": summary.get("method"), - "step_size": summary.get("step_size"), - "initial_step_size": summary.get("initial_step_size"), - "line_search_backtracking_steps": summary.get( - "line_search_backtracking_steps" - ), - "history_interval": summary.get("history_interval"), - "loss_history": summary.get("loss_history", []), - "reused_existing_output": False, - } - optimization_path.write_text( - json.dumps(optimization, indent=2, sort_keys=True, allow_nan=False) - ) - else: - _log(f"reusing existing optimized dataset for {label}") - optimization = ( - json.loads(optimization_path.read_text()) - if optimization_path.exists() - else {} - ) - optimization.update( - { - "l2_penalty": penalty, - "target_total_weight": resolved_target_total_weight, - "target_total_weight_resolved_from": ( - target_total_weight_resolved_from - ), - "reused_existing_output": True, - } - ) - variants.append( - CalibrationBenchmarkVariant( - label=label, - method="pe_native_weight_optimization", - 
dataset_path=str(output_path.resolve()), - generated=True, - optimization=optimization, - ) - ) - - _log(f"scoring {len(variants)} calibration variants") - scoring_started_at = perf_counter() - scores = compute_batch_us_pe_native_scores( - candidate_dataset_paths=[variant.dataset_path for variant in variants], - baseline_dataset_path=baseline_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_scope_filter=target_scope_filter, - ) - _log(f"scored variants in {perf_counter() - scoring_started_at:.1f}s") - scores_by_dataset = { - str(Path(score["broad_loss"]["candidate_dataset"]).resolve()): score - for score in scores - } - - rows: list[dict[str, Any]] = [] - for variant in variants: - dataset_key = str(Path(variant.dataset_path).resolve()) - score = scores_by_dataset[dataset_key] - broad_loss = score["broad_loss"] - rows.append( - { - **variant.to_dict(), - "score_summary": score["summary"], - "broad_loss": broad_loss, - "family_breakdown": score.get("family_breakdown", []), - "weight_diagnostics": compute_household_weight_diagnostics( - variant.dataset_path, - period=period, - reference_dataset_path=input_path, - ), - } - ) - - ranked_rows = sorted( - rows, - key=lambda row: row["score_summary"]["candidate_enhanced_cps_native_loss"], - ) - baseline_loss = ( - float(rows[0]["score_summary"]["baseline_enhanced_cps_native_loss"]) - if rows - else None - ) - payload: dict[str, Any] = { - "schema_version": 1, - "metric": "pe_native_calibration_strategy_benchmark", - "period": int(period), - "input_dataset": str(input_path), - "baseline_dataset": str(baseline_path), - "output_dir": str(destination), - "skip_tax_expenditure_targets": bool(skip_tax_expenditure_targets), - "target_scope_filter": target_scope_filter, - "target_total_weight": resolved_target_total_weight, - "target_total_weight_resolved_from": target_total_weight_resolved_from, - "budget": None if budget is None 
else int(budget), - "max_iter": int(max_iter), - "tol": float(tol), - "l2_penalties": [float(value) for value in l2_penalties], - "baseline_enhanced_cps_native_loss": baseline_loss, - "best_variant_label": ranked_rows[0]["label"] if ranked_rows else None, - "best_variant_loss": ( - float(ranked_rows[0]["score_summary"]["candidate_enhanced_cps_native_loss"]) - if ranked_rows - else None - ), - "variant_count": len(rows), - "rows": rows, - "ranking": [ - { - "label": row["label"], - "method": row["method"], - "candidate_enhanced_cps_native_loss": row["score_summary"][ - "candidate_enhanced_cps_native_loss" - ], - "enhanced_cps_native_loss_delta": row["score_summary"][ - "enhanced_cps_native_loss_delta" - ], - "effective_sample_size_share": row["weight_diagnostics"][ - "effective_sample_size_share" - ], - "l1_delta_as_share_of_reference_sum": row["weight_diagnostics"].get( - "l1_delta_as_share_of_reference_sum" - ), - } - for row in ranked_rows - ], - "elapsed_seconds": perf_counter() - started_at, - } - return payload - - -def write_policyengine_us_native_calibration_benchmark( - output_path: str | Path, - **kwargs: Any, -) -> Path: - """Build a PE-native calibration benchmark and write it as JSON.""" - - payload = build_policyengine_us_native_calibration_benchmark(**kwargs) - destination = Path(output_path).expanduser().resolve() - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return destination - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description=( - "Benchmark input, existing, unconstrained, and penalized PE-native " - "calibration variants on the same PE-native broad target surface." - ) - ) - parser.add_argument("--input-dataset", required=True) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--output-dir", required=True) - parser.add_argument( - "--output-path", - help=( - "Benchmark JSON path. 
Defaults to " - "/pe_native_calibration_benchmark.json." - ), - ) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument( - "--l2-penalties", - default="0,1e-12,1e-10,1e-8", - help=( - "Comma-separated PE-native optimization penalties. " - "Use an empty string to score only existing datasets." - ), - ) - parser.add_argument("--max-iter", type=int, default=200) - parser.add_argument("--tol", type=float, default=1e-8) - parser.add_argument("--budget", type=int) - parser.add_argument("--target-total-weight", type=float) - parser.add_argument( - "--target-total-weight-source", - choices=("preserve-input", "input", "baseline"), - default="preserve-input", - ) - parser.add_argument( - "--existing-candidate", - action="append", - help="Add a precomputed variant as label=/path/to/candidate.h5.", - ) - parser.add_argument("--policyengine-us-data-python") - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--batch-households", type=int) - parser.add_argument( - "--baseline-cache-dir", - default=str(_DEFAULT_PE_NATIVE_BASELINE_CACHE_DIR), - help="Pass an empty string to disable PE-native baseline estimate caching.", - ) - parser.add_argument( - "--skip-tax-expenditure-targets", - action="store_true", - ) - parser.add_argument( - "--target-scope-filter", - choices=("national", "state"), - help="Restrict PE-native optimization/scoring to a target scope.", - ) - parser.add_argument( - "--force", - action="store_true", - help="Regenerate optimized H5 variants even if outputs already exist.", - ) - args = parser.parse_args(argv) - - output_dir = Path(args.output_dir).expanduser() - output_path = ( - Path(args.output_path).expanduser() - if args.output_path - else output_dir / "pe_native_calibration_benchmark.json" - ) - written = write_policyengine_us_native_calibration_benchmark( - output_path, - input_dataset_path=args.input_dataset, - baseline_dataset_path=args.baseline_dataset, - output_dir=output_dir, - period=args.period, - 
l2_penalties=_parse_float_list(args.l2_penalties), - max_iter=args.max_iter, - tol=args.tol, - budget=args.budget, - target_total_weight=args.target_total_weight, - target_total_weight_source=args.target_total_weight_source, - existing_candidates=_parse_existing_candidates(args.existing_candidate), - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - batch_households=args.batch_households, - baseline_cache_dir=args.baseline_cache_dir or None, - skip_tax_expenditure_targets=args.skip_tax_expenditure_targets, - target_scope_filter=args.target_scope_filter, - force=args.force, - ) - print(str(written)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) - - -__all__ = [ - "CalibrationBenchmarkVariant", - "build_policyengine_us_native_calibration_benchmark", - "compute_household_weight_diagnostics", - "write_policyengine_us_native_calibration_benchmark", -] diff --git a/src/microplex_us/pipelines/pe_native_loss.py b/src/microplex_us/pipelines/pe_native_loss.py deleted file mode 100644 index 75d33c9b..00000000 --- a/src/microplex_us/pipelines/pe_native_loss.py +++ /dev/null @@ -1,362 +0,0 @@ -"""Shared PE-native robust loss helpers.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any - -import numpy as np - -PE_NATIVE_ROBUST_LOSS_METRIC = "pe_native_bucketed_baseline_huber_v1" -DEFAULT_BASELINE_WEIGHT_BETA = 1.0 -DEFAULT_BUCKET_EPSILON_FRACTION = 0.02 -DEFAULT_HUBER_DELTA = 1.0 - - -@dataclass(frozen=True) -class PENativeLossArrays: - """Per-target constants for the robust PE-native loss.""" - - target_names: tuple[str, ...] 
- target_values: np.ndarray - objective_target: np.ndarray - denominator: np.ndarray - target_weight: np.ndarray - bucket_keys: np.ndarray - unit_keys: np.ndarray - scope_keys: np.ndarray - family_keys: np.ndarray - epsilon: np.ndarray - beta: float - huber_delta: float - epsilon_fraction: float - bucket_weight_mode: str - - def metadata(self) -> dict[str, Any]: - unique_buckets, bucket_counts = np.unique( - self.bucket_keys, - return_counts=True, - ) - return { - "loss_metric": PE_NATIVE_ROBUST_LOSS_METRIC, - "loss_config": { - "baseline_weight_beta": float(self.beta), - "bucket_epsilon_fraction": float(self.epsilon_fraction), - "huber_delta": float(self.huber_delta), - "bucket_key": "scope_x_unit", - "bucket_weight_mode": self.bucket_weight_mode, - "residual": "(estimate - target) / (abs(target) + eps_bucket)", - "penalty": "huber", - }, - "loss_buckets": { - str(bucket): int(count) - for bucket, count in zip(unique_buckets, bucket_counts, strict=True) - }, - } - - def sidecar_rows(self) -> list[dict[str, Any]]: - return [ - { - "target_index": int(index), - "target_name": str(name), - "scope": str(self.scope_keys[index]), - "unit": str(self.unit_keys[index]), - "family": str(self.family_keys[index]), - "bucket": str(self.bucket_keys[index]), - "target_value": float(self.target_values[index]), - "objective_target": float(self.objective_target[index]), - "denominator": float(self.denominator[index]), - "epsilon": float(self.epsilon[index]), - "target_weight": float(self.target_weight[index]), - } - for index, name in enumerate(self.target_names) - ] - - -def build_pe_native_loss_arrays( - target_names: list[str] | tuple[str, ...] 
| np.ndarray, - target_values: np.ndarray, - *, - beta: float = DEFAULT_BASELINE_WEIGHT_BETA, - epsilon_fraction: float = DEFAULT_BUCKET_EPSILON_FRACTION, - huber_delta: float = DEFAULT_HUBER_DELTA, - bucket_weight_mode: str = "equal_bucket", -) -> PENativeLossArrays: - """Build constants for the bucketed, baseline-weighted Huber loss.""" - - names = tuple(str(name) for name in target_names) - targets = np.asarray(target_values, dtype=np.float64) - if targets.ndim != 1: - raise ValueError("target_values must be 1D") - if len(names) != targets.shape[0]: - raise ValueError("target_names and target_values length mismatch") - if targets.size == 0: - raise ValueError("PE-native loss requires at least one target") - if beta < 0.0: - raise ValueError("baseline-weight beta must be nonnegative") - if epsilon_fraction < 0.0: - raise ValueError("bucket epsilon fraction must be nonnegative") - if huber_delta <= 0.0: - raise ValueError("Huber delta must be positive") - if bucket_weight_mode != "equal_bucket": - raise ValueError("Only equal_bucket weighting is implemented") - - scopes = np.asarray( - [infer_pe_native_target_scope(name) for name in names], dtype=object - ) - units = np.asarray( - [infer_pe_native_target_unit(name) for name in names], dtype=object - ) - families = np.asarray( - [classify_pe_native_target_family(name) for name in names], dtype=object - ) - buckets = np.asarray( - [f"{scope}:{unit}" for scope, unit in zip(scopes, units, strict=True)], - dtype=object, - ) - abs_targets = np.abs(targets) - epsilon = np.zeros_like(targets, dtype=np.float64) - target_weight = np.zeros_like(targets, dtype=np.float64) - unique_buckets = sorted({str(bucket) for bucket in buckets}) - bucket_budget = 1.0 / float(len(unique_buckets)) - for bucket in unique_buckets: - mask = buckets == bucket - bucket_targets = abs_targets[mask] - nonzero_targets = bucket_targets[bucket_targets > 0.0] - median_target = ( - float(np.median(nonzero_targets)) if nonzero_targets.size else 1.0 - ) 
- bucket_epsilon = max(float(epsilon_fraction) * median_target, 1e-12) - epsilon[mask] = bucket_epsilon - baseline_importance = np.power(bucket_targets + bucket_epsilon, beta) - total_importance = float(baseline_importance.sum()) - if total_importance <= 0.0 or not np.isfinite(total_importance): - target_weight[mask] = bucket_budget / float(mask.sum()) - else: - target_weight[mask] = bucket_budget * baseline_importance / total_importance - - denominator = abs_targets + epsilon - return PENativeLossArrays( - target_names=names, - target_values=targets, - objective_target=targets.astype(np.float64, copy=True), - denominator=denominator, - target_weight=target_weight, - bucket_keys=buckets, - unit_keys=units, - scope_keys=scopes, - family_keys=families, - epsilon=epsilon, - beta=float(beta), - huber_delta=float(huber_delta), - epsilon_fraction=float(epsilon_fraction), - bucket_weight_mode=bucket_weight_mode, - ) - - -def pe_native_huber_loss_terms( - estimate: np.ndarray, - loss_arrays: PENativeLossArrays, -) -> np.ndarray: - rel = pe_native_relative_error(estimate, loss_arrays) - return loss_arrays.target_weight * huber_value(rel, loss_arrays.huber_delta) - - -def pe_native_huber_loss( - estimate: np.ndarray, - loss_arrays: PENativeLossArrays, -) -> float: - return float(pe_native_huber_loss_terms(estimate, loss_arrays).sum()) - - -def pe_native_huber_gradient_factor( - estimate: np.ndarray, - loss_arrays: PENativeLossArrays, -) -> np.ndarray: - rel = pe_native_relative_error(estimate, loss_arrays) - return ( - loss_arrays.target_weight - * huber_derivative(rel, loss_arrays.huber_delta) - / loss_arrays.denominator - ) - - -def pe_native_relative_error( - estimate: np.ndarray, - loss_arrays: PENativeLossArrays, -) -> np.ndarray: - estimate_array = np.asarray(estimate, dtype=np.float64) - if estimate_array.shape != loss_arrays.objective_target.shape: - raise ValueError("estimate and target shapes differ") - return (estimate_array - loss_arrays.objective_target) / 
loss_arrays.denominator - - -def huber_value(values: np.ndarray, delta: float) -> np.ndarray: - values_array = np.asarray(values, dtype=np.float64) - abs_values = np.abs(values_array) - return np.where( - abs_values <= delta, - 0.5 * np.square(values_array), - delta * (abs_values - 0.5 * delta), - ) - - -def huber_derivative(values: np.ndarray, delta: float) -> np.ndarray: - return np.clip(np.asarray(values, dtype=np.float64), -delta, delta) - - -def infer_pe_native_target_scope(target_name: str) -> str: - if target_name.startswith("nation/"): - return "national" - return "state" - - -def infer_pe_native_target_unit(target_name: str) -> str: - normalized = target_name.lower().replace("-", "_") - parts = normalized.split("/") - if normalized.endswith("/snap_hhs") or normalized.endswith("/snap-hhs"): - return "households" - if any(part in {"amount", "total"} for part in parts): - return "dollars" - if ( - len(parts) >= 3 - and parts[0] == "nation" - and parts[1] == "cbo" - and parts[2] == "income_by_source" - ): - return "dollars" - if any(part in {"count", "returns", "filers"} for part in parts): - return "returns" - if "spending" in normalized or "cost" in normalized or "tax" in normalized: - return "dollars" - if "net_worth" in normalized or "income" in normalized: - return "dollars" - if "enrollment" in normalized or "population" in normalized: - return "people" - if "/age/" in normalized or "population_by_age" in normalized: - return "people" - if "household" in normalized or "hhs" in normalized: - return "households" - return "other" - - -def classify_pe_native_target_family(target_name: str) -> str: - """Classify one PE target name into broad diagnostic families.""" - - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return 
"state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return "state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def loss_arrays_from_inputs(loss_inputs: dict[str, Any]) -> PENativeLossArrays | None: - metadata = dict(loss_inputs.get("metadata") or {}) - if metadata.get("loss_metric") != PE_NATIVE_ROBUST_LOSS_METRIC: - return None - target_names = tuple(str(name) for name in metadata.get("target_names", ())) - return PENativeLossArrays( - target_names=target_names, - target_values=np.asarray(loss_inputs["unscaled_target"], dtype=np.float64), - objective_target=np.asarray(loss_inputs["scaled_target"], dtype=np.float64), - 
denominator=np.asarray(loss_inputs["loss_denominator"], dtype=np.float64), - target_weight=np.asarray(loss_inputs["loss_target_weight"], dtype=np.float64), - bucket_keys=np.asarray(loss_inputs["loss_bucket"], dtype=object), - unit_keys=np.asarray(loss_inputs["loss_unit"], dtype=object), - scope_keys=np.asarray(loss_inputs["loss_scope"], dtype=object), - family_keys=np.asarray(loss_inputs["loss_family"], dtype=object), - epsilon=np.asarray(loss_inputs["loss_epsilon"], dtype=np.float64), - beta=float(metadata.get("loss_config", {}).get("baseline_weight_beta", 1.0)), - huber_delta=float(metadata.get("loss_config", {}).get("huber_delta", 1.0)), - epsilon_fraction=float( - metadata.get("loss_config", {}).get("bucket_epsilon_fraction", 0.02) - ), - bucket_weight_mode=str( - metadata.get("loss_config", {}).get("bucket_weight_mode", "equal_bucket") - ), - ) - - -def subset_loss_arrays( - loss_arrays: PENativeLossArrays, - mask: np.ndarray, -) -> PENativeLossArrays: - mask_array = np.asarray(mask, dtype=bool) - if mask_array.shape != loss_arrays.objective_target.shape: - raise ValueError("loss-array mask shape mismatch") - return PENativeLossArrays( - target_names=tuple( - name - for name, keep in zip(loss_arrays.target_names, mask_array, strict=True) - if keep - ), - target_values=loss_arrays.target_values[mask_array], - objective_target=loss_arrays.objective_target[mask_array], - denominator=loss_arrays.denominator[mask_array], - target_weight=loss_arrays.target_weight[mask_array], - bucket_keys=loss_arrays.bucket_keys[mask_array], - unit_keys=loss_arrays.unit_keys[mask_array], - scope_keys=loss_arrays.scope_keys[mask_array], - family_keys=loss_arrays.family_keys[mask_array], - epsilon=loss_arrays.epsilon[mask_array], - beta=loss_arrays.beta, - huber_delta=loss_arrays.huber_delta, - epsilon_fraction=loss_arrays.epsilon_fraction, - bucket_weight_mode=loss_arrays.bucket_weight_mode, - ) - - -__all__ = [ - "DEFAULT_BASELINE_WEIGHT_BETA", - "DEFAULT_BUCKET_EPSILON_FRACTION", 
- "DEFAULT_HUBER_DELTA", - "PE_NATIVE_ROBUST_LOSS_METRIC", - "PENativeLossArrays", - "build_pe_native_loss_arrays", - "classify_pe_native_target_family", - "huber_derivative", - "huber_value", - "infer_pe_native_target_scope", - "infer_pe_native_target_unit", - "loss_arrays_from_inputs", - "pe_native_huber_gradient_factor", - "pe_native_huber_loss", - "pe_native_huber_loss_terms", - "pe_native_relative_error", - "subset_loss_arrays", -] diff --git a/src/microplex_us/pipelines/pe_native_optimization.py b/src/microplex_us/pipelines/pe_native_optimization.py deleted file mode 100644 index eb21126d..00000000 --- a/src/microplex_us/pipelines/pe_native_optimization.py +++ /dev/null @@ -1,751 +0,0 @@ -"""Direct PE-native weight optimization for exported PolicyEngine US datasets.""" - -from __future__ import annotations - -import json -import math -import shutil -import subprocess -from collections.abc import Callable -from dataclasses import dataclass -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any - -import h5py -import numpy as np - -from microplex_us.pipelines.pe_native_loss import ( - PENativeLossArrays, - loss_arrays_from_inputs, - pe_native_huber_gradient_factor, - pe_native_huber_loss, -) -from microplex_us.pipelines.pe_native_scores import ( - _ENHANCED_CPS_BAD_TARGETS, - build_policyengine_us_data_subprocess_env, - resolve_policyengine_us_data_repo_root, -) - -_PE_NATIVE_BROAD_MATRIX_SCRIPT = """ -import hashlib -import json -import shutil -import sys -import tempfile -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -import policyengine_us_data.utils.loss as loss_utils - - -def patch_policyengine_us_data_uprating_aliases(): - import policyengine_us_data.utils.soi as soi_utils - - original = soi_utils.create_policyengine_uprating_factors_table - 
- def patched_create_policyengine_uprating_factors_table(*args, **kwargs): - table = original(*args, **kwargs) - if ( - "employment_income" not in table.index - and "employment_income_before_lsr" in table.index - ): - table.loc["employment_income"] = table.loc[ - "employment_income_before_lsr" - ] - return table - - soi_utils.create_policyengine_uprating_factors_table = ( - patched_create_policyengine_uprating_factors_table - ) - - -patch_policyengine_us_data_uprating_aliases() - - -def load_microplex_loss_helpers(): - import importlib.util - - for entry in sys.path: - candidate = Path(entry) / "microplex_us" / "pipelines" / "pe_native_loss.py" - if not candidate.exists(): - continue - spec = importlib.util.spec_from_file_location( - "microplex_us_pe_native_loss_standalone", - candidate, - ) - if spec is None or spec.loader is None: - continue - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module - raise ModuleNotFoundError("Could not load microplex_us pe_native_loss helper") - - -_LOSS_HELPERS = load_microplex_loss_helpers() -build_pe_native_loss_arrays = _LOSS_HELPERS.build_pe_native_loss_arrays -pe_native_huber_loss = _LOSS_HELPERS.pe_native_huber_loss - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -DATASET_PATH = sys.argv[4] -if len(sys.argv) >= 7: - SKIP_TAX_EXPENDITURE_TARGETS = sys.argv[5] == "1" - OUTPUT_PREFIX = Path(sys.argv[6]) -else: - SKIP_TAX_EXPENDITURE_TARGETS = False - OUTPUT_PREFIX = Path(sys.argv[5]) -TARGET_SCOPE_FILTER = sys.argv[7] if len(sys.argv) >= 8 and sys.argv[7] else None -TARGET_DB_PATH = ( - Path(sys.argv[8]).expanduser().resolve() - if len(sys.argv) >= 9 and sys.argv[8] - else None -) -_TARGET_DB_TEMP_DIR = None - - -def sha256_path(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - 
-def patch_policyengine_us_data_target_db(target_db_path: Path | None): - if target_db_path is None: - return None - if not target_db_path.exists(): - raise FileNotFoundError(f"PolicyEngine target DB not found: {target_db_path}") - global _TARGET_DB_TEMP_DIR - _TARGET_DB_TEMP_DIR = tempfile.TemporaryDirectory( - prefix="microplex-pe-target-db-" - ) - temp_storage = Path(_TARGET_DB_TEMP_DIR.name) - calibration_dir = temp_storage / "calibration" - calibration_dir.mkdir(parents=True, exist_ok=True) - shutil.copy2(target_db_path, calibration_dir / "policy_data.db") - loss_utils.STORAGE_FOLDER = temp_storage - return { - "path": str(target_db_path), - "sha256": sha256_path(target_db_path), - "size_bytes": int(target_db_path.stat().st_size), - } - - -TARGET_DB_DESCRIPTOR = patch_policyengine_us_data_target_db(TARGET_DB_PATH) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def scope_keep_mask(target_names): - if TARGET_SCOPE_FILTER is None: - return np.ones(target_names.shape, dtype=bool) - if TARGET_SCOPE_FILTER == "national": - return np.asarray( - [str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - if TARGET_SCOPE_FILTER == "state": - return np.asarray( - [not str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - raise ValueError(f"Unsupported target scope filter: {TARGET_SCOPE_FILTER}") - - -dataset_cls = dataset_from_path( - DATASET_PATH, - Path(DATASET_PATH).stem.replace("-", "_"), -) -loss_matrix, targets_array = loss_utils.build_loss_matrix(dataset_cls, PERIOD) -target_names = np.asarray(loss_matrix.columns) -zero_mask = np.isclose(targets_array, 0.0, atol=0.1) -bad_mask = np.isin(target_names, BAD_TARGETS) -keep_mask = ~(zero_mask | bad_mask) & scope_keep_mask(target_names) - -filtered = 
loss_matrix.loc[:, keep_mask] -filtered_targets = np.asarray(targets_array[keep_mask], dtype=np.float64) -if filtered_targets.size == 0: - raise ValueError("PE-native loss matrix has no targets after filtering") -is_national = np.asarray(filtered.columns.str.startswith("nation/"), dtype=bool) -n_national = int(is_national.sum()) -n_state = int((~is_national).sum()) - -loss_arrays = build_pe_native_loss_arrays( - filtered.columns.tolist(), - filtered_targets, -) -matrix = filtered.to_numpy(dtype=np.float64) -target = loss_arrays.objective_target - -sim = Microsimulation(dataset=dataset_cls) -sim.default_calculation_period = PERIOD -weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, -).values.astype(np.float64) - -np.save(OUTPUT_PREFIX.with_suffix(".matrix.npy"), matrix) -np.save(OUTPUT_PREFIX.with_suffix(".target.npy"), target) -np.save(OUTPUT_PREFIX.with_suffix(".weights.npy"), weights) -np.save(OUTPUT_PREFIX.with_suffix(".target_unscaled.npy"), filtered_targets) -np.save(OUTPUT_PREFIX.with_suffix(".loss_denominator.npy"), loss_arrays.denominator) -np.save(OUTPUT_PREFIX.with_suffix(".loss_target_weight.npy"), loss_arrays.target_weight) -np.save(OUTPUT_PREFIX.with_suffix(".loss_bucket.npy"), loss_arrays.bucket_keys) -np.save(OUTPUT_PREFIX.with_suffix(".loss_unit.npy"), loss_arrays.unit_keys) -np.save(OUTPUT_PREFIX.with_suffix(".loss_scope.npy"), loss_arrays.scope_keys) -np.save(OUTPUT_PREFIX.with_suffix(".loss_family.npy"), loss_arrays.family_keys) -np.save(OUTPUT_PREFIX.with_suffix(".loss_epsilon.npy"), loss_arrays.epsilon) -with open(OUTPUT_PREFIX.with_suffix(".meta.json"), "w") as handle: - json.dump( - { - **loss_arrays.metadata(), - "target_names": filtered.columns.tolist(), - "target_loss_metadata": loss_arrays.sidecar_rows(), - "n_targets_total": int(len(target_names)), - "n_targets_kept": int(keep_mask.sum()), - "n_targets_zero_dropped": int(zero_mask.sum()), - "n_targets_bad_dropped": int(bad_mask.sum()), - 
"n_national_targets": n_national, - "n_state_targets": n_state, - "target_scope_filter": TARGET_SCOPE_FILTER, - "policyengine_targets_db": TARGET_DB_DESCRIPTOR, - "weight_sum": float(weights.sum()), - "candidate_loss_before": float( - pe_native_huber_loss(matrix.T @ weights, loss_arrays) - ), - }, - handle, - sort_keys=True, - ) -""".strip() - - -@dataclass(frozen=True) -class PolicyEngineUSNativeWeightOptimizationResult: - metric: str - period: int - input_dataset: str - output_dataset: str - initial_loss: float - optimized_loss: float - loss_delta: float - initial_weight_sum: float - optimized_weight_sum: float - household_count: int - positive_household_count: int - budget: int | None - converged: bool - iterations: int - target_names: tuple[str, ...] - - def to_dict(self) -> dict[str, Any]: - return { - "metric": self.metric, - "period": self.period, - "input_dataset": self.input_dataset, - "output_dataset": self.output_dataset, - "initial_loss": self.initial_loss, - "optimized_loss": self.optimized_loss, - "loss_delta": self.loss_delta, - "initial_weight_sum": self.initial_weight_sum, - "optimized_weight_sum": self.optimized_weight_sum, - "household_count": self.household_count, - "positive_household_count": self.positive_household_count, - "budget": self.budget, - "converged": self.converged, - "iterations": self.iterations, - "target_names": list(self.target_names), - } - - -def _project_to_simplex(values: np.ndarray, total: float) -> np.ndarray: - """Project onto {x >= 0, sum x = total}.""" - if total < 0: - raise ValueError("total must be nonnegative") - if len(values) == 0: - return values.copy() - clipped = np.maximum(values.astype(np.float64, copy=False), 0.0) - current_sum = float(clipped.sum()) - if np.isclose(current_sum, total, rtol=0.0, atol=1e-6): - return clipped - if total <= 0.0: - return np.zeros_like(clipped) - - u = np.sort(clipped)[::-1] - cssv = np.cumsum(u) - total - rho_candidates = u - cssv / np.arange(1, len(u) + 1) > 0 - if not 
np.any(rho_candidates): - projected = np.zeros_like(clipped) - projected[np.argmax(clipped)] = total - return projected - rho = int(np.nonzero(rho_candidates)[0][-1]) - theta = cssv[rho] / float(rho + 1) - return np.maximum(clipped - theta, 0.0) - - -def _project_to_budget_simplex( - values: np.ndarray, - total: float, - budget: int | None, -) -> np.ndarray: - if budget is None or budget >= len(values): - return _project_to_simplex(values, total) - if budget <= 0: - raise ValueError("budget must be positive when provided") - projected = np.zeros_like(values, dtype=np.float64) - top_idx = np.argpartition(values, -budget)[-budget:] - projected[top_idx] = _project_to_simplex(values[top_idx], total) - return projected - - -def _estimate_quadratic_lipschitz(matrix: np.ndarray, l2_penalty: float) -> float: - if matrix.size == 0: - return max(2.0 * l2_penalty, 1.0) - n_households = matrix.shape[0] - vector = np.ones(n_households, dtype=np.float64) - vector /= np.linalg.norm(vector) - for _ in range(25): - transformed = matrix @ (matrix.T @ vector) - norm = np.linalg.norm(transformed) - if norm < 1e-12: - return max(2.0 * l2_penalty, 1.0) - vector = transformed / norm - transformed = matrix @ (matrix.T @ vector) - eigenvalue = float(np.dot(vector, transformed)) - return max(2.0 * eigenvalue + 2.0 * l2_penalty, 1e-6) - - -def optimize_pe_native_loss_weights( - *, - scaled_matrix: np.ndarray, - scaled_target: np.ndarray, - initial_weights: np.ndarray, - loss_arrays: PENativeLossArrays | None = None, - budget: int | None = None, - max_iter: int = 200, - l2_penalty: float = 0.0, - tol: float = 1e-8, - target_total_weight: float | None = None, - history_callback: Callable[[int, np.ndarray, float], None] | None = None, -) -> tuple[np.ndarray, dict[str, Any]]: - """Optimize nonnegative household weights directly on the PE-native loss matrix. 
- - Algorithm: **projected (proximal) gradient descent** on the least-squares - PE-native loss ``||matrix.T @ w - target||^2`` (plus an optional L2 term - toward the initial weights). Each iteration takes a gradient step and - projects onto the nonnegative budget simplex (``_project_to_budget_simplex``) - with a Lipschitz-derived step size, backtracking line search, and - monotone-descent acceptance. - - NOTE on naming: commit history calls this the "APG refit", but it is **not** - Nesterov-accelerated — there is no momentum/extrapolation term, so it is - plain projected GD, not accelerated proximal gradient (APG). It is also - distinct from the dataset-build weight calibration (microcalibrate / - entropy / pe_l0); this routine is used for the eCPS-replacement PE-native - *refit and scoring* (symmetric refit). See docs/calibrator-decision.md. - - If *target_total_weight* is provided, the simplex projection targets that - total instead of the initial weight sum. This allows the optimizer to - rescale the weight budget (e.g. to match a known population total) while - simultaneously redistributing weights to minimise the PE-native loss. 
- """ - matrix = np.asarray(scaled_matrix, dtype=np.float64) - target = np.asarray(scaled_target, dtype=np.float64) - weights0 = np.asarray(initial_weights, dtype=np.float64) - if matrix.ndim != 2: - raise ValueError("scaled_matrix must be 2D") - if target.ndim != 1 or target.shape[0] != matrix.shape[1]: - raise ValueError("scaled_target must match scaled_matrix target dimension") - if weights0.ndim != 1 or weights0.shape[0] != matrix.shape[0]: - raise ValueError("initial_weights must match scaled_matrix household dimension") - if ( - loss_arrays is not None - and loss_arrays.objective_target.shape[0] != matrix.shape[1] - ): - raise ValueError("loss_arrays target dimension must match scaled_matrix") - - initial_weight_sum = float(weights0.sum()) - total_weight = ( - float(target_total_weight) - if target_total_weight is not None - else initial_weight_sum - ) - weights = _project_to_budget_simplex(weights0, total_weight, budget) - initial_reference = weights.copy() - if loss_arrays is None: - lipschitz_matrix = matrix - else: - lipschitz_scale = np.sqrt(loss_arrays.target_weight) / loss_arrays.denominator - lipschitz_matrix = matrix * lipschitz_scale[np.newaxis, :] - lipschitz = _estimate_quadratic_lipschitz(lipschitz_matrix, l2_penalty) - step_size = 1.0 / lipschitz - - def objective(candidate: np.ndarray) -> float: - estimate = matrix.T @ candidate - if loss_arrays is None: - residual = estimate - target - base = float(np.dot(residual, residual)) - else: - base = pe_native_huber_loss(estimate, loss_arrays) - if l2_penalty > 0.0: - delta = candidate - initial_reference - base += float(l2_penalty * np.dot(delta, delta)) - return base - - current_loss = objective(weights) - loss_history: list[dict[str, float | int]] = [ - { - "iteration": 0, - "objective_loss": float(current_loss), - "weight_sum": float(weights.sum()), - "positive_household_count": int((weights > 1e-9).sum()), - } - ] - if history_callback is not None: - history_callback(0, weights, current_loss) - 
converged = False - completed_iter = 0 - total_backtracking_steps = 0 - momentum = 1.0 - search_weights = weights.copy() - min_step_size = step_size * 1e-12 - max_step_size = step_size * 1e8 - - def gradient_at(candidate: np.ndarray) -> np.ndarray: - estimate = matrix.T @ candidate - if loss_arrays is None: - residual = estimate - target - gradient = 2.0 * (matrix @ residual) - else: - gradient = matrix @ pe_native_huber_gradient_factor( - estimate, - loss_arrays, - ) - if l2_penalty > 0.0: - gradient += 2.0 * l2_penalty * (candidate - initial_reference) - return gradient - - for iteration in range(1, max_iter + 1): - gradient = gradient_at(search_weights) - completed_iter = iteration - - candidate = weights - candidate_loss = current_loss - accepted_descent_step = False - iteration_step_size = step_size - accepted_backtrack = 0 - - for start, start_gradient in ( - (search_weights, gradient), - (weights, None), - ): - if accepted_descent_step: - break - if start_gradient is None: - start_gradient = gradient_at(start) - iteration_step_size = step_size - for backtrack in range(40): - trial = _project_to_budget_simplex( - start - iteration_step_size * start_gradient, - total_weight, - budget, - ) - trial_loss = objective(trial) - if trial_loss <= current_loss: - candidate = trial - candidate_loss = trial_loss - accepted_descent_step = True - accepted_backtrack = backtrack - total_backtracking_steps += backtrack - break - iteration_step_size *= 0.5 - if iteration_step_size < min_step_size: - break - if not accepted_descent_step: - momentum = 1.0 - search_weights = weights.copy() - if not accepted_descent_step: - converged = True - break - - improvement = current_loss - candidate_loss - previous_weights = weights - weights = candidate - current_loss = candidate_loss - if accepted_backtrack == 0: - step_size = min(iteration_step_size * 1.25, max_step_size) - else: - step_size = iteration_step_size - next_momentum = 0.5 * (1.0 + math.sqrt(1.0 + 4.0 * momentum**2)) - 
extrapolated = weights + ((momentum - 1.0) / next_momentum) * ( - weights - previous_weights - ) - search_weights = _project_to_budget_simplex( - extrapolated, - total_weight, - budget, - ) - if float(np.dot(weights - previous_weights, search_weights - weights)) > 0.0: - next_momentum = 1.0 - search_weights = weights.copy() - momentum = next_momentum - loss_history.append( - { - "iteration": int(iteration), - "objective_loss": float(current_loss), - "weight_sum": float(weights.sum()), - "positive_household_count": int((weights > 1e-9).sum()), - "step_size": float(step_size), - "backtracking_steps": int(accepted_backtrack), - } - ) - if history_callback is not None: - history_callback(iteration, weights, current_loss) - if improvement < tol * max(1.0, current_loss): - converged = True - break - - summary = { - "initial_loss": float(objective(initial_reference)), - "optimized_loss": float(current_loss), - "loss_delta": float(current_loss - objective(initial_reference)), - "initial_weight_sum": initial_weight_sum, - "target_total_weight": total_weight, - "optimized_weight_sum": float(weights.sum()), - "household_count": int(len(weights)), - "positive_household_count": int((weights > 1e-9).sum()), - "budget": None if budget is None else int(budget), - "iterations": int(completed_iter), - "converged": bool(converged), - "method": "monotone_accelerated_projected_gradient", - "step_size": float(step_size), - "initial_step_size": float(1.0 / lipschitz), - "line_search_backtracking_steps": int(total_backtracking_steps), - "loss_history": loss_history, - } - return weights, summary - - -def rewrite_policyengine_us_dataset_weights( - *, - input_dataset_path: str | Path, - output_dataset_path: str | Path, - household_weights: np.ndarray, - period: int = 2024, -) -> Path: - """Copy a TIME_PERIOD_ARRAYS H5 and replace all exported weight arrays.""" - source = Path(input_dataset_path).expanduser().resolve() - output = Path(output_dataset_path).expanduser().resolve() - if source 
!= output: - shutil.copy2(source, output) - - period_key = str(period) - weights = np.asarray(household_weights, dtype=np.float32) - with h5py.File(output, "r+") as handle: - household_ids = handle["household_id"][period_key][:] - if len(household_ids) != len(weights): - raise ValueError( - "household_weights length does not match household_id array" - ) - household_map = { - int(household_id): float(weight) - for household_id, weight in zip(household_ids, weights, strict=True) - } - handle["household_weight"][period_key][...] = weights - - if "person_weight" in handle and "person_household_id" in handle: - person_households = handle["person_household_id"][period_key][:] - person_weights = np.array( - [ - household_map[int(household_id)] - for household_id in person_households - ], - dtype=np.float32, - ) - handle["person_weight"][period_key][...] = person_weights - - person_households = ( - handle["person_household_id"][period_key][:] - if "person_household_id" in handle - else None - ) - for group in ("tax_unit", "spm_unit", "family", "marital_unit"): - group_weight_name = f"{group}_weight" - group_id_name = f"{group}_id" - person_group_name = f"person_{group}_id" - if ( - group_weight_name not in handle - or group_id_name not in handle - or person_group_name not in handle - or person_households is None - # Production datasets (e.g. the published enhanced CPS) may - # leave derived entity-weight groups empty because PolicyEngine - # computes those weights from household_weight at runtime. The - # group then exists without a value for this period, so skip - # propagation rather than raising KeyError on [period_key]. 
- or period_key not in handle[group_weight_name] - ): - continue - person_group_ids = handle[person_group_name][period_key][:] - group_to_household: dict[int, int] = {} - for group_id, household_id in zip( - person_group_ids, - person_households, - strict=True, - ): - group_to_household.setdefault(int(group_id), int(household_id)) - group_ids = handle[group_id_name][period_key][:] - group_weights = np.array( - [ - household_map[group_to_household[int(group_id)]] - for group_id in group_ids - ], - dtype=np.float32, - ) - handle[group_weight_name][period_key][...] = group_weights - return output - - -def optimize_policyengine_us_native_loss_dataset( - *, - input_dataset_path: str | Path, - output_dataset_path: str | Path, - period: int = 2024, - budget: int | None = None, - max_iter: int = 200, - l2_penalty: float = 0.0, - tol: float = 1e-8, - target_total_weight: float | None = None, - policyengine_us_data_repo: str | Path | None = None, -) -> PolicyEngineUSNativeWeightOptimizationResult: - """Optimize household weights of an exported PE-US dataset on the broad native loss.""" - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - with TemporaryDirectory(prefix="microplex-us-pe-native-opt-") as temp_dir: - prefix = Path(temp_dir) / "pe_native_matrix" - completed = subprocess.run( - [ - "uv", - "run", - "--project", - str(resolved_repo), - "python", - "-c", - _PE_NATIVE_BROAD_MATRIX_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(input_dataset_path).expanduser().resolve()), - str(prefix), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - detail = ( - completed.stderr.strip() - or completed.stdout.strip() - or str(completed.returncode) - ) - raise RuntimeError(f"PE-native loss-matrix extraction failed: {detail}") - - scaled_matrix = 
np.load(prefix.with_suffix(".matrix.npy")) - scaled_target = np.load(prefix.with_suffix(".target.npy")) - initial_weights = np.load(prefix.with_suffix(".weights.npy")) - metadata = json.loads(prefix.with_suffix(".meta.json").read_text()) - loss_inputs = { - "scaled_matrix": scaled_matrix, - "scaled_target": scaled_target, - "initial_weights": initial_weights, - "unscaled_target": np.load(prefix.with_suffix(".target_unscaled.npy")), - "loss_denominator": np.load(prefix.with_suffix(".loss_denominator.npy")), - "loss_target_weight": np.load( - prefix.with_suffix(".loss_target_weight.npy") - ), - "loss_bucket": np.load( - prefix.with_suffix(".loss_bucket.npy"), allow_pickle=True - ), - "loss_unit": np.load( - prefix.with_suffix(".loss_unit.npy"), allow_pickle=True - ), - "loss_scope": np.load( - prefix.with_suffix(".loss_scope.npy"), allow_pickle=True - ), - "loss_family": np.load( - prefix.with_suffix(".loss_family.npy"), allow_pickle=True - ), - "loss_epsilon": np.load(prefix.with_suffix(".loss_epsilon.npy")), - "metadata": metadata, - } - loss_arrays = loss_arrays_from_inputs(loss_inputs) - - optimized_weights, summary = optimize_pe_native_loss_weights( - scaled_matrix=scaled_matrix, - scaled_target=scaled_target, - initial_weights=initial_weights, - loss_arrays=loss_arrays, - budget=budget, - max_iter=max_iter, - l2_penalty=l2_penalty, - tol=tol, - target_total_weight=target_total_weight, - ) - rewritten = rewrite_policyengine_us_dataset_weights( - input_dataset_path=input_dataset_path, - output_dataset_path=output_dataset_path, - household_weights=optimized_weights, - period=period, - ) - return PolicyEngineUSNativeWeightOptimizationResult( - metric=str( - metadata.get( - "loss_metric", - "enhanced_cps_native_loss_weight_optimization", - ) - ), - period=int(period), - input_dataset=str(Path(input_dataset_path).expanduser().resolve()), - output_dataset=str(rewritten), - initial_loss=float(summary["initial_loss"]), - optimized_loss=float(summary["optimized_loss"]), 
- loss_delta=float(summary["loss_delta"]), - initial_weight_sum=float(summary["initial_weight_sum"]), - optimized_weight_sum=float(summary["optimized_weight_sum"]), - household_count=int(summary["household_count"]), - positive_household_count=int(summary["positive_household_count"]), - budget=summary["budget"], - converged=bool(summary["converged"]), - iterations=int(summary["iterations"]), - target_names=tuple(metadata["target_names"]), - ) - - -__all__ = [ - "PolicyEngineUSNativeWeightOptimizationResult", - "optimize_pe_native_loss_weights", - "optimize_policyengine_us_native_loss_dataset", - "rewrite_policyengine_us_dataset_weights", -] diff --git a/src/microplex_us/pipelines/pe_native_scores.py b/src/microplex_us/pipelines/pe_native_scores.py deleted file mode 100644 index 16c1e572..00000000 --- a/src/microplex_us/pipelines/pe_native_scores.py +++ /dev/null @@ -1,3826 +0,0 @@ -"""PolicyEngine-native scoring helpers for US Microplex artifacts.""" - -from __future__ import annotations - -import argparse -import json -import os -import re -import subprocess -import sys -from dataclasses import dataclass, field -from pathlib import Path -from time import perf_counter -from typing import Any - -_DEFAULT_PE_US_DATA_REPO = Path.home() / "PolicyEngine" / "policyengine-us-data" -_PE_US_DATA_PYTHON_ENV = "MICROPLEX_US_POLICYENGINE_US_DATA_PYTHON" -_PE_US_DATA_REPO_ENV = "MICROPLEX_US_POLICYENGINE_US_DATA_REPO" -_PE_NATIVE_SCORE_BASE_ENV_VARS: tuple[str, ...] = ( - "HOME", - "PATH", - "TMPDIR", - "LANG", - "LC_ALL", - "TZ", -) -_EITC_AGI_CHILD_DOMAIN_VARIABLE = "adjusted_gross_income,eitc,eitc_child_count" -_EITC_AGI_CHILD_LABEL = re.compile( - r"^nation/irs/eitc/(?Preturns|amount)/" - r"c(?P\d+)_(?P[^_]+)_(?P[^/]+)$" -) - -_ENHANCED_CPS_BAD_TARGETS: tuple[str, ...] 
= ( - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "state/RI/adjusted_gross_income/amount/-inf_1", - "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", -) - -_PE_NATIVE_BROAD_SCORE_SCRIPT = """ -import hashlib -import json -import shutil -import sys -import tempfile -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -import policyengine_us_data.utils.loss as loss_utils - - -def patch_policyengine_us_data_uprating_aliases(): - import policyengine_us_data.utils.soi as soi_utils - - original = soi_utils.create_policyengine_uprating_factors_table - - def patched_create_policyengine_uprating_factors_table(*args, **kwargs): - table = original(*args, **kwargs) - if ( - "employment_income" not in table.index - and "employment_income_before_lsr" in table.index - ): - table.loc["employment_income"] = table.loc[ - "employment_income_before_lsr" - ] - return table - - soi_utils.create_policyengine_uprating_factors_table = ( - patched_create_policyengine_uprating_factors_table - ) - - -patch_policyengine_us_data_uprating_aliases() - - -def load_microplex_loss_helpers(): - import importlib.util - - for entry in sys.path: - candidate = 
Path(entry) / "microplex_us" / "pipelines" / "pe_native_loss.py" - if not candidate.exists(): - continue - spec = importlib.util.spec_from_file_location( - "microplex_us_pe_native_loss_standalone", - candidate, - ) - if spec is None or spec.loader is None: - continue - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module - raise ModuleNotFoundError("Could not load microplex_us pe_native_loss helper") - - -_LOSS_HELPERS = load_microplex_loss_helpers() -PE_NATIVE_ROBUST_LOSS_METRIC = _LOSS_HELPERS.PE_NATIVE_ROBUST_LOSS_METRIC -build_pe_native_loss_arrays = _LOSS_HELPERS.build_pe_native_loss_arrays -pe_native_huber_loss_terms = _LOSS_HELPERS.pe_native_huber_loss_terms -pe_native_relative_error = _LOSS_HELPERS.pe_native_relative_error - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -CANDIDATE_DATASET = sys.argv[4] -BASELINE_DATASET = sys.argv[5] -TARGET_SCOPE_FILTER = sys.argv[6] if len(sys.argv) >= 7 and sys.argv[6] else None -TARGET_DB_PATH = ( - Path(sys.argv[7]).expanduser().resolve() - if len(sys.argv) >= 8 and sys.argv[7] - else None -) -_TARGET_DB_TEMP_DIR = None - - -def sha256_path(path: Path) -> str: - digest = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - digest.update(chunk) - return digest.hexdigest() - - -def patch_policyengine_us_data_target_db(target_db_path: Path | None): - if target_db_path is None: - return None - if not target_db_path.exists(): - raise FileNotFoundError(f"PolicyEngine target DB not found: {target_db_path}") - global _TARGET_DB_TEMP_DIR - _TARGET_DB_TEMP_DIR = tempfile.TemporaryDirectory( - prefix="microplex-pe-target-db-" - ) - temp_storage = Path(_TARGET_DB_TEMP_DIR.name) - calibration_dir = temp_storage / "calibration" - calibration_dir.mkdir(parents=True, exist_ok=True) - shutil.copy2(target_db_path, calibration_dir / "policy_data.db") - 
loss_utils.STORAGE_FOLDER = temp_storage - return { - "path": str(target_db_path), - "sha256": sha256_path(target_db_path), - "size_bytes": int(target_db_path.stat().st_size), - } - - -TARGET_DB_DESCRIPTOR = patch_policyengine_us_data_target_db(TARGET_DB_PATH) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def scope_keep_mask(target_names): - if TARGET_SCOPE_FILTER is None: - return np.ones(target_names.shape, dtype=bool) - if TARGET_SCOPE_FILTER == "national": - return np.asarray( - [str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - if TARGET_SCOPE_FILTER == "state": - return np.asarray( - [not str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - raise ValueError(f"Unsupported target scope filter: {TARGET_SCOPE_FILTER}") - - -def classify_target_family(target_name: str) -> str: - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return "state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return 
"state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def build_family_breakdown(target_names, candidate_terms, baseline_terms, candidate_rel_error, baseline_rel_error): - family_rows = [] - target_names = list(target_names) - unique_families = sorted({classify_target_family(name) for name in target_names}) - for family in unique_families: - idx = [i for i, name in enumerate(target_names) if classify_target_family(name) == family] - if not idx: - continue - candidate_slice = candidate_terms[idx] - baseline_slice = baseline_terms[idx] - candidate_rel_slice = candidate_rel_error[idx] - baseline_rel_slice = baseline_rel_error[idx] - family_rows.append( - { - "family": family, - "n_targets": int(len(idx)), - "candidate_loss_contribution": float(candidate_slice.sum()), - "baseline_loss_contribution": float(baseline_slice.sum()), - "loss_contribution_delta": float(candidate_slice.sum() - baseline_slice.sum()), - "candidate_mean_weighted_loss": float(candidate_slice.mean()), - "baseline_mean_weighted_loss": float(baseline_slice.mean()), - "candidate_mean_unweighted_msre": float(np.mean(np.square(candidate_rel_slice))), - "baseline_mean_unweighted_msre": float(np.mean(np.square(baseline_rel_slice))), - 
"unweighted_msre_delta": float( - np.mean(np.square(candidate_rel_slice)) - - np.mean(np.square(baseline_rel_slice)) - ), - } - ) - family_rows.sort(key=lambda row: row["loss_contribution_delta"], reverse=True) - return family_rows - - -def compute(dataset_path: str) -> dict[str, float | int]: - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - loss_matrix, targets_array = loss_utils.build_loss_matrix(dataset_cls, PERIOD) - target_names = np.asarray(loss_matrix.columns) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) - bad_mask = np.isin(target_names, BAD_TARGETS) - keep_mask = ~(zero_mask | bad_mask) & scope_keep_mask(target_names) - - filtered = loss_matrix.loc[:, keep_mask] - filtered_targets = np.asarray(targets_array[keep_mask], dtype=np.float64) - if filtered_targets.size == 0: - raise ValueError("PE-native broad loss has no targets after filtering") - is_national = np.asarray(filtered.columns.str.startswith("nation/"), dtype=bool) - n_national = int(is_national.sum()) - n_state = int((~is_national).sum()) - - loss_arrays = build_pe_native_loss_arrays( - filtered.columns.tolist(), - filtered_targets, - ) - - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, - ).values.astype(np.float64) - - estimate = weights @ filtered.to_numpy(dtype=np.float64) - rel_error = pe_native_relative_error(estimate, loss_arrays) - weighted_terms = pe_native_huber_loss_terms(estimate, loss_arrays) - loss_value = float(weighted_terms.sum()) - unweighted_msre = float(np.mean(np.square(rel_error))) - - return { - "metric": PE_NATIVE_ROBUST_LOSS_METRIC, - "loss_config": loss_arrays.metadata().get("loss_config"), - "loss": loss_value, - "unweighted_msre": unweighted_msre, - "n_targets_total": int(len(target_names)), - "n_targets_kept": int(keep_mask.sum()), - "n_targets_zero_dropped": int(zero_mask.sum()), - 
"n_targets_bad_dropped": int(bad_mask.sum()), - "n_national_targets": n_national, - "n_state_targets": n_state, - "target_scope_filter": TARGET_SCOPE_FILTER, - "weight_sum": float(weights.sum()), - "target_names": filtered.columns.tolist(), - "weighted_terms": weighted_terms.tolist(), - "rel_error": rel_error.tolist(), - "target_loss_metadata": loss_arrays.sidecar_rows(), - } - - -candidate = compute(CANDIDATE_DATASET) -baseline = compute(BASELINE_DATASET) - -if candidate["n_targets_kept"] != baseline["n_targets_kept"]: - raise ValueError( - "Candidate and baseline produced different target counts after filtering: " - f"{candidate['n_targets_kept']} vs {baseline['n_targets_kept']}" - ) -if candidate["target_names"] != baseline["target_names"]: - raise ValueError("Candidate and baseline produced different target names after filtering") - -payload = { - "metric": candidate["metric"], - "loss_config": candidate.get("loss_config"), - "period": PERIOD, - "candidate_dataset": CANDIDATE_DATASET, - "baseline_dataset": BASELINE_DATASET, - "candidate_enhanced_cps_native_loss": candidate["loss"], - "baseline_enhanced_cps_native_loss": baseline["loss"], - "enhanced_cps_native_loss_delta": candidate["loss"] - baseline["loss"], - "candidate_unweighted_msre": candidate["unweighted_msre"], - "baseline_unweighted_msre": baseline["unweighted_msre"], - "unweighted_msre_delta": ( - candidate["unweighted_msre"] - baseline["unweighted_msre"] - ), - "n_targets_total": candidate["n_targets_total"], - "n_targets_kept": candidate["n_targets_kept"], - "n_targets_zero_dropped": candidate["n_targets_zero_dropped"], - "n_targets_bad_dropped": candidate["n_targets_bad_dropped"], - "n_national_targets": candidate["n_national_targets"], - "n_state_targets": candidate["n_state_targets"], - "target_scope_filter": TARGET_SCOPE_FILTER, - "policyengine_targets_db": TARGET_DB_DESCRIPTOR, - "candidate_weight_sum": candidate["weight_sum"], - "baseline_weight_sum": baseline["weight_sum"], - 
"family_breakdown": build_family_breakdown( - candidate["target_names"], - np.asarray(candidate["weighted_terms"], dtype=np.float64), - np.asarray(baseline["weighted_terms"], dtype=np.float64), - np.asarray(candidate["rel_error"], dtype=np.float64), - np.asarray(baseline["rel_error"], dtype=np.float64), - ), -} -print(json.dumps(payload, sort_keys=True)) -""".strip() - -_PE_NATIVE_BROAD_BATCH_SCORE_SCRIPT = """ -import json -import sys -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -from policyengine_us_data.utils.loss import build_loss_matrix - - -def patch_policyengine_us_data_uprating_aliases(): - import policyengine_us_data.utils.soi as soi_utils - - original = soi_utils.create_policyengine_uprating_factors_table - - def patched_create_policyengine_uprating_factors_table(*args, **kwargs): - table = original(*args, **kwargs) - if ( - "employment_income" not in table.index - and "employment_income_before_lsr" in table.index - ): - table.loc["employment_income"] = table.loc[ - "employment_income_before_lsr" - ] - return table - - soi_utils.create_policyengine_uprating_factors_table = ( - patched_create_policyengine_uprating_factors_table - ) - - -patch_policyengine_us_data_uprating_aliases() - - -def load_microplex_loss_helpers(): - import importlib.util - - for entry in sys.path: - candidate = Path(entry) / "microplex_us" / "pipelines" / "pe_native_loss.py" - if not candidate.exists(): - continue - spec = importlib.util.spec_from_file_location( - "microplex_us_pe_native_loss_standalone", - candidate, - ) - if spec is None or spec.loader is None: - continue - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module - raise ModuleNotFoundError("Could not load microplex_us pe_native_loss helper") - - -_LOSS_HELPERS 
= load_microplex_loss_helpers() -PE_NATIVE_ROBUST_LOSS_METRIC = _LOSS_HELPERS.PE_NATIVE_ROBUST_LOSS_METRIC -build_pe_native_loss_arrays = _LOSS_HELPERS.build_pe_native_loss_arrays -pe_native_huber_loss_terms = _LOSS_HELPERS.pe_native_huber_loss_terms -pe_native_relative_error = _LOSS_HELPERS.pe_native_relative_error - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -BASELINE_DATASET = sys.argv[4] -CANDIDATE_DATASETS = tuple(json.loads(sys.argv[5])) -TARGET_SCOPE_FILTER = sys.argv[6] if len(sys.argv) >= 7 and sys.argv[6] else None - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def scope_keep_mask(target_names): - if TARGET_SCOPE_FILTER is None: - return np.ones(target_names.shape, dtype=bool) - if TARGET_SCOPE_FILTER == "national": - return np.asarray( - [str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - if TARGET_SCOPE_FILTER == "state": - return np.asarray( - [not str(name).startswith("nation/") for name in target_names], - dtype=bool, - ) - raise ValueError(f"Unsupported target scope filter: {TARGET_SCOPE_FILTER}") - - -def classify_target_family(target_name: str) -> str: - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return "state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - 
return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return "state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def build_family_breakdown(target_names, candidate_terms, baseline_terms, candidate_rel_error, baseline_rel_error): - family_rows = [] - target_names = list(target_names) - unique_families = sorted({classify_target_family(name) for name in target_names}) - for family in unique_families: - idx = [i for i, name in enumerate(target_names) if classify_target_family(name) == family] - if not idx: - continue - candidate_slice = candidate_terms[idx] - baseline_slice = baseline_terms[idx] - candidate_rel_slice = candidate_rel_error[idx] - baseline_rel_slice = baseline_rel_error[idx] - family_rows.append( - { - "family": family, - "n_targets": int(len(idx)), - "candidate_loss_contribution": float(candidate_slice.sum()), - "baseline_loss_contribution": float(baseline_slice.sum()), - "loss_contribution_delta": float(candidate_slice.sum() - baseline_slice.sum()), - 
"candidate_mean_weighted_loss": float(candidate_slice.mean()), - "baseline_mean_weighted_loss": float(baseline_slice.mean()), - "candidate_mean_unweighted_msre": float(np.mean(np.square(candidate_rel_slice))), - "baseline_mean_unweighted_msre": float(np.mean(np.square(baseline_rel_slice))), - "unweighted_msre_delta": float( - np.mean(np.square(candidate_rel_slice)) - - np.mean(np.square(baseline_rel_slice)) - ), - } - ) - family_rows.sort(key=lambda row: row["loss_contribution_delta"], reverse=True) - return family_rows - - -def compute(dataset_path: str) -> dict[str, float | int]: - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - loss_matrix, targets_array = build_loss_matrix(dataset_cls, PERIOD) - target_names = np.asarray(loss_matrix.columns) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) - bad_mask = np.isin(target_names, BAD_TARGETS) - keep_mask = ~(zero_mask | bad_mask) & scope_keep_mask(target_names) - - filtered = loss_matrix.loc[:, keep_mask] - filtered_targets = np.asarray(targets_array[keep_mask], dtype=np.float64) - if filtered_targets.size == 0: - raise ValueError("PE-native broad loss has no targets after filtering") - is_national = np.asarray(filtered.columns.str.startswith("nation/"), dtype=bool) - n_national = int(is_national.sum()) - n_state = int((~is_national).sum()) - - loss_arrays = build_pe_native_loss_arrays( - filtered.columns.tolist(), - filtered_targets, - ) - - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, - ).values.astype(np.float64) - - estimate = weights @ filtered.to_numpy(dtype=np.float64) - rel_error = pe_native_relative_error(estimate, loss_arrays) - weighted_terms = pe_native_huber_loss_terms(estimate, loss_arrays) - loss_value = float(weighted_terms.sum()) - unweighted_msre = float(np.mean(np.square(rel_error))) - - return { - "dataset": 
dataset_path, - "metric": PE_NATIVE_ROBUST_LOSS_METRIC, - "loss_config": loss_arrays.metadata().get("loss_config"), - "loss": loss_value, - "unweighted_msre": unweighted_msre, - "n_targets_total": int(len(target_names)), - "n_targets_kept": int(keep_mask.sum()), - "n_targets_zero_dropped": int(zero_mask.sum()), - "n_targets_bad_dropped": int(bad_mask.sum()), - "n_national_targets": n_national, - "n_state_targets": n_state, - "target_scope_filter": TARGET_SCOPE_FILTER, - "weight_sum": float(weights.sum()), - "target_names": filtered.columns.tolist(), - "weighted_terms": weighted_terms.tolist(), - "rel_error": rel_error.tolist(), - "target_loss_metadata": loss_arrays.sidecar_rows(), - } - - -baseline = compute(BASELINE_DATASET) -payload = [] -for candidate_dataset in CANDIDATE_DATASETS: - candidate = compute(candidate_dataset) - if candidate["n_targets_kept"] != baseline["n_targets_kept"]: - raise ValueError( - "Candidate and baseline produced different target counts after filtering: " - f"{candidate['n_targets_kept']} vs {baseline['n_targets_kept']}" - ) - if candidate["target_names"] != baseline["target_names"]: - raise ValueError("Candidate and baseline produced different target names after filtering") - payload.append( - { - "metric": candidate["metric"], - "loss_config": candidate.get("loss_config"), - "period": PERIOD, - "candidate_dataset": candidate_dataset, - "baseline_dataset": BASELINE_DATASET, - "candidate_enhanced_cps_native_loss": candidate["loss"], - "baseline_enhanced_cps_native_loss": baseline["loss"], - "enhanced_cps_native_loss_delta": candidate["loss"] - baseline["loss"], - "candidate_beats_baseline": candidate["loss"] < baseline["loss"], - "candidate_unweighted_msre": candidate["unweighted_msre"], - "baseline_unweighted_msre": baseline["unweighted_msre"], - "unweighted_msre_delta": ( - candidate["unweighted_msre"] - baseline["unweighted_msre"] - ), - "n_targets_total": candidate["n_targets_total"], - "n_targets_kept": candidate["n_targets_kept"], 
- "n_targets_zero_dropped": candidate["n_targets_zero_dropped"], - "n_targets_bad_dropped": candidate["n_targets_bad_dropped"], - "n_national_targets": candidate["n_national_targets"], - "n_state_targets": candidate["n_state_targets"], - "target_scope_filter": TARGET_SCOPE_FILTER, - "candidate_weight_sum": candidate["weight_sum"], - "baseline_weight_sum": baseline["weight_sum"], - "family_breakdown": build_family_breakdown( - candidate["target_names"], - np.asarray(candidate["weighted_terms"], dtype=np.float64), - np.asarray(baseline["weighted_terms"], dtype=np.float64), - np.asarray(candidate["rel_error"], dtype=np.float64), - np.asarray(baseline["rel_error"], dtype=np.float64), - ), - } - ) -print(json.dumps(payload, sort_keys=True)) -""".strip() - -_PE_NATIVE_TARGET_DELTA_SCRIPT = """ -import json -import sys -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -from policyengine_us_data.utils.loss import build_loss_matrix - - -def patch_policyengine_us_data_uprating_aliases(): - import policyengine_us_data.utils.soi as soi_utils - - original = soi_utils.create_policyengine_uprating_factors_table - - def patched_create_policyengine_uprating_factors_table(*args, **kwargs): - table = original(*args, **kwargs) - if ( - "employment_income" not in table.index - and "employment_income_before_lsr" in table.index - ): - table.loc["employment_income"] = table.loc[ - "employment_income_before_lsr" - ] - return table - - soi_utils.create_policyengine_uprating_factors_table = ( - patched_create_policyengine_uprating_factors_table - ) - - -patch_policyengine_us_data_uprating_aliases() - - -def load_microplex_loss_helpers(): - import importlib.util - - for entry in sys.path: - candidate = Path(entry) / "microplex_us" / "pipelines" / "pe_native_loss.py" - if not candidate.exists(): - continue - spec = 
importlib.util.spec_from_file_location( - "microplex_us_pe_native_loss_standalone", - candidate, - ) - if spec is None or spec.loader is None: - continue - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module - raise ModuleNotFoundError("Could not load microplex_us pe_native_loss helper") - - -_LOSS_HELPERS = load_microplex_loss_helpers() -PE_NATIVE_ROBUST_LOSS_METRIC = _LOSS_HELPERS.PE_NATIVE_ROBUST_LOSS_METRIC -build_pe_native_loss_arrays = _LOSS_HELPERS.build_pe_native_loss_arrays -pe_native_huber_loss_terms = _LOSS_HELPERS.pe_native_huber_loss_terms -pe_native_relative_error = _LOSS_HELPERS.pe_native_relative_error - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -FROM_DATASET = sys.argv[4] -TO_DATASET = sys.argv[5] -TOP_K = int(sys.argv[6]) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def compute(dataset_path: str): - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - loss_matrix, targets_array = build_loss_matrix(dataset_cls, PERIOD) - target_names = np.asarray(loss_matrix.columns) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) - bad_mask = np.isin(target_names, BAD_TARGETS) - keep_mask = ~(zero_mask | bad_mask) - - filtered = loss_matrix.loc[:, keep_mask] - filtered_targets = np.asarray(targets_array[keep_mask], dtype=np.float64) - is_national = np.asarray(filtered.columns.str.startswith("nation/"), dtype=bool) - n_national = int(is_national.sum()) - n_state = int((~is_national).sum()) - if n_national == 0 or n_state == 0: - raise ValueError( - "PE-native broad loss requires both national and state targets after filtering" - ) - - loss_arrays = build_pe_native_loss_arrays( - 
filtered.columns.tolist(), - filtered_targets, - ) - - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, - ).values.astype(np.float64) - - estimate = weights @ filtered.to_numpy(dtype=np.float64) - rel_error = pe_native_relative_error(estimate, loss_arrays) - weighted_terms = pe_native_huber_loss_terms(estimate, loss_arrays) - return { - "target_names": filtered.columns.tolist(), - "targets": filtered_targets.tolist(), - "estimate": estimate.tolist(), - "rel_error": rel_error.tolist(), - "weighted_terms": weighted_terms.tolist(), - } - - -def classify_target_family(target_name: str) -> str: - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return "state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return "state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - 
if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def target_scope(target_name: str) -> str: - if target_name.startswith("nation/"): - return "national" - if target_name.startswith("state/") or target_name.endswith("/snap-cost") or target_name.endswith("/snap-hhs"): - return "state" - return "other" - - -def abs_pct_error(estimate: float, target: float) -> float: - return abs(estimate - target) / max(abs(target), 1.0) * 100.0 - - -def build_target_rows(from_payload, to_payload): - rows = [] - for idx, name in enumerate(from_payload["target_names"]): - from_term = float(from_payload["weighted_terms"][idx]) - to_term = float(to_payload["weighted_terms"][idx]) - from_error = float(from_payload["rel_error"][idx]) - to_error = float(to_payload["rel_error"][idx]) - target_value = float(from_payload["targets"][idx]) - from_estimate = float(from_payload["estimate"][idx]) - to_estimate = float(to_payload["estimate"][idx]) - if to_error < from_error: - winner = "to" - elif from_error < to_error: - winner = "from" - else: - winner = "tie" - rows.append( - { - "target_name": name, - "target_family": classify_target_family(name), - "target_scope": target_scope(name), - "winner": winner, - "weighted_term_delta": to_term - from_term, - "from_weighted_term": from_term, - "to_weighted_term": to_term, - "target_value": target_value, - "from_estimate": from_estimate, - "to_estimate": to_estimate, - "from_rel_error": from_error, - "to_rel_error": to_error, - "from_abs_pct_error": abs_pct_error(from_estimate, target_value), - "to_abs_pct_error": 
abs_pct_error(to_estimate, target_value), - } - ) - return rows - - -def summarize_target_rows(rows, *, group_field=None): - if group_field is None: - grouped = [("all", rows)] - else: - values = sorted({row[group_field] for row in rows}) - grouped = [(value, [row for row in rows if row[group_field] == value]) for value in values] - - summaries = [] - for value, group_rows in grouped: - n_targets = len(group_rows) - from_wins = sum(1 for row in group_rows if row["winner"] == "from") - to_wins = sum(1 for row in group_rows if row["winner"] == "to") - ties = n_targets - from_wins - to_wins - from_loss = float(np.mean([row["from_weighted_term"] for row in group_rows])) - to_loss = float(np.mean([row["to_weighted_term"] for row in group_rows])) - summary = { - "n_targets": n_targets, - "from_wins": from_wins, - "to_wins": to_wins, - "ties": ties, - "from_win_rate": from_wins / n_targets if n_targets else None, - "to_win_rate": to_wins / n_targets if n_targets else None, - "from_loss": from_loss, - "to_loss": to_loss, - "loss_delta": to_loss - from_loss, - "mean_weighted_term_delta": float( - np.mean([row["weighted_term_delta"] for row in group_rows]) - ), - } - if group_field is not None: - summary[group_field] = value - summaries.append(summary) - return summaries[0] if group_field is None else summaries - - -from_payload = compute(FROM_DATASET) -to_payload = compute(TO_DATASET) - -if from_payload["target_names"] != to_payload["target_names"]: - raise ValueError("Datasets produced different target names after filtering") - -rows = build_target_rows(from_payload, to_payload) -rows.sort(key=lambda row: row["weighted_term_delta"], reverse=True) -payload = { - "metric": "enhanced_cps_native_loss_target_delta", - "period": PERIOD, - "from_dataset": FROM_DATASET, - "to_dataset": TO_DATASET, - "summary": summarize_target_rows(rows), - "family_summaries": summarize_target_rows(rows, group_field="target_family"), - "scope_summaries": summarize_target_rows(rows, 
group_field="target_scope"), - "targets": rows, - "top_regressions": rows[:TOP_K], - "top_improvements": list(reversed(rows[-TOP_K:])), -} -print(json.dumps(payload, sort_keys=True)) -""".strip() - -_PE_NATIVE_TARGET_DELTA_BATCH_SCRIPT = """ -import json -import sys -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -from policyengine_us_data.utils.loss import build_loss_matrix - - -def patch_policyengine_us_data_uprating_aliases(): - import policyengine_us_data.utils.soi as soi_utils - - original = soi_utils.create_policyengine_uprating_factors_table - - def patched_create_policyengine_uprating_factors_table(*args, **kwargs): - table = original(*args, **kwargs) - if ( - "employment_income" not in table.index - and "employment_income_before_lsr" in table.index - ): - table.loc["employment_income"] = table.loc[ - "employment_income_before_lsr" - ] - return table - - soi_utils.create_policyengine_uprating_factors_table = ( - patched_create_policyengine_uprating_factors_table - ) - - -patch_policyengine_us_data_uprating_aliases() - - -def load_microplex_loss_helpers(): - import importlib.util - - for entry in sys.path: - candidate = Path(entry) / "microplex_us" / "pipelines" / "pe_native_loss.py" - if not candidate.exists(): - continue - spec = importlib.util.spec_from_file_location( - "microplex_us_pe_native_loss_standalone", - candidate, - ) - if spec is None or spec.loader is None: - continue - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module - raise ModuleNotFoundError("Could not load microplex_us pe_native_loss helper") - - -_LOSS_HELPERS = load_microplex_loss_helpers() -PE_NATIVE_ROBUST_LOSS_METRIC = _LOSS_HELPERS.PE_NATIVE_ROBUST_LOSS_METRIC -build_pe_native_loss_arrays = 
_LOSS_HELPERS.build_pe_native_loss_arrays -pe_native_huber_loss_terms = _LOSS_HELPERS.pe_native_huber_loss_terms -pe_native_relative_error = _LOSS_HELPERS.pe_native_relative_error - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -BASELINE_DATASET = sys.argv[4] -CANDIDATE_DATASETS = json.loads(sys.argv[5]) -TOP_K = int(sys.argv[6]) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def compute(dataset_path: str): - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - loss_matrix, targets_array = build_loss_matrix(dataset_cls, PERIOD) - target_names = np.asarray(loss_matrix.columns) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) - bad_mask = np.isin(target_names, BAD_TARGETS) - keep_mask = ~(zero_mask | bad_mask) - - filtered = loss_matrix.loc[:, keep_mask] - filtered_targets = np.asarray(targets_array[keep_mask], dtype=np.float64) - is_national = np.asarray(filtered.columns.str.startswith("nation/"), dtype=bool) - n_national = int(is_national.sum()) - n_state = int((~is_national).sum()) - if n_national == 0 or n_state == 0: - raise ValueError( - "PE-native broad loss requires both national and state targets after filtering" - ) - - loss_arrays = build_pe_native_loss_arrays( - filtered.columns.tolist(), - filtered_targets, - ) - - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, - ).values.astype(np.float64) - - estimate = weights @ filtered.to_numpy(dtype=np.float64) - rel_error = pe_native_relative_error(estimate, loss_arrays) - weighted_terms = pe_native_huber_loss_terms(estimate, loss_arrays) - return { - "target_names": filtered.columns.tolist(), - 
"targets": filtered_targets.tolist(), - "estimate": estimate.tolist(), - "rel_error": rel_error.tolist(), - "weighted_terms": weighted_terms.tolist(), - } - - -def classify_target_family(target_name: str) -> str: - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return "state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return "state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def target_scope(target_name: 
str) -> str: - if target_name.startswith("nation/"): - return "national" - if target_name.startswith("state/") or target_name.endswith("/snap-cost") or target_name.endswith("/snap-hhs"): - return "state" - return "other" - - -def abs_pct_error(estimate: float, target: float) -> float: - return abs(estimate - target) / max(abs(target), 1.0) * 100.0 - - -def build_target_rows(from_payload, to_payload): - rows = [] - for idx, name in enumerate(from_payload["target_names"]): - from_term = float(from_payload["weighted_terms"][idx]) - to_term = float(to_payload["weighted_terms"][idx]) - from_error = float(from_payload["rel_error"][idx]) - to_error = float(to_payload["rel_error"][idx]) - target_value = float(from_payload["targets"][idx]) - from_estimate = float(from_payload["estimate"][idx]) - to_estimate = float(to_payload["estimate"][idx]) - if to_error < from_error: - winner = "to" - elif from_error < to_error: - winner = "from" - else: - winner = "tie" - rows.append( - { - "target_name": name, - "target_family": classify_target_family(name), - "target_scope": target_scope(name), - "winner": winner, - "weighted_term_delta": to_term - from_term, - "from_weighted_term": from_term, - "to_weighted_term": to_term, - "target_value": target_value, - "from_estimate": from_estimate, - "to_estimate": to_estimate, - "from_rel_error": from_error, - "to_rel_error": to_error, - "from_abs_pct_error": abs_pct_error(from_estimate, target_value), - "to_abs_pct_error": abs_pct_error(to_estimate, target_value), - } - ) - return rows - - -def summarize_target_rows(rows, *, group_field=None): - if group_field is None: - grouped = [("all", rows)] - else: - values = sorted({row[group_field] for row in rows}) - grouped = [(value, [row for row in rows if row[group_field] == value]) for value in values] - - summaries = [] - for value, group_rows in grouped: - n_targets = len(group_rows) - from_wins = sum(1 for row in group_rows if row["winner"] == "from") - to_wins = sum(1 for row in group_rows 
if row["winner"] == "to") - ties = n_targets - from_wins - to_wins - from_loss = float(np.mean([row["from_weighted_term"] for row in group_rows])) - to_loss = float(np.mean([row["to_weighted_term"] for row in group_rows])) - summary = { - "n_targets": n_targets, - "from_wins": from_wins, - "to_wins": to_wins, - "ties": ties, - "from_win_rate": from_wins / n_targets if n_targets else None, - "to_win_rate": to_wins / n_targets if n_targets else None, - "from_loss": from_loss, - "to_loss": to_loss, - "loss_delta": to_loss - from_loss, - "mean_weighted_term_delta": float( - np.mean([row["weighted_term_delta"] for row in group_rows]) - ), - } - if group_field is not None: - summary[group_field] = value - summaries.append(summary) - return summaries[0] if group_field is None else summaries - - -baseline_payload = compute(BASELINE_DATASET) -results = [] -for candidate_dataset in CANDIDATE_DATASETS: - candidate_payload = compute(candidate_dataset) - if baseline_payload["target_names"] != candidate_payload["target_names"]: - raise ValueError("Datasets produced different target names after filtering") - - rows = build_target_rows(baseline_payload, candidate_payload) - rows.sort(key=lambda row: row["weighted_term_delta"], reverse=True) - results.append( - { - "metric": "enhanced_cps_native_loss_target_delta", - "period": PERIOD, - "from_dataset": BASELINE_DATASET, - "to_dataset": candidate_dataset, - "summary": summarize_target_rows(rows), - "family_summaries": summarize_target_rows(rows, group_field="target_family"), - "scope_summaries": summarize_target_rows(rows, group_field="target_scope"), - "targets": rows, - "top_regressions": rows[:TOP_K], - "top_improvements": list(reversed(rows[-TOP_K:])), - } - ) - -print(json.dumps(results, sort_keys=True)) -""".strip() - -_PE_NATIVE_SUPPORT_AUDIT_SCRIPT = """ -import json -import sys -from pathlib import Path - -import h5py -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT 
not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation - -PERIOD = int(sys.argv[2]) -CANDIDATE_DATASET = sys.argv[3] -BASELINE_DATASET = sys.argv[4] - -STATE_FIPS_TO_ABBR = { - 1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE", - 11: "DC", 12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", - 19: "IA", 20: "KS", 21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", - 26: "MI", 27: "MN", 28: "MS", 29: "MO", 30: "MT", 31: "NE", 32: "NV", - 33: "NH", 34: "NJ", 35: "NM", 36: "NY", 37: "NC", 38: "ND", 39: "OH", - 40: "OK", 41: "OR", 42: "PA", 44: "RI", 45: "SC", 46: "SD", 47: "TN", - 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA", 54: "WV", 55: "WI", - 56: "WY", -} -CRITICAL_PERSON_VARIABLES = ( - "has_marketplace_health_coverage", - "has_esi", - "medicare_part_b_premiums", - "child_support_expense", - "self_employment_income_before_lsr", - "rental_income", - "non_sch_d_capital_gains", -) -HIGH_SIGNAL_MFS_AGI_BINS = ( - ("75k_to_100k", 75_000.0, 100_000.0), - ("100k_to_200k", 100_000.0, 200_000.0), - ("200k_to_500k", 200_000.0, 500_000.0), - ("500k_plus", 500_000.0, np.inf), -) -HIGH_SIGNAL_HOH_AGI_BINS = ( - ("20k_to_25k", 20_000.0, 25_000.0), - ("25k_to_30k", 25_000.0, 30_000.0), - ("30k_to_40k", 30_000.0, 40_000.0), - ("200k_to_500k", 200_000.0, 500_000.0), - ("500k_to_1m", 500_000.0, 1_000_000.0), - ("1m_plus", 1_000_000.0, np.inf), -) -AGE_BUCKETS = ( - ("0_to_4", 0, 5), - ("5_to_17", 5, 18), - ("18_to_29", 18, 30), - ("30_to_44", 30, 45), - ("45_to_64", 45, 65), - ("65_plus", 65, np.inf), -) -SSI_AGE_BUCKETS = ( - ("all", -np.inf, np.inf), - ("under_18", 0, 18), - ("18_to_64", 18, 65), - ("65_plus", 65, np.inf), -) -MEDICARE_PART_B_AGE_BUCKETS = ( - ("age_0_to_9", 0, 10), - ("age_10_to_19", 10, 20), - ("age_20_to_29", 20, 30), - ("age_30_to_39", 30, 40), - ("age_40_to_49", 40, 50), - ("age_50_to_59", 50, 60), - ("age_60_to_64", 60, 65), - ("age_65_plus", 65, np.inf), -) - - -def 
dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def stored_variables_for(dataset_path: str) -> set[str]: - with h5py.File(dataset_path, "r") as handle: - return set(handle.keys()) - - -def calculate_first_available(sim, variables, *, period: int, map_to: str | None = None): - last_error = None - for variable in variables: - try: - if map_to is None: - values = sim.calculate(variable, period=period).values - else: - values = sim.calculate(variable, period=period, map_to=map_to).values - return variable, values - except ValueError as exc: - if "does not exist" not in str(exc): - raise - last_error = exc - if last_error is not None: - raise last_error - raise ValueError("No candidate variables supplied") - - -def state_abbr(value) -> str: - if value is None: - return "NA" - try: - numeric = int(value) - except (TypeError, ValueError): - return str(value) - return STATE_FIPS_TO_ABBR.get(numeric, str(numeric)) - - -def normalize_status(value) -> str: - if hasattr(value, "name"): - return str(value.name) - text = str(value) - if "." 
in text: - text = text.rsplit(".", 1)[-1] - normalized = text.strip().upper().replace(" ", "_") - if normalized in { - "SINGLE", - "JOINT", - "SEPARATE", - "HEAD_OF_HOUSEHOLD", - "SURVIVING_SPOUSE", - }: - return normalized - return normalized - - -def summarize_numeric(values, weights, *, stored: bool) -> dict[str, float | int | bool]: - arr = np.nan_to_num(np.asarray(values, dtype=np.float64), nan=0.0) - w = np.asarray(weights, dtype=np.float64) - positive = arr > 0.0 - negative = arr < 0.0 - nonzero = arr != 0.0 - return { - "stored": bool(stored), - "nonzero_count": int(nonzero.sum()), - "positive_count": int(positive.sum()), - "negative_count": int(negative.sum()), - "weighted_nonzero": float(w[nonzero].sum()), - "weighted_positive": float(w[positive].sum()), - "weighted_negative": float(w[negative].sum()), - "value_sum": float((arr * w).sum()), - } - - -def summarize_bool(values, weights, *, stored: bool) -> dict[str, float | int | bool]: - arr = np.asarray(values).astype(bool) - w = np.asarray(weights, dtype=np.float64) - return { - "stored": bool(stored), - "true_count": int(arr.sum()), - "false_count": int((~arr).sum()), - "weighted_true": float(w[arr].sum()), - "weighted_false": float(w[~arr].sum()), - } - - -def build_snapshot(dataset_path: str) -> dict: - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - stored_variables = stored_variables_for(dataset_path) - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - - person_weights = sim.calculate("person_weight", period=PERIOD).values.astype(np.float64) - household_weights = sim.calculate("household_weight", period=PERIOD).values.astype(np.float64) - tax_unit_weights = sim.calculate("tax_unit_weight", period=PERIOD).values.astype(np.float64) - person_state = sim.calculate("state_fips", map_to="person", period=PERIOD).values - household_state = sim.calculate("state_fips", map_to="household", period=PERIOD).values - 
person_age = sim.calculate("age", period=PERIOD).values.astype(np.float64) - marketplace = sim.calculate("has_marketplace_health_coverage", period=PERIOD).values - filing_status = sim.calculate("filing_status", period=PERIOD).values - adjusted_gross_income = sim.calculate("adjusted_gross_income", period=PERIOD).values.astype(np.float64) - ssi = sim.calculate("ssi", period=PERIOD).values.astype(np.float64) - medicare_part_b_variable, medicare_part_b_premiums = calculate_first_available( - sim, - ( - "medicare_part_b_premiums", - "medicare_part_b_premiums_reported", - "medicare_part_b_premium", - ), - period=PERIOD, - ) - medicare_part_b_premiums = medicare_part_b_premiums.astype(np.float64) - aca_ptc_household = sim.calculate("aca_ptc", map_to="household", period=PERIOD).values.astype(np.float64) - - critical_support = {} - for variable in CRITICAL_PERSON_VARIABLES: - calculated_variable = variable - if variable == "medicare_part_b_premiums": - calculated_variable, values = calculate_first_available( - sim, - ( - "medicare_part_b_premiums", - "medicare_part_b_premiums_reported", - "medicare_part_b_premium", - ), - period=PERIOD, - ) - else: - values = sim.calculate(variable, period=PERIOD).values - stored = variable in stored_variables or calculated_variable in stored_variables - if np.asarray(values).dtype == np.bool_: - critical_support[variable] = summarize_bool( - values, - person_weights, - stored=stored, - ) - else: - critical_support[variable] = summarize_numeric( - values, - person_weights, - stored=stored, - ) - critical_support[variable]["calculated_variable"] = calculated_variable - - normalized_filing_status = np.asarray([normalize_status(value) for value in filing_status]) - filing_status_counts = {} - for status in ("SINGLE", "JOINT", "SEPARATE", "HEAD_OF_HOUSEHOLD", "SURVIVING_SPOUSE"): - mask = normalized_filing_status == status - filing_status_counts[status] = { - "count": int(mask.sum()), - "weighted_count": float(tax_unit_weights[mask].sum()), - } 
- - def agi_support_for_status(status: str, bins) -> list[dict]: - status_mask = normalized_filing_status == status - rows = [] - for label, lower, upper in bins: - mask = status_mask & (adjusted_gross_income >= lower) & (adjusted_gross_income < upper) - rows.append( - { - "agi_bin": label, - "count": int(mask.sum()), - "weighted_count": float(tax_unit_weights[mask].sum()), - "weighted_agi": float((adjusted_gross_income[mask] * tax_unit_weights[mask]).sum()), - } - ) - return rows - - def person_value_by_age(values, buckets) -> list[dict]: - arr = np.nan_to_num(np.asarray(values, dtype=np.float64), nan=0.0) - rows = [] - for label, lower, upper in buckets: - age_mask = (person_age >= lower) & (person_age < upper) - positive = age_mask & (arr > 0.0) - rows.append( - { - "age_bucket": label, - "person_count": int(age_mask.sum()), - "positive_count": int(positive.sum()), - "weighted_people": float(person_weights[age_mask].sum()), - "weighted_positive": float(person_weights[positive].sum()), - "value_sum": float((arr[age_mask] * person_weights[age_mask]).sum()), - } - ) - return rows - - mfs_agi_support = agi_support_for_status("SEPARATE", HIGH_SIGNAL_MFS_AGI_BINS) - hoh_agi_support = agi_support_for_status("HEAD_OF_HOUSEHOLD", HIGH_SIGNAL_HOH_AGI_BINS) - ssi_by_age = person_value_by_age(ssi, SSI_AGE_BUCKETS) - medicare_part_b_by_age = person_value_by_age( - medicare_part_b_premiums, - MEDICARE_PART_B_AGE_BUCKETS, - ) - - state_aca_ptc = {} - for state in sorted({state_abbr(value) for value in household_state}): - state_mask = np.asarray([state_abbr(value) == state for value in household_state], dtype=bool) - positive = state_mask & (aca_ptc_household > 0.0) - state_aca_ptc[state] = { - "weighted_households": float(household_weights[state_mask].sum()), - "weighted_positive_households": float(household_weights[positive].sum()), - "weighted_aca_ptc": float((aca_ptc_household[state_mask] * household_weights[state_mask]).sum()), - } - - states = sorted({state_abbr(value) 
for value in person_state}) - state_marketplace = {} - state_age_bucket = {} - marketplace_bool = np.asarray(marketplace).astype(bool) - for state in states: - state_mask = np.asarray([state_abbr(value) == state for value in person_state], dtype=bool) - enrolled = state_mask & marketplace_bool - state_marketplace[state] = { - "weighted_people": float(person_weights[state_mask].sum()), - "weighted_marketplace_enrollment": float(person_weights[enrolled].sum()), - } - bucket_weights = {} - nonempty = 0 - for label, lower, upper in AGE_BUCKETS: - mask = state_mask & (person_age >= lower) & (person_age < upper) - weight = float(person_weights[mask].sum()) - bucket_weights[label] = weight - if weight > 0.0: - nonempty += 1 - state_age_bucket[state] = { - "nonempty_buckets": int(nonempty), - "bucket_weights": bucket_weights, - } - - return { - "dataset": dataset_path, - "stored_variable_count": int(len(stored_variables)), - "stored_variables": sorted(stored_variables), - "critical_input_support": critical_support, - "filing_status_weighted_counts": filing_status_counts, - "mfs_high_agi_support": mfs_agi_support, - "hoh_agi_support": hoh_agi_support, - "ssi_by_age": ssi_by_age, - "medicare_part_b_premiums_variable": medicare_part_b_variable, - "medicare_part_b_premiums_by_age": medicare_part_b_by_age, - "state_aca_ptc_spending": state_aca_ptc, - "state_marketplace_enrollment": state_marketplace, - "state_age_bucket_support": state_age_bucket, - } - - -def compare_snapshots(candidate: dict, baseline: dict) -> dict: - critical_rows = [] - for variable in CRITICAL_PERSON_VARIABLES: - candidate_row = candidate["critical_input_support"][variable] - baseline_row = baseline["critical_input_support"][variable] - candidate_weighted = candidate_row.get("weighted_nonzero", candidate_row.get("weighted_true", 0.0)) - baseline_weighted = baseline_row.get("weighted_nonzero", baseline_row.get("weighted_true", 0.0)) - critical_rows.append( - { - "variable": variable, - "candidate_stored": 
bool(candidate_row.get("stored", False)), - "baseline_stored": bool(baseline_row.get("stored", False)), - "candidate_weighted_nonzero": float(candidate_weighted), - "baseline_weighted_nonzero": float(baseline_weighted), - "weighted_nonzero_delta": float(candidate_weighted - baseline_weighted), - } - ) - - filing_status_rows = [] - for status in ("SINGLE", "JOINT", "SEPARATE", "HEAD_OF_HOUSEHOLD", "SURVIVING_SPOUSE"): - candidate_row = candidate["filing_status_weighted_counts"][status] - baseline_row = baseline["filing_status_weighted_counts"][status] - filing_status_rows.append( - { - "filing_status": status, - "candidate_weighted_count": float(candidate_row["weighted_count"]), - "baseline_weighted_count": float(baseline_row["weighted_count"]), - "weighted_count_delta": float(candidate_row["weighted_count"] - baseline_row["weighted_count"]), - } - ) - - baseline_bins = {row["agi_bin"]: row for row in baseline["mfs_high_agi_support"]} - mfs_rows = [] - for row in candidate["mfs_high_agi_support"]: - other = baseline_bins[row["agi_bin"]] - mfs_rows.append( - { - "agi_bin": row["agi_bin"], - "candidate_weighted_count": float(row["weighted_count"]), - "baseline_weighted_count": float(other["weighted_count"]), - "weighted_count_delta": float(row["weighted_count"] - other["weighted_count"]), - "candidate_weighted_agi": float(row["weighted_agi"]), - "baseline_weighted_agi": float(other["weighted_agi"]), - "weighted_agi_delta": float(row["weighted_agi"] - other["weighted_agi"]), - } - ) - - baseline_bins = {row["agi_bin"]: row for row in baseline["hoh_agi_support"]} - hoh_rows = [] - for row in candidate["hoh_agi_support"]: - other = baseline_bins[row["agi_bin"]] - hoh_rows.append( - { - "agi_bin": row["agi_bin"], - "candidate_weighted_count": float(row["weighted_count"]), - "baseline_weighted_count": float(other["weighted_count"]), - "weighted_count_delta": float(row["weighted_count"] - other["weighted_count"]), - "candidate_weighted_agi": float(row["weighted_agi"]), - 
"baseline_weighted_agi": float(other["weighted_agi"]), - "weighted_agi_delta": float(row["weighted_agi"] - other["weighted_agi"]), - } - ) - - def age_value_delta(name: str) -> list[dict]: - baseline_bins = {row["age_bucket"]: row for row in baseline[name]} - rows = [] - for row in candidate[name]: - other = baseline_bins[row["age_bucket"]] - rows.append( - { - "age_bucket": row["age_bucket"], - "candidate_weighted_positive": float(row["weighted_positive"]), - "baseline_weighted_positive": float(other["weighted_positive"]), - "weighted_positive_delta": float(row["weighted_positive"] - other["weighted_positive"]), - "candidate_value_sum": float(row["value_sum"]), - "baseline_value_sum": float(other["value_sum"]), - "value_sum_delta": float(row["value_sum"] - other["value_sum"]), - } - ) - return rows - - ssi_rows = age_value_delta("ssi_by_age") - for row in ssi_rows: - row["candidate_weighted_recipients"] = row.pop("candidate_weighted_positive") - row["baseline_weighted_recipients"] = row.pop("baseline_weighted_positive") - row["weighted_recipient_delta"] = row.pop("weighted_positive_delta") - row["candidate_ssi"] = row.pop("candidate_value_sum") - row["baseline_ssi"] = row.pop("baseline_value_sum") - row["ssi_delta"] = row.pop("value_sum_delta") - - medicare_part_b_rows = age_value_delta("medicare_part_b_premiums_by_age") - - all_states = sorted( - set(candidate["state_aca_ptc_spending"]) - | set(baseline["state_aca_ptc_spending"]) - ) - state_aca_ptc_rows = [] - for state in all_states: - candidate_row = candidate["state_aca_ptc_spending"].get( - state, - {"weighted_aca_ptc": 0.0, "weighted_positive_households": 0.0}, - ) - baseline_row = baseline["state_aca_ptc_spending"].get( - state, - {"weighted_aca_ptc": 0.0, "weighted_positive_households": 0.0}, - ) - state_aca_ptc_rows.append( - { - "state": state, - "candidate_weighted_aca_ptc": float(candidate_row["weighted_aca_ptc"]), - "baseline_weighted_aca_ptc": float(baseline_row["weighted_aca_ptc"]), - 
"weighted_aca_ptc_delta": float(candidate_row["weighted_aca_ptc"] - baseline_row["weighted_aca_ptc"]), - "candidate_weighted_positive_households": float(candidate_row["weighted_positive_households"]), - "baseline_weighted_positive_households": float(baseline_row["weighted_positive_households"]), - "weighted_positive_household_delta": float( - candidate_row["weighted_positive_households"] - - baseline_row["weighted_positive_households"] - ), - } - ) - state_aca_ptc_rows.sort( - key=lambda row: abs(row["weighted_aca_ptc_delta"]), - reverse=True, - ) - - all_states = sorted( - set(candidate["state_marketplace_enrollment"]) - | set(baseline["state_marketplace_enrollment"]) - ) - state_marketplace_rows = [] - for state in all_states: - candidate_row = candidate["state_marketplace_enrollment"].get( - state, - {"weighted_marketplace_enrollment": 0.0}, - ) - baseline_row = baseline["state_marketplace_enrollment"].get( - state, - {"weighted_marketplace_enrollment": 0.0}, - ) - state_marketplace_rows.append( - { - "state": state, - "candidate_weighted_marketplace_enrollment": float(candidate_row["weighted_marketplace_enrollment"]), - "baseline_weighted_marketplace_enrollment": float(baseline_row["weighted_marketplace_enrollment"]), - "weighted_marketplace_enrollment_delta": float( - candidate_row["weighted_marketplace_enrollment"] - - baseline_row["weighted_marketplace_enrollment"] - ), - } - ) - state_marketplace_rows.sort( - key=lambda row: abs(row["weighted_marketplace_enrollment_delta"]), - reverse=True, - ) - - all_states = sorted( - set(candidate["state_age_bucket_support"]) - | set(baseline["state_age_bucket_support"]) - ) - state_age_rows = [] - for state in all_states: - candidate_row = candidate["state_age_bucket_support"].get( - state, - {"bucket_weights": {}}, - ) - baseline_row = baseline["state_age_bucket_support"].get( - state, - {"bucket_weights": {}}, - ) - for label, _lower, _upper in AGE_BUCKETS: - candidate_weight = 
float(candidate_row["bucket_weights"].get(label, 0.0)) - baseline_weight = float(baseline_row["bucket_weights"].get(label, 0.0)) - state_age_rows.append( - { - "state": state, - "age_bucket": label, - "candidate_weight": candidate_weight, - "baseline_weight": baseline_weight, - "weight_delta": candidate_weight - baseline_weight, - } - ) - state_age_rows.sort(key=lambda row: abs(row["weight_delta"]), reverse=True) - - return { - "critical_input_support": critical_rows, - "filing_status_weighted_delta": filing_status_rows, - "mfs_high_agi_delta": mfs_rows, - "hoh_agi_delta": hoh_rows, - "ssi_by_age_delta": ssi_rows, - "medicare_part_b_premiums_by_age_delta": medicare_part_b_rows, - "state_aca_ptc_spending_top_gaps": state_aca_ptc_rows[:15], - "state_marketplace_enrollment_top_gaps": state_marketplace_rows[:15], - "state_age_bucket_top_gaps": state_age_rows[:20], - } - - -candidate = build_snapshot(CANDIDATE_DATASET) -baseline = build_snapshot(BASELINE_DATASET) -payload = { - "metric": "enhanced_cps_support_audit", - "period": PERIOD, - "candidate_dataset": CANDIDATE_DATASET, - "baseline_dataset": BASELINE_DATASET, - "candidate": candidate, - "baseline": baseline, - "comparisons": compare_snapshots(candidate, baseline), -} -print(json.dumps(payload, sort_keys=True)) -""".strip() - -_PE_NATIVE_SUPPORT_AUDIT_BATCH_SCRIPT = """ -import json -import sys -from pathlib import Path - -import h5py -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation - -PERIOD = int(sys.argv[2]) -BASELINE_DATASET = sys.argv[3] -CANDIDATE_DATASETS = json.loads(sys.argv[4]) - -STATE_FIPS_TO_ABBR = { - 1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE", - 11: "DC", 12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", - 19: "IA", 20: "KS", 21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", - 26: "MI", 27: "MN", 28: "MS", 29: 
"MO", 30: "MT", 31: "NE", 32: "NV", - 33: "NH", 34: "NJ", 35: "NM", 36: "NY", 37: "NC", 38: "ND", 39: "OH", - 40: "OK", 41: "OR", 42: "PA", 44: "RI", 45: "SC", 46: "SD", 47: "TN", - 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA", 54: "WV", 55: "WI", - 56: "WY", -} -CRITICAL_PERSON_VARIABLES = ( - "has_marketplace_health_coverage", - "has_esi", - "medicare_part_b_premiums", - "child_support_expense", - "self_employment_income_before_lsr", - "rental_income", - "non_sch_d_capital_gains", -) -HIGH_SIGNAL_MFS_AGI_BINS = ( - ("75k_to_100k", 75_000.0, 100_000.0), - ("100k_to_200k", 100_000.0, 200_000.0), - ("200k_to_500k", 200_000.0, 500_000.0), - ("500k_plus", 500_000.0, np.inf), -) -HIGH_SIGNAL_HOH_AGI_BINS = ( - ("20k_to_25k", 20_000.0, 25_000.0), - ("25k_to_30k", 25_000.0, 30_000.0), - ("30k_to_40k", 30_000.0, 40_000.0), - ("200k_to_500k", 200_000.0, 500_000.0), - ("500k_to_1m", 500_000.0, 1_000_000.0), - ("1m_plus", 1_000_000.0, np.inf), -) -AGE_BUCKETS = ( - ("0_to_4", 0, 5), - ("5_to_17", 5, 18), - ("18_to_29", 18, 30), - ("30_to_44", 30, 45), - ("45_to_64", 45, 65), - ("65_plus", 65, np.inf), -) -SSI_AGE_BUCKETS = ( - ("all", -np.inf, np.inf), - ("under_18", 0, 18), - ("18_to_64", 18, 65), - ("65_plus", 65, np.inf), -) -MEDICARE_PART_B_AGE_BUCKETS = ( - ("age_0_to_9", 0, 10), - ("age_10_to_19", 10, 20), - ("age_20_to_29", 20, 30), - ("age_30_to_39", 30, 40), - ("age_40_to_49", 40, 50), - ("age_50_to_59", 50, 60), - ("age_60_to_64", 60, 65), - ("age_65_plus", 65, np.inf), -) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -def stored_variables_for(dataset_path: str) -> set[str]: - with h5py.File(dataset_path, "r") as handle: - return set(handle.keys()) - - -def calculate_first_available(sim, variables, *, period: int, map_to: str | None = None): - 
last_error = None - for variable in variables: - try: - if map_to is None: - values = sim.calculate(variable, period=period).values - else: - values = sim.calculate(variable, period=period, map_to=map_to).values - return variable, values - except ValueError as exc: - if "does not exist" not in str(exc): - raise - last_error = exc - if last_error is not None: - raise last_error - raise ValueError("No candidate variables supplied") - - -def state_abbr(value) -> str: - if value is None: - return "NA" - try: - numeric = int(value) - except (TypeError, ValueError): - return str(value) - return STATE_FIPS_TO_ABBR.get(numeric, str(numeric)) - - -def normalize_status(value) -> str: - if hasattr(value, "name"): - return str(value.name) - text = str(value) - if "." in text: - text = text.rsplit(".", 1)[-1] - normalized = text.strip().upper().replace(" ", "_") - if normalized in { - "SINGLE", - "JOINT", - "SEPARATE", - "HEAD_OF_HOUSEHOLD", - "SURVIVING_SPOUSE", - }: - return normalized - return normalized - - -def summarize_numeric(values, weights, *, stored: bool) -> dict[str, float | int | bool]: - arr = np.nan_to_num(np.asarray(values, dtype=np.float64), nan=0.0) - w = np.asarray(weights, dtype=np.float64) - positive = arr > 0.0 - negative = arr < 0.0 - nonzero = arr != 0.0 - return { - "stored": bool(stored), - "nonzero_count": int(nonzero.sum()), - "positive_count": int(positive.sum()), - "negative_count": int(negative.sum()), - "weighted_nonzero": float(w[nonzero].sum()), - "weighted_positive": float(w[positive].sum()), - "weighted_negative": float(w[negative].sum()), - "value_sum": float((arr * w).sum()), - } - - -def summarize_bool(values, weights, *, stored: bool) -> dict[str, float | int | bool]: - arr = np.asarray(values).astype(bool) - w = np.asarray(weights, dtype=np.float64) - return { - "stored": bool(stored), - "true_count": int(arr.sum()), - "false_count": int((~arr).sum()), - "weighted_true": float(w[arr].sum()), - "weighted_false": float(w[~arr].sum()), - } 
- - -def build_snapshot(dataset_path: str) -> dict: - dataset_cls = dataset_from_path( - dataset_path, - Path(dataset_path).stem.replace("-", "_"), - ) - stored_variables = stored_variables_for(dataset_path) - sim = Microsimulation(dataset=dataset_cls) - sim.default_calculation_period = PERIOD - - person_weights = sim.calculate("person_weight", period=PERIOD).values.astype(np.float64) - household_weights = sim.calculate("household_weight", period=PERIOD).values.astype(np.float64) - tax_unit_weights = sim.calculate("tax_unit_weight", period=PERIOD).values.astype(np.float64) - person_state = sim.calculate("state_fips", map_to="person", period=PERIOD).values - household_state = sim.calculate("state_fips", map_to="household", period=PERIOD).values - person_age = sim.calculate("age", period=PERIOD).values.astype(np.float64) - marketplace = sim.calculate("has_marketplace_health_coverage", period=PERIOD).values - filing_status = sim.calculate("filing_status", period=PERIOD).values - adjusted_gross_income = sim.calculate("adjusted_gross_income", period=PERIOD).values.astype(np.float64) - ssi = sim.calculate("ssi", period=PERIOD).values.astype(np.float64) - medicare_part_b_variable, medicare_part_b_premiums = calculate_first_available( - sim, - ( - "medicare_part_b_premiums", - "medicare_part_b_premiums_reported", - "medicare_part_b_premium", - ), - period=PERIOD, - ) - medicare_part_b_premiums = medicare_part_b_premiums.astype(np.float64) - aca_ptc_household = sim.calculate("aca_ptc", map_to="household", period=PERIOD).values.astype(np.float64) - - critical_support = {} - for variable in CRITICAL_PERSON_VARIABLES: - calculated_variable = variable - if variable == "medicare_part_b_premiums": - calculated_variable, values = calculate_first_available( - sim, - ( - "medicare_part_b_premiums", - "medicare_part_b_premiums_reported", - "medicare_part_b_premium", - ), - period=PERIOD, - ) - else: - values = sim.calculate(variable, period=PERIOD).values - stored = variable in 
stored_variables or calculated_variable in stored_variables - if np.asarray(values).dtype == np.bool_: - critical_support[variable] = summarize_bool( - values, - person_weights, - stored=stored, - ) - else: - critical_support[variable] = summarize_numeric( - values, - person_weights, - stored=stored, - ) - critical_support[variable]["calculated_variable"] = calculated_variable - - normalized_filing_status = np.asarray([normalize_status(value) for value in filing_status]) - filing_status_counts = {} - for status in ("SINGLE", "JOINT", "SEPARATE", "HEAD_OF_HOUSEHOLD", "SURVIVING_SPOUSE"): - mask = normalized_filing_status == status - filing_status_counts[status] = { - "count": int(mask.sum()), - "weighted_count": float(tax_unit_weights[mask].sum()), - } - - def agi_support_for_status(status: str, bins) -> list[dict]: - status_mask = normalized_filing_status == status - rows = [] - for label, lower, upper in bins: - mask = status_mask & (adjusted_gross_income >= lower) & (adjusted_gross_income < upper) - rows.append( - { - "agi_bin": label, - "count": int(mask.sum()), - "weighted_count": float(tax_unit_weights[mask].sum()), - "weighted_agi": float((adjusted_gross_income[mask] * tax_unit_weights[mask]).sum()), - } - ) - return rows - - def person_value_by_age(values, buckets) -> list[dict]: - arr = np.nan_to_num(np.asarray(values, dtype=np.float64), nan=0.0) - rows = [] - for label, lower, upper in buckets: - age_mask = (person_age >= lower) & (person_age < upper) - positive = age_mask & (arr > 0.0) - rows.append( - { - "age_bucket": label, - "person_count": int(age_mask.sum()), - "positive_count": int(positive.sum()), - "weighted_people": float(person_weights[age_mask].sum()), - "weighted_positive": float(person_weights[positive].sum()), - "value_sum": float((arr[age_mask] * person_weights[age_mask]).sum()), - } - ) - return rows - - mfs_agi_support = agi_support_for_status("SEPARATE", HIGH_SIGNAL_MFS_AGI_BINS) - hoh_agi_support = 
agi_support_for_status("HEAD_OF_HOUSEHOLD", HIGH_SIGNAL_HOH_AGI_BINS) - ssi_by_age = person_value_by_age(ssi, SSI_AGE_BUCKETS) - medicare_part_b_by_age = person_value_by_age( - medicare_part_b_premiums, - MEDICARE_PART_B_AGE_BUCKETS, - ) - - state_aca_ptc = {} - for state in sorted({state_abbr(value) for value in household_state}): - state_mask = np.asarray([state_abbr(value) == state for value in household_state], dtype=bool) - positive = state_mask & (aca_ptc_household > 0.0) - state_aca_ptc[state] = { - "weighted_households": float(household_weights[state_mask].sum()), - "weighted_positive_households": float(household_weights[positive].sum()), - "weighted_aca_ptc": float((aca_ptc_household[state_mask] * household_weights[state_mask]).sum()), - } - - states = sorted({state_abbr(value) for value in person_state}) - state_marketplace = {} - state_age_bucket = {} - marketplace_bool = np.asarray(marketplace).astype(bool) - for state in states: - state_mask = np.asarray([state_abbr(value) == state for value in person_state], dtype=bool) - enrolled = state_mask & marketplace_bool - state_marketplace[state] = { - "weighted_people": float(person_weights[state_mask].sum()), - "weighted_marketplace_enrollment": float(person_weights[enrolled].sum()), - } - bucket_weights = {} - nonempty = 0 - for label, lower, upper in AGE_BUCKETS: - mask = state_mask & (person_age >= lower) & (person_age < upper) - weight = float(person_weights[mask].sum()) - bucket_weights[label] = weight - if weight > 0.0: - nonempty += 1 - state_age_bucket[state] = { - "nonempty_buckets": int(nonempty), - "bucket_weights": bucket_weights, - } - - return { - "dataset": dataset_path, - "stored_variable_count": int(len(stored_variables)), - "stored_variables": sorted(stored_variables), - "critical_input_support": critical_support, - "filing_status_weighted_counts": filing_status_counts, - "mfs_high_agi_support": mfs_agi_support, - "hoh_agi_support": hoh_agi_support, - "ssi_by_age": ssi_by_age, - 
"medicare_part_b_premiums_variable": medicare_part_b_variable, - "medicare_part_b_premiums_by_age": medicare_part_b_by_age, - "state_aca_ptc_spending": state_aca_ptc, - "state_marketplace_enrollment": state_marketplace, - "state_age_bucket_support": state_age_bucket, - } - - -def compare_snapshots(candidate: dict, baseline: dict) -> dict: - critical_rows = [] - for variable in CRITICAL_PERSON_VARIABLES: - candidate_row = candidate["critical_input_support"][variable] - baseline_row = baseline["critical_input_support"][variable] - candidate_weighted = candidate_row.get("weighted_nonzero", candidate_row.get("weighted_true", 0.0)) - baseline_weighted = baseline_row.get("weighted_nonzero", baseline_row.get("weighted_true", 0.0)) - critical_rows.append( - { - "variable": variable, - "candidate_stored": bool(candidate_row.get("stored", False)), - "baseline_stored": bool(baseline_row.get("stored", False)), - "candidate_weighted_nonzero": float(candidate_weighted), - "baseline_weighted_nonzero": float(baseline_weighted), - "weighted_nonzero_delta": float(candidate_weighted - baseline_weighted), - } - ) - - filing_status_rows = [] - for status in ("SINGLE", "JOINT", "SEPARATE", "HEAD_OF_HOUSEHOLD", "SURVIVING_SPOUSE"): - candidate_row = candidate["filing_status_weighted_counts"][status] - baseline_row = baseline["filing_status_weighted_counts"][status] - filing_status_rows.append( - { - "filing_status": status, - "candidate_weighted_count": float(candidate_row["weighted_count"]), - "baseline_weighted_count": float(baseline_row["weighted_count"]), - "weighted_count_delta": float(candidate_row["weighted_count"] - baseline_row["weighted_count"]), - } - ) - - baseline_bins = {row["agi_bin"]: row for row in baseline["mfs_high_agi_support"]} - mfs_rows = [] - for row in candidate["mfs_high_agi_support"]: - other = baseline_bins[row["agi_bin"]] - mfs_rows.append( - { - "agi_bin": row["agi_bin"], - "candidate_weighted_count": float(row["weighted_count"]), - 
"baseline_weighted_count": float(other["weighted_count"]), - "weighted_count_delta": float(row["weighted_count"] - other["weighted_count"]), - "candidate_weighted_agi": float(row["weighted_agi"]), - "baseline_weighted_agi": float(other["weighted_agi"]), - "weighted_agi_delta": float(row["weighted_agi"] - other["weighted_agi"]), - } - ) - - baseline_bins = {row["agi_bin"]: row for row in baseline["hoh_agi_support"]} - hoh_rows = [] - for row in candidate["hoh_agi_support"]: - other = baseline_bins[row["agi_bin"]] - hoh_rows.append( - { - "agi_bin": row["agi_bin"], - "candidate_weighted_count": float(row["weighted_count"]), - "baseline_weighted_count": float(other["weighted_count"]), - "weighted_count_delta": float(row["weighted_count"] - other["weighted_count"]), - "candidate_weighted_agi": float(row["weighted_agi"]), - "baseline_weighted_agi": float(other["weighted_agi"]), - "weighted_agi_delta": float(row["weighted_agi"] - other["weighted_agi"]), - } - ) - - def age_value_delta(name: str) -> list[dict]: - baseline_bins = {row["age_bucket"]: row for row in baseline[name]} - rows = [] - for row in candidate[name]: - other = baseline_bins[row["age_bucket"]] - rows.append( - { - "age_bucket": row["age_bucket"], - "candidate_weighted_positive": float(row["weighted_positive"]), - "baseline_weighted_positive": float(other["weighted_positive"]), - "weighted_positive_delta": float(row["weighted_positive"] - other["weighted_positive"]), - "candidate_value_sum": float(row["value_sum"]), - "baseline_value_sum": float(other["value_sum"]), - "value_sum_delta": float(row["value_sum"] - other["value_sum"]), - } - ) - return rows - - ssi_rows = age_value_delta("ssi_by_age") - for row in ssi_rows: - row["candidate_weighted_recipients"] = row.pop("candidate_weighted_positive") - row["baseline_weighted_recipients"] = row.pop("baseline_weighted_positive") - row["weighted_recipient_delta"] = row.pop("weighted_positive_delta") - row["candidate_ssi"] = row.pop("candidate_value_sum") - 
row["baseline_ssi"] = row.pop("baseline_value_sum") - row["ssi_delta"] = row.pop("value_sum_delta") - - medicare_part_b_rows = age_value_delta("medicare_part_b_premiums_by_age") - - all_states = sorted( - set(candidate["state_aca_ptc_spending"]) - | set(baseline["state_aca_ptc_spending"]) - ) - state_aca_ptc_rows = [] - for state in all_states: - candidate_row = candidate["state_aca_ptc_spending"].get( - state, - {"weighted_aca_ptc": 0.0, "weighted_positive_households": 0.0}, - ) - baseline_row = baseline["state_aca_ptc_spending"].get( - state, - {"weighted_aca_ptc": 0.0, "weighted_positive_households": 0.0}, - ) - state_aca_ptc_rows.append( - { - "state": state, - "candidate_weighted_aca_ptc": float(candidate_row["weighted_aca_ptc"]), - "baseline_weighted_aca_ptc": float(baseline_row["weighted_aca_ptc"]), - "weighted_aca_ptc_delta": float(candidate_row["weighted_aca_ptc"] - baseline_row["weighted_aca_ptc"]), - "candidate_weighted_positive_households": float(candidate_row["weighted_positive_households"]), - "baseline_weighted_positive_households": float(baseline_row["weighted_positive_households"]), - "weighted_positive_household_delta": float( - candidate_row["weighted_positive_households"] - - baseline_row["weighted_positive_households"] - ), - } - ) - state_aca_ptc_rows.sort( - key=lambda row: abs(row["weighted_aca_ptc_delta"]), - reverse=True, - ) - - all_states = sorted( - set(candidate["state_marketplace_enrollment"]) - | set(baseline["state_marketplace_enrollment"]) - ) - state_marketplace_rows = [] - for state in all_states: - candidate_row = candidate["state_marketplace_enrollment"].get( - state, - {"weighted_marketplace_enrollment": 0.0}, - ) - baseline_row = baseline["state_marketplace_enrollment"].get( - state, - {"weighted_marketplace_enrollment": 0.0}, - ) - state_marketplace_rows.append( - { - "state": state, - "candidate_weighted_marketplace_enrollment": float(candidate_row["weighted_marketplace_enrollment"]), - 
"baseline_weighted_marketplace_enrollment": float(baseline_row["weighted_marketplace_enrollment"]), - "weighted_marketplace_enrollment_delta": float( - candidate_row["weighted_marketplace_enrollment"] - - baseline_row["weighted_marketplace_enrollment"] - ), - } - ) - state_marketplace_rows.sort( - key=lambda row: abs(row["weighted_marketplace_enrollment_delta"]), - reverse=True, - ) - - all_states = sorted( - set(candidate["state_age_bucket_support"]) - | set(baseline["state_age_bucket_support"]) - ) - state_age_rows = [] - for state in all_states: - candidate_row = candidate["state_age_bucket_support"].get( - state, - {"bucket_weights": {}}, - ) - baseline_row = baseline["state_age_bucket_support"].get( - state, - {"bucket_weights": {}}, - ) - for label, _lower, _upper in AGE_BUCKETS: - candidate_weight = float(candidate_row["bucket_weights"].get(label, 0.0)) - baseline_weight = float(baseline_row["bucket_weights"].get(label, 0.0)) - state_age_rows.append( - { - "state": state, - "age_bucket": label, - "candidate_weight": candidate_weight, - "baseline_weight": baseline_weight, - "weight_delta": candidate_weight - baseline_weight, - } - ) - state_age_rows.sort(key=lambda row: abs(row["weight_delta"]), reverse=True) - - return { - "critical_input_support": critical_rows, - "filing_status_weighted_delta": filing_status_rows, - "mfs_high_agi_delta": mfs_rows, - "hoh_agi_delta": hoh_rows, - "ssi_by_age_delta": ssi_rows, - "medicare_part_b_premiums_by_age_delta": medicare_part_b_rows, - "state_aca_ptc_spending_top_gaps": state_aca_ptc_rows[:15], - "state_marketplace_enrollment_top_gaps": state_marketplace_rows[:15], - "state_age_bucket_top_gaps": state_age_rows[:20], - } - - -baseline = build_snapshot(BASELINE_DATASET) -results = [] -for candidate_dataset in CANDIDATE_DATASETS: - candidate = build_snapshot(candidate_dataset) - results.append( - { - "candidate_dataset": candidate_dataset, - "candidate": candidate, - "comparisons": compare_snapshots(candidate, baseline), 
- } - ) - -payload = { - "metric": "enhanced_cps_support_audit_batch", - "period": PERIOD, - "baseline_dataset": BASELINE_DATASET, - "baseline": baseline, - "results": results, -} -print(json.dumps(payload, sort_keys=True)) -""".strip() - - -@dataclass(frozen=True) -class PolicyEngineUSEnhancedCPSNativeScores: - """Exact enhanced-CPS native-loss comparison for one candidate/baseline pair.""" - - metric: str - period: int - candidate_dataset: str - baseline_dataset: str - candidate_enhanced_cps_native_loss: float - baseline_enhanced_cps_native_loss: float - enhanced_cps_native_loss_delta: float - candidate_unweighted_msre: float - baseline_unweighted_msre: float - unweighted_msre_delta: float - n_targets_total: int - n_targets_kept: int - n_targets_zero_dropped: int - n_targets_bad_dropped: int - n_national_targets: int - n_state_targets: int - candidate_weight_sum: float - baseline_weight_sum: float - family_breakdown: tuple[dict[str, Any], ...] = field(default_factory=tuple) - loss_config: dict[str, Any] | None = None - target_scope_filter: str | None = None - - def to_dict(self) -> dict[str, Any]: - return { - "metric": self.metric, - "period": self.period, - "candidate_dataset": self.candidate_dataset, - "baseline_dataset": self.baseline_dataset, - "candidate_enhanced_cps_native_loss": ( - self.candidate_enhanced_cps_native_loss - ), - "baseline_enhanced_cps_native_loss": ( - self.baseline_enhanced_cps_native_loss - ), - "enhanced_cps_native_loss_delta": self.enhanced_cps_native_loss_delta, - "candidate_unweighted_msre": self.candidate_unweighted_msre, - "baseline_unweighted_msre": self.baseline_unweighted_msre, - "unweighted_msre_delta": self.unweighted_msre_delta, - "n_targets_total": self.n_targets_total, - "n_targets_kept": self.n_targets_kept, - "n_targets_zero_dropped": self.n_targets_zero_dropped, - "n_targets_bad_dropped": self.n_targets_bad_dropped, - "n_national_targets": self.n_national_targets, - "n_state_targets": self.n_state_targets, - 
"candidate_weight_sum": self.candidate_weight_sum, - "baseline_weight_sum": self.baseline_weight_sum, - "family_breakdown": list(self.family_breakdown), - "loss_config": self.loss_config, - "target_scope_filter": self.target_scope_filter, - } - - @classmethod - def from_dict( - cls, payload: dict[str, Any] - ) -> PolicyEngineUSEnhancedCPSNativeScores: - return cls( - metric=str(payload["metric"]), - period=int(payload["period"]), - candidate_dataset=str(payload["candidate_dataset"]), - baseline_dataset=str(payload["baseline_dataset"]), - candidate_enhanced_cps_native_loss=float( - payload["candidate_enhanced_cps_native_loss"] - ), - baseline_enhanced_cps_native_loss=float( - payload["baseline_enhanced_cps_native_loss"] - ), - enhanced_cps_native_loss_delta=float( - payload["enhanced_cps_native_loss_delta"] - ), - candidate_unweighted_msre=float(payload["candidate_unweighted_msre"]), - baseline_unweighted_msre=float(payload["baseline_unweighted_msre"]), - unweighted_msre_delta=float(payload["unweighted_msre_delta"]), - n_targets_total=int(payload["n_targets_total"]), - n_targets_kept=int(payload["n_targets_kept"]), - n_targets_zero_dropped=int(payload["n_targets_zero_dropped"]), - n_targets_bad_dropped=int(payload["n_targets_bad_dropped"]), - n_national_targets=int(payload["n_national_targets"]), - n_state_targets=int(payload["n_state_targets"]), - candidate_weight_sum=float(payload["candidate_weight_sum"]), - baseline_weight_sum=float(payload["baseline_weight_sum"]), - family_breakdown=tuple(payload.get("family_breakdown", ())), - loss_config=payload.get("loss_config"), - target_scope_filter=payload.get("target_scope_filter"), - ) - - -PolicyEngineUSNativeBroadLossScore = PolicyEngineUSEnhancedCPSNativeScores - - -def resolve_policyengine_us_data_repo_root( - repo_root: str | Path | None = None, -) -> Path: - """Resolve the local policyengine-us-data checkout used for native scoring.""" - - candidates: list[Path] = [] - if repo_root is not None: - 
candidates.append(Path(repo_root)) - env_repo = os.environ.get(_PE_US_DATA_REPO_ENV) - if env_repo: - candidates.append(Path(env_repo)) - candidates.append(_DEFAULT_PE_US_DATA_REPO) - - for candidate in candidates: - resolved = candidate.expanduser().resolve() - if (resolved / "policyengine_us_data").exists(): - return resolved - searched = ", ".join(str(path.expanduser()) for path in candidates) - raise FileNotFoundError( - f"Could not resolve policyengine-us-data repo root. Searched: {searched}" - ) - - -def resolve_policyengine_us_data_python( - python_executable: str | Path | None = None, - *, - repo_root: str | Path | None = None, -) -> Path: - """Resolve a Python executable with policyengine-us-data installed.""" - - candidates: list[Path] = [] - if python_executable is not None: - candidates.append(Path(python_executable)) - env_python = os.environ.get(_PE_US_DATA_PYTHON_ENV) - if env_python: - candidates.append(Path(env_python)) - resolved_repo = resolve_policyengine_us_data_repo_root(repo_root) - candidates.extend( - ( - resolved_repo / ".venv" / "bin" / "python", - resolved_repo / "venv" / "bin" / "python", - ) - ) - - for candidate in candidates: - expanded = candidate.expanduser() - if expanded.exists() and os.access(expanded, os.X_OK): - return expanded - searched = ", ".join(str(path.expanduser()) for path in candidates) - raise FileNotFoundError( - "Could not resolve a usable policyengine-us-data Python executable. 
" - f"Searched: {searched}" - ) - - -def build_policyengine_us_data_pythonpath( - repo_root: str | Path | None = None, - *, - existing_pythonpath: str | None = None, -) -> str: - """Build the native-scoring PYTHONPATH for local PE-US-data checkouts.""" - - resolved_repo = resolve_policyengine_us_data_repo_root(repo_root) - microplex_src = Path(__file__).resolve().parents[2] - path_entries: list[str] = [str(resolved_repo), str(microplex_src)] - - sibling_microimpute = resolved_repo.parent / "microimpute" - if (sibling_microimpute / "microimpute").exists(): - path_entries.append(str(sibling_microimpute)) - - if existing_pythonpath: - path_entries.extend( - entry for entry in existing_pythonpath.split(os.pathsep) if entry - ) - return os.pathsep.join(path_entries) - - -def build_policyengine_us_data_subprocess_env( - repo_root: str | Path | None = None, - *, - base_env: dict[str, str] | None = None, -) -> dict[str, str]: - """Build a clean subprocess env for PE-native scoring helpers.""" - - source_env = dict(os.environ if base_env is None else base_env) - env = { - key: source_env[key] - for key in _PE_NATIVE_SCORE_BASE_ENV_VARS - if key in source_env and source_env[key] - } - env["PYTHONPATH"] = build_policyengine_us_data_pythonpath( - repo_root, - existing_pythonpath=source_env.get("PYTHONPATH"), - ) - return env - - -def compute_policyengine_us_enhanced_cps_native_scores( - candidate_dataset: str | Path, - baseline_dataset: str | Path, - *, - period: int = 2024, - policyengine_us_data_python: str | Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - target_scope_filter: str | None = None, - policyengine_targets_db_path: str | Path | None = None, -) -> PolicyEngineUSEnhancedCPSNativeScores: - """Score one candidate and baseline under the exact enhanced-CPS loss.""" - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - resolved_targets_db = ( - 
Path(policyengine_targets_db_path).expanduser().resolve() - if policyengine_targets_db_path is not None - else None - ) - if resolved_targets_db is not None and not resolved_targets_db.exists(): - raise FileNotFoundError(f"PolicyEngine target DB not found: {resolved_targets_db}") - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_BROAD_SCORE_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(candidate_dataset).expanduser().resolve()), - str(Path(baseline_dataset).expanduser().resolve()), - target_scope_filter or "", - str(resolved_targets_db) if resolved_targets_db is not None else "", - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native broad loss scoring failed: {detail}") - payload = json.loads(completed.stdout) - return PolicyEngineUSEnhancedCPSNativeScores.from_dict(payload) - - -def score_policyengine_us_native_broad_loss( - candidate_dataset: str | Path, - baseline_dataset: str | Path, - *, - period: int = 2024, - python_executable: str | Path | None = None, - repo_root: str | Path | None = None, - target_scope_filter: str | None = None, - policyengine_targets_db_path: str | Path | None = None, -) -> PolicyEngineUSEnhancedCPSNativeScores: - """Backward-compatible alias for the exact enhanced-CPS loss scorer.""" - return compute_policyengine_us_enhanced_cps_native_scores( - candidate_dataset, - baseline_dataset, - period=period, - policyengine_us_data_python=python_executable, - policyengine_us_data_repo=repo_root, - target_scope_filter=target_scope_filter, - 
policyengine_targets_db_path=policyengine_targets_db_path, - ) - - -def compute_us_pe_native_scores( - *, - candidate_dataset_path: str | Path, - baseline_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_scope_filter: str | None = None, - policyengine_targets_db_path: str | Path | None = None, -) -> dict[str, Any]: - """Build the saved manifest payload for PE-native broad scoring.""" - - score = compute_policyengine_us_enhanced_cps_native_scores( - candidate_dataset_path, - baseline_dataset_path, - period=period, - policyengine_us_data_python=policyengine_us_data_python, - policyengine_us_data_repo=policyengine_us_data_repo, - target_scope_filter=target_scope_filter, - policyengine_targets_db_path=policyengine_targets_db_path, - ) - return { - "metric": score.metric, - "period": score.period, - "summary": { - "candidate_enhanced_cps_native_loss": ( - score.candidate_enhanced_cps_native_loss - ), - "baseline_enhanced_cps_native_loss": ( - score.baseline_enhanced_cps_native_loss - ), - "enhanced_cps_native_loss_delta": score.enhanced_cps_native_loss_delta, - "candidate_beats_baseline": score.enhanced_cps_native_loss_delta < 0.0, - "candidate_unweighted_msre": score.candidate_unweighted_msre, - "baseline_unweighted_msre": score.baseline_unweighted_msre, - "unweighted_msre_delta": score.unweighted_msre_delta, - "n_targets_total": score.n_targets_total, - "n_targets_kept": score.n_targets_kept, - "n_targets_zero_dropped": score.n_targets_zero_dropped, - "n_targets_bad_dropped": score.n_targets_bad_dropped, - "n_national_targets": score.n_national_targets, - "n_state_targets": score.n_state_targets, - "loss_config": score.loss_config, - "target_scope_filter": score.target_scope_filter, - }, - "broad_loss": score.to_dict(), - "family_breakdown": list(score.family_breakdown), - } - - -def compute_batch_us_pe_native_scores( - *, - candidate_dataset_paths: 
list[str | Path] | tuple[str | Path, ...], - baseline_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - target_scope_filter: str | None = None, -) -> list[dict[str, Any]]: - """Score multiple candidates against one baseline in a single PE-native subprocess.""" - - if not candidate_dataset_paths: - return [] - started_at = perf_counter() - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_BROAD_BATCH_SCORE_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(baseline_dataset_path).expanduser().resolve()), - json.dumps( - [ - str(Path(candidate_path).expanduser().resolve()) - for candidate_path in candidate_dataset_paths - ] - ), - target_scope_filter or "", - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native batch broad loss scoring failed: {detail}") - payload = json.loads(completed.stdout) - elapsed_seconds = perf_counter() - started_at - results = [ - { - "metric": item["metric"], - "period": int(item["period"]), - "summary": { - "loss_config": item.get("loss_config"), - "candidate_enhanced_cps_native_loss": float( - item["candidate_enhanced_cps_native_loss"] - ), - "baseline_enhanced_cps_native_loss": float( - item["baseline_enhanced_cps_native_loss"] - ), - "enhanced_cps_native_loss_delta": float( - 
item["enhanced_cps_native_loss_delta"] - ), - "candidate_beats_baseline": bool(item["candidate_beats_baseline"]), - "candidate_unweighted_msre": float(item["candidate_unweighted_msre"]), - "baseline_unweighted_msre": float(item["baseline_unweighted_msre"]), - "unweighted_msre_delta": float(item["unweighted_msre_delta"]), - "n_targets_total": int(item["n_targets_total"]), - "n_targets_kept": int(item["n_targets_kept"]), - "n_targets_zero_dropped": int(item["n_targets_zero_dropped"]), - "n_targets_bad_dropped": int(item["n_targets_bad_dropped"]), - "n_national_targets": int(item["n_national_targets"]), - "n_state_targets": int(item["n_state_targets"]), - "target_scope_filter": item.get("target_scope_filter"), - }, - "broad_loss": { - "metric": item["metric"], - "loss_config": item.get("loss_config"), - "period": int(item["period"]), - "candidate_dataset": str(item["candidate_dataset"]), - "baseline_dataset": str(item["baseline_dataset"]), - "candidate_enhanced_cps_native_loss": float( - item["candidate_enhanced_cps_native_loss"] - ), - "baseline_enhanced_cps_native_loss": float( - item["baseline_enhanced_cps_native_loss"] - ), - "enhanced_cps_native_loss_delta": float( - item["enhanced_cps_native_loss_delta"] - ), - "candidate_beats_baseline": bool(item["candidate_beats_baseline"]), - "candidate_unweighted_msre": float(item["candidate_unweighted_msre"]), - "baseline_unweighted_msre": float(item["baseline_unweighted_msre"]), - "unweighted_msre_delta": float(item["unweighted_msre_delta"]), - "n_targets_total": int(item["n_targets_total"]), - "n_targets_kept": int(item["n_targets_kept"]), - "n_targets_zero_dropped": int(item["n_targets_zero_dropped"]), - "n_targets_bad_dropped": int(item["n_targets_bad_dropped"]), - "n_national_targets": int(item["n_national_targets"]), - "n_state_targets": int(item["n_state_targets"]), - "target_scope_filter": item.get("target_scope_filter"), - "candidate_weight_sum": float(item["candidate_weight_sum"]), - "baseline_weight_sum": 
float(item["baseline_weight_sum"]), - "family_breakdown": list(item.get("family_breakdown", [])), - }, - "family_breakdown": list(item.get("family_breakdown", [])), - } - for item in payload - ] - for item in results: - item["timing"] = { - "batch_elapsed_seconds": float(elapsed_seconds), - "batch_candidate_count": len(candidate_dataset_paths), - } - return results - - -@dataclass(frozen=True) -class PENativeTargetLookupKey: - """Structured lookup key for a legacy PE-native target label.""" - - variable: str - count_children: int - agi_lower: float - agi_upper: float - - def as_tuple(self) -> tuple[str, int, float, float]: - return (self.variable, self.count_children, self.agi_lower, self.agi_upper) - - @staticmethod - def _json_safe_bound(value: float) -> float | str: - if value == float("inf"): - return "inf" - if value == float("-inf"): - return "-inf" - return value - - def expected_constraints(self) -> list[dict[str, str | float | int]]: - if self.count_children < 3: - child_constraint: dict[str, str | float | int] = { - "variable": "eitc_child_count", - "operation": "==", - "value": self.count_children, - } - else: - child_constraint = { - "variable": "eitc_child_count", - "operation": ">", - "value": 2, - } - return [ - {"variable": "tax_unit_is_filer", "operation": "==", "value": 1}, - {"variable": "eitc", "operation": ">", "value": 0}, - child_constraint, - { - "variable": "adjusted_gross_income", - "operation": ">=", - "value": self._json_safe_bound(self.agi_lower), - }, - { - "variable": "adjusted_gross_income", - "operation": "<", - "value": self._json_safe_bound(self.agi_upper), - }, - ] - - def expected_target(self) -> dict[str, Any]: - return { - "variable": self.variable, - "geo_level": "national", - "geographic_id": "US", - "domain_variable": _EITC_AGI_CHILD_DOMAIN_VARIABLE, - "constraints": self.expected_constraints(), - } - - -def _parse_pe_native_numeric_token(token: str) -> float: - if token == "-inf": - return float("-inf") - if token == 
"inf": - return float("inf") - multipliers = { - "bn": 1_000_000_000.0, - "m": 1_000_000.0, - "k": 1_000.0, - } - for suffix, multiplier in multipliers.items(): - if token.endswith(suffix): - return float(token[: -len(suffix)]) * multiplier - return float(token) - - -def parse_pe_native_target_lookup_key( - target_name: str, -) -> PENativeTargetLookupKey | None: - """Parse PE-native labels that now have structured DB equivalents.""" - - match = _EITC_AGI_CHILD_LABEL.match(target_name) - if match is None: - return None - metric = match.group("metric") - variable = "tax_unit_count" if metric == "returns" else "eitc" - return PENativeTargetLookupKey( - variable=variable, - count_children=int(match.group("count_children")), - agi_lower=_parse_pe_native_numeric_token(match.group("agi_lower")), - agi_upper=_parse_pe_native_numeric_token(match.group("agi_upper")), - ) - - -def _constraint_value_as_float(value: str) -> float | None: - try: - return float(value) - except (TypeError, ValueError): - return None - - -def _target_lookup_key_from_policyengine_target( - target: Any, -) -> tuple[str, int, float, float] | None: - if target.geo_level != "national": - return None - if target.variable not in {"eitc", "tax_unit_count"}: - return None - if target.domain_variable != _EITC_AGI_CHILD_DOMAIN_VARIABLE: - return None - - agi_lower: float | None = None - agi_upper: float | None = None - count_children: int | None = None - has_eitc_positive_constraint = False - - for constraint in target.constraints: - value = str(constraint.value) - numeric_value = _constraint_value_as_float(value) - if ( - constraint.variable == "adjusted_gross_income" - and constraint.operation == ">=" - and numeric_value is not None - ): - agi_lower = numeric_value - elif ( - constraint.variable == "adjusted_gross_income" - and constraint.operation == "<" - and numeric_value is not None - ): - agi_upper = numeric_value - elif constraint.variable == "eitc" and constraint.operation == ">": - 
has_eitc_positive_constraint = numeric_value == 0 - elif constraint.variable == "eitc_child_count" and numeric_value is not None: - if constraint.operation == "==": - count_children = int(numeric_value) - elif constraint.operation == ">" and numeric_value == 2: - count_children = 3 - elif constraint.operation == ">=" and numeric_value == 3: - count_children = 3 - - if ( - agi_lower is None - or agi_upper is None - or count_children is None - or not has_eitc_positive_constraint - ): - return None - return (target.variable, count_children, agi_lower, agi_upper) - - -def _policyengine_target_payload(target: Any) -> dict[str, Any]: - return { - "target_id": target.target_id, - "variable": target.variable, - "period": target.period, - "value": target.value, - "source": target.source, - "notes": target.notes, - "geo_level": target.geo_level, - "geographic_id": target.geographic_id, - "domain_variable": target.domain_variable, - "constraints": [ - { - "variable": constraint.variable, - "operation": constraint.operation, - "value": constraint.value, - } - for constraint in target.constraints - ], - } - - -def _load_policyengine_target_match_index( - target_db_path: str | Path, - *, - period: int, -) -> dict[tuple[str, int, float, float], list[dict[str, Any]]]: - from microplex_us.policyengine.us import PolicyEngineUSDBTargetProvider - - provider = PolicyEngineUSDBTargetProvider(target_db_path, validate=False) - targets = provider.load_targets( - period=period, - variables=["eitc", "tax_unit_count"], - domain_variable_values=[_EITC_AGI_CHILD_DOMAIN_VARIABLE], - geo_levels=["national"], - ) - matches: dict[tuple[str, int, float, float], list[dict[str, Any]]] = {} - for target in targets: - key = _target_lookup_key_from_policyengine_target(target) - if key is None: - continue - matches.setdefault(key, []).append(_policyengine_target_payload(target)) - return matches - - -def _default_policyengine_targets_db_path( - policyengine_us_data_repo: str | Path | None, -) -> Path | 
None: - try: - repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - except FileNotFoundError: - return None - path = repo / "policyengine_us_data" / "storage" / "calibration" / "policy_data.db" - return path if path.exists() else None - - -def annotate_pe_native_target_db_matches( - payload: dict[str, Any], - *, - target_db_path: str | Path | None, - period: int, -) -> dict[str, Any]: - """Attach structured PolicyEngine target DB matches to diagnostic rows.""" - - rows = list(payload.get("targets") or []) - resolved_db_path = Path(target_db_path).expanduser() if target_db_path else None - match_index: dict[tuple[str, int, float, float], list[dict[str, Any]]] = {} - target_db_error = None - if resolved_db_path is not None and resolved_db_path.exists(): - try: - match_index = _load_policyengine_target_match_index( - resolved_db_path, - period=period, - ) - except Exception as exc: # pragma: no cover - defensive diagnostic path - target_db_error = str(exc) - - counts = { - "matched": 0, - "legacy_only": 0, - "unparsed": 0, - "ambiguous": 0, - "db_unavailable": 0, - } - annotations_by_name: dict[str, dict[str, Any]] = {} - for row in rows: - target_name = str(row.get("target_name", "")) - key = parse_pe_native_target_lookup_key(target_name) - if key is None: - annotation: dict[str, Any] = {"policyengine_target_match": "unparsed"} - elif ( - resolved_db_path is None or not resolved_db_path.exists() or target_db_error - ): - annotation = { - "policyengine_target_match": "db_unavailable", - "policyengine_target_expected": key.expected_target(), - } - else: - matches = match_index.get(key.as_tuple(), []) - if len(matches) == 1: - match = matches[0] - annotation = { - "policyengine_target_match": "matched", - "policyengine_target_id": match["target_id"], - "policyengine_target_variable": match["variable"], - "policyengine_target_period": match["period"], - "policyengine_target_value": match["value"], - "policyengine_target_source": match["source"], - 
"policyengine_target_geo_level": match["geo_level"], - "policyengine_target_geographic_id": match["geographic_id"], - "policyengine_target_domain_variable": match["domain_variable"], - "policyengine_target_constraints": match["constraints"], - } - elif len(matches) > 1: - annotation = { - "policyengine_target_match": "ambiguous", - "policyengine_target_match_count": len(matches), - "policyengine_target_matches": matches, - "policyengine_target_expected": key.expected_target(), - } - else: - annotation = { - "policyengine_target_match": "legacy_only", - "policyengine_target_expected": key.expected_target(), - } - counts[annotation["policyengine_target_match"]] += 1 - row.update(annotation) - annotations_by_name[target_name] = annotation - - for list_name in ("top_improvements", "top_regressions"): - for row in payload.get(list_name) or []: - annotation = annotations_by_name.get(str(row.get("target_name", ""))) - if annotation: - row.update(annotation) - - parsed_total = counts["matched"] + counts["legacy_only"] + counts["ambiguous"] - payload["target_db_summary"] = { - "target_db_path": str(resolved_db_path) if resolved_db_path else None, - "target_db_error": target_db_error, - **counts, - "parsed_targets": parsed_total, - "match_rate": counts["matched"] / parsed_total if parsed_total else None, - } - return payload - - -def compare_us_pe_native_target_deltas( - *, - from_dataset_path: str | Path, - to_dataset_path: str | Path, - period: int = 2024, - top_k: int = 25, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> dict[str, Any]: - """Compare per-target PE-native weighted-loss terms between two datasets.""" - - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", 
"run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_TARGET_DELTA_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(from_dataset_path).expanduser().resolve()), - str(Path(to_dataset_path).expanduser().resolve()), - str(int(top_k)), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native target delta comparison failed: {detail}") - return json.loads(completed.stdout) - - -def compute_batch_us_pe_native_target_deltas( - *, - candidate_dataset_paths: list[str | Path] | tuple[str | Path, ...], - baseline_dataset_path: str | Path, - period: int = 2024, - top_k: int = 25, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> list[dict[str, Any]]: - """Compare PE-native weighted-loss targets for many candidates against one baseline.""" - - if not candidate_dataset_paths: - return [] - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_TARGET_DELTA_BATCH_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(baseline_dataset_path).expanduser().resolve()), - json.dumps( - [ - str(Path(candidate_path).expanduser().resolve()) - for candidate_path in candidate_dataset_paths - ] - ), - str(int(top_k)), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, 
- check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native batch target delta comparison failed: {detail}") - return list(json.loads(completed.stdout)) - - -def compute_us_pe_native_support_audit( - *, - candidate_dataset_path: str | Path, - baseline_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> dict[str, Any]: - """Compare candidate vs baseline structural support on selected PE surfaces.""" - - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_SUPPORT_AUDIT_SCRIPT, - str(resolved_repo), - str(int(period)), - str(Path(candidate_dataset_path).expanduser().resolve()), - str(Path(baseline_dataset_path).expanduser().resolve()), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native support audit failed: {detail}") - return json.loads(completed.stdout) - - -def compute_batch_us_pe_native_support_audits( - *, - candidate_dataset_paths: list[str | Path] | tuple[str | Path, ...], - baseline_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> list[dict[str, Any]]: - """Compare PE support structure for 
many candidates against one baseline.""" - - if not candidate_dataset_paths: - return [] - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - if policyengine_us_data_python is not None: - command = [str(Path(policyengine_us_data_python).expanduser())] - else: - command = ["uv", "run", "--project", str(resolved_repo), "python"] - completed = subprocess.run( - [ - *command, - "-c", - _PE_NATIVE_SUPPORT_AUDIT_BATCH_SCRIPT, - str(resolved_repo), - str(int(period)), - str(Path(baseline_dataset_path).expanduser().resolve()), - json.dumps( - [ - str(Path(candidate_path).expanduser().resolve()) - for candidate_path in candidate_dataset_paths - ] - ), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"PE-native batch support audit failed: {detail}") - - payload = json.loads(completed.stdout) - baseline_dataset = str(payload["baseline_dataset"]) - baseline_snapshot = payload["baseline"] - period_value = int(payload["period"]) - return [ - { - "metric": "enhanced_cps_support_audit", - "period": period_value, - "candidate_dataset": str(item["candidate_dataset"]), - "baseline_dataset": baseline_dataset, - "candidate": item["candidate"], - "baseline": baseline_snapshot, - "comparisons": item["comparisons"], - } - for item in payload.get("results", ()) - ] - - -def write_us_pe_native_scores( - output_path: str | Path, - *, - candidate_dataset_path: str | Path, - baseline_dataset_path: str | Path, - period: int = 2024, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> Path: - """Write PE-native broad scoring payload to disk.""" - - payload = compute_us_pe_native_scores( - 
candidate_dataset_path=candidate_dataset_path, - baseline_dataset_path=baseline_dataset_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text( - json.dumps(payload, indent=2, sort_keys=True, allow_nan=False) - ) - return destination - - -def write_us_pe_native_target_diagnostics( - output_path: str | Path, - *, - from_dataset_path: str | Path, - to_dataset_path: str | Path, - period: int = 2024, - top_k: int = 50, - from_label: str = "policyengine-us-data", - to_label: str = "microplex-us", - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - policyengine_targets_db_path: str | Path | None = None, - artifact_id: str | None = None, - run_id: str | None = None, -) -> Path: - """Write the full PE-native per-target diagnostic dataset to disk.""" - - payload = build_us_pe_native_target_diagnostics_payload( - from_dataset_path=from_dataset_path, - to_dataset_path=to_dataset_path, - period=period, - top_k=top_k, - from_label=from_label, - to_label=to_label, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - policyengine_targets_db_path=policyengine_targets_db_path, - artifact_id=artifact_id, - run_id=run_id, - ) - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return destination - - -def build_us_pe_native_target_diagnostics_payload( - *, - from_dataset_path: str | Path | None = None, - to_dataset_path: str | Path | None = None, - period: int = 2024, - top_k: int = 50, - from_label: str = "policyengine-us-data", - to_label: str = "microplex-us", - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | 
Path | None = None, - policyengine_targets_db_path: str | Path | None = None, - target_delta_payload: dict[str, Any] | None = None, - artifact_id: str | None = None, - run_id: str | None = None, -) -> dict[str, Any]: - """Build the full PE-native per-target diagnostic payload. - - When ``target_delta_payload`` is supplied, the caller is responsible for - ensuring it compares the same baseline/candidate datasets and period. - """ - - payload = ( - dict(target_delta_payload) - if target_delta_payload is not None - else compare_us_pe_native_target_deltas( - from_dataset_path=_required_dataset_path( - from_dataset_path, - label="from_dataset_path", - ), - to_dataset_path=_required_dataset_path( - to_dataset_path, - label="to_dataset_path", - ), - period=period, - top_k=top_k, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - payload["diagnostic_schema_version"] = 1 - payload["dataset_labels"] = { - "from": from_label, - "to": to_label, - } - resolved_artifact_id = _first_present( - artifact_id, - payload.get("artifact_id"), - payload.get("artifactId"), - ) - resolved_run_id = _first_present(run_id, payload.get("run_id"), payload.get("runId")) - payload.setdefault("artifact_id", resolved_artifact_id) - payload.setdefault("run_id", resolved_run_id) - payload.setdefault("baseline_dataset", payload.get("from_dataset")) - payload.setdefault("candidate_dataset", payload.get("to_dataset")) - target_db_path = ( - Path(policyengine_targets_db_path).expanduser() - if policyengine_targets_db_path is not None - else _default_policyengine_targets_db_path(policyengine_us_data_repo) - ) - annotate_pe_native_target_db_matches( - payload, - target_db_path=target_db_path, - period=period, - ) - _add_policyengine_target_diagnostic_aliases(payload) - return payload - - -def _required_dataset_path(value: str | Path | None, *, label: str) -> str | Path: - if value is None: - raise ValueError( - f"{label} is required 
when target_delta_payload is not supplied" - ) - return value - - -def _add_policyengine_target_diagnostic_aliases(payload: dict[str, Any]) -> None: - """Add dashboard-friendly aliases while preserving the native delta schema.""" - - context = _PolicyEngineTargetDiagnosticAliasContext.from_payload(payload) - targets_by_name: dict[str, dict[str, Any]] = {} - for row in payload.get("targets", ()): - if not isinstance(row, dict): - continue - _add_policyengine_target_diagnostic_aliases_to_row(row, context) - target_name = row.get("target_name") - if target_name is not None: - targets_by_name[str(target_name)] = row - - for list_name in ("top_improvements", "top_regressions"): - for row in payload.get(list_name) or []: - if not isinstance(row, dict): - continue - full_row = targets_by_name.get(str(row.get("target_name", ""))) - if full_row is not None: - for key, value in full_row.items(): - row.setdefault(key, value) - _add_policyengine_target_diagnostic_aliases_to_row(row, context) - - -@dataclass(frozen=True) -class _PolicyEngineTargetDiagnosticAliasContext: - baseline_dataset: Any - candidate_dataset: Any - baseline_label: Any - candidate_label: Any - period: Any - artifact_id: Any - run_id: Any - - @classmethod - def from_payload( - cls, - payload: dict[str, Any], - ) -> _PolicyEngineTargetDiagnosticAliasContext: - return cls( - baseline_dataset=payload.get("from_dataset"), - candidate_dataset=payload.get("to_dataset"), - baseline_label=payload.get("dataset_labels", {}).get("from"), - candidate_label=payload.get("dataset_labels", {}).get("to"), - period=payload.get("period"), - artifact_id=payload.get("artifact_id") or payload.get("artifactId"), - run_id=payload.get("run_id") or payload.get("runId"), - ) - - -def _add_policyengine_target_diagnostic_aliases_to_row( - row: dict[str, Any], - context: _PolicyEngineTargetDiagnosticAliasContext, -) -> None: - expected_target = _expected_policyengine_target(row) - target_name = str(row.get("target_name") or "") - 
target_value = row.get("target_value") - from_estimate = row.get("from_estimate") - to_estimate = row.get("to_estimate") - from_absolute_error = _absolute_error_or_none(from_estimate, target_value) - to_absolute_error = _absolute_error_or_none(to_estimate, target_value) - row.setdefault( - "target_id", - row.get("policyengine_target_id") or row.get("target_name"), - ) - row.setdefault( - "period", - _first_present(row.get("policyengine_target_period"), context.period), - ) - row.setdefault( - "variable", - _first_present( - row.get("policyengine_target_variable"), - expected_target.get("variable"), - _infer_target_variable(target_name, row), - ), - ) - row.setdefault( - "geo_level", - _first_present( - row.get("policyengine_target_geo_level"), - expected_target.get("geo_level"), - _infer_target_geo_level(target_name, row), - ), - ) - row.setdefault( - "geography", - _first_present( - row.get("policyengine_target_geographic_id"), - expected_target.get("geographic_id"), - _infer_target_geography(target_name, row), - ), - ) - row.setdefault("state", _infer_target_state(target_name, row)) - row.setdefault( - "entity", - _infer_policyengine_target_entity(target_name, row, expected_target), - ) - row.setdefault("artifact_id", context.artifact_id) - row.setdefault("run_id", context.run_id) - row.setdefault("baseline_dataset", context.baseline_dataset) - row.setdefault("candidate_dataset", context.candidate_dataset) - row.setdefault("baseline_label", context.baseline_label) - row.setdefault("candidate_label", context.candidate_label) - row.setdefault("us_data_aggregate", from_estimate) - row.setdefault("microplex_aggregate", to_estimate) - row.setdefault("us_data_absolute_error", from_absolute_error) - row.setdefault("microplex_absolute_error", to_absolute_error) - row.setdefault("us_data_relative_error", row.get("from_rel_error")) - row.setdefault("microplex_relative_error", row.get("to_rel_error")) - if from_absolute_error is not None and to_absolute_error is not None: - 
row.setdefault( - "delta_absolute_error", - to_absolute_error - from_absolute_error, - ) - row.setdefault( - "delta_relative_error", - _delta_or_none(row.get("to_rel_error"), row.get("from_rel_error")), - ) - row.setdefault("us_data_loss_contribution", row.get("from_weighted_term")) - row.setdefault( - "policyengine_us_data_loss_contribution", - row.get("from_weighted_term"), - ) - row.setdefault("baseline_loss_contribution", row.get("from_weighted_term")) - row.setdefault("microplex_loss_contribution", row.get("to_weighted_term")) - row.setdefault("candidate_loss_contribution", row.get("to_weighted_term")) - row.setdefault("loss_contribution", row.get("to_weighted_term")) - row.setdefault("loss_contribution_delta", row.get("weighted_term_delta")) - row.setdefault("family", _infer_target_family(target_name, row)) - row.setdefault("in_loss", True) - row.setdefault("supported_by_microplex", True) - - -def _expected_policyengine_target(row: dict[str, Any]) -> dict[str, Any]: - expected = row.get("policyengine_target_expected") - return expected if isinstance(expected, dict) else {} - - -def _first_present(*values: Any) -> Any: - for value in values: - if value is not None: - return value - return None - - -def _target_name_parts(target_name: str) -> list[str]: - return [part for part in target_name.split("/") if part] - - -def _infer_target_variable(target_name: str, row: dict[str, Any]) -> str | None: - parts = _target_name_parts(target_name) - if not parts: - return None - - if target_name.endswith("/snap-cost"): - return "snap_cost" - if target_name.endswith("/snap-hhs"): - return "snap_households" - - if parts[0] == "nation": - return _infer_national_target_variable(parts) - if parts[0] == "state": - return _infer_state_target_variable(parts) - - family = row.get("target_family") - return str(family) if family not in {None, "other"} else None - - -def _infer_target_family(target_name: str, row: dict[str, Any]) -> str | None: - family = row.get("target_family") - 
if family not in {None, "other"}: - return str(family) - if target_name.startswith("state/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - return str(family) if family is not None else None - - -def _infer_national_target_variable(parts: list[str]) -> str | None: - if len(parts) < 2: - return None - source = parts[1] - if source == "irs" and len(parts) >= 3: - metric = parts[2] - if metric == "adjusted gross income": - return "adjusted_gross_income" - if metric == "count": - return "tax_unit_count" - return _slugify_target_token(metric) - if source == "census" and len(parts) >= 3: - metric = parts[2] - if metric.startswith("agi_in_spm_threshold_decile_"): - return "agi_in_spm_threshold_decile" - if metric.startswith("count_in_spm_threshold_decile_"): - return "count_in_spm_threshold_decile" - if metric == "population_by_age": - return "population" - return _slugify_target_token(metric) - if source == "gov" and len(parts) >= 3: - return _slugify_target_token(parts[2]) - if source == "cbo" and len(parts) >= 3: - if parts[2] == "income_by_source" and len(parts) >= 4: - return _slugify_target_token(parts[3]) - return _slugify_target_token(parts[2]) - if source in {"soi", "hhs"} and len(parts) >= 3: - return _slugify_target_token(parts[2]) - if source in {"jct", "net_worth", "ssa"}: - return source - return _slugify_target_token(source) - - -def _infer_state_target_variable(parts: list[str]) -> str | None: - if len(parts) < 2: - return None - source_or_state = parts[1] - if source_or_state == "irs" and len(parts) >= 3: - return _slugify_target_token(parts[2]) - if source_or_state == "census" and len(parts) >= 3: - metric = parts[2] - if metric == "population_by_state": - return "population" - if metric == "population_under_5_by_state": - 
return "population_under_5" - return _slugify_target_token(metric) - if source_or_state == "real_estate_taxes": - return "real_estate_taxes" - if _looks_like_state_code(source_or_state) and len(parts) >= 3: - return _slugify_target_token(parts[2]) - return _slugify_target_token(source_or_state) - - -def _slugify_target_token(value: str) -> str: - return ( - value.strip() - .lower() - .replace(" ", "_") - .replace("-", "_") - .replace("/", "_") - ) - - -def _infer_target_geo_level(target_name: str, row: dict[str, Any]) -> str | None: - scope = row.get("target_scope") - if scope in {"national", "state"}: - return str(scope) - parts = _target_name_parts(target_name) - if not parts: - return None - if parts[0] == "nation": - return "national" - if parts[0] == "state": - return "state" - return None - - -def _infer_target_geography(target_name: str, row: dict[str, Any]) -> str | None: - geo_level = _infer_target_geo_level(target_name, row) - if geo_level == "national": - return "US" - state = _infer_target_state(target_name, row) - return state - - -def _infer_target_state(target_name: str, row: dict[str, Any]) -> str | None: - geography = _first_present( - row.get("policyengine_target_geographic_id"), - _expected_policyengine_target(row).get("geographic_id"), - ) - if isinstance(geography, str) and geography != "US": - return _normalize_state_code(geography) - parts = _target_name_parts(target_name) - if parts and parts[0] == "state": - for token in parts[1:]: - if _looks_like_state_code(token): - return _normalize_state_code(token) - if parts and _looks_like_state_fips_id(parts[0]): - return parts[0] - return None - - -def _looks_like_state_code(value: str) -> bool: - return len(value) == 2 and value.isalpha() - - -def _looks_like_state_fips_id(value: str) -> bool: - return len(value) == 4 and value.startswith("US") and value[2:].isdigit() - - -def _normalize_state_code(value: str) -> str: - return value.upper() if _looks_like_state_code(value) else value - - -def 
_infer_policyengine_target_entity( - target_name: str, - row: dict[str, Any], - expected_target: dict[str, Any], -) -> str | None: - variable = _first_present( - row.get("policyengine_target_variable"), - expected_target.get("variable"), - row.get("variable"), - ) - domain_variable = _first_present( - row.get("policyengine_target_domain_variable"), - expected_target.get("domain_variable"), - ) - if _contains_entity_hint("tax_unit", variable, domain_variable): - return "tax_unit" - if _contains_entity_hint("spm_unit", variable, domain_variable): - return "spm_unit" - if _contains_entity_hint("household", variable, domain_variable): - return "household" - - parts = _target_name_parts(target_name) - if "irs" in parts or "jct" in parts: - return "tax_unit" - if "soi" in parts: - return "tax_unit" - if "cbo" in parts: - if "snap" in parts: - return "household" - if "ssi" in parts or "social_security" in parts: - return "person" - return "tax_unit" - if "hhs" in parts: - return "person" - if "census" in parts: - return "person" - family = row.get("target_family") - if family in { - "state_agi_distribution", - "national_irs_other", - "national_tax_expenditures", - "state_aca_enrollment", - "state_aca_spending", - }: - return "tax_unit" - if family in { - "state_age_distribution", - "state_population", - "state_population_under_5", - "national_population_by_age", - "national_infants", - "national_census_other", - "national_ssa", - }: - return "person" - if family in { - "national_spm_threshold_agi", - "national_spm_threshold_count", - }: - return "spm_unit" - if family in { - "state_real_estate_taxes", - "national_net_worth", - }: - return "household" - if "snap-hhs" in target_name: - return "household" - if "snap-cost" in target_name: - return "household" - if _contains_entity_hint("aca", variable, target_name): - return "tax_unit" - if "spm-unit" in target_name: - return "spm_unit" - return None - - -def _contains_entity_hint(entity: str, *values: Any) -> bool: - return 
any(entity in str(value) for value in values if value is not None) - - -def _absolute_error_or_none(value: Any, target: Any) -> float | None: - if value is None or target is None: - return None - return abs(float(value) - float(target)) - - -def _delta_or_none(value: Any, baseline: Any) -> float | None: - if value is None or baseline is None: - return None - return float(value) - float(baseline) - - -def main(argv: list[str] | None = None) -> int: - """CLI for exact broad PE-native loss scoring.""" - - parser = argparse.ArgumentParser( - description="Score a candidate and baseline under PE-US's enhanced-CPS native loss." - ) - parser.add_argument("--candidate-dataset", required=True) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--policyengine-us-data-python") - parser.add_argument("--policyengine-us-data-repo") - args = parser.parse_args(argv) - - score = compute_policyengine_us_enhanced_cps_native_scores( - args.candidate_dataset, - args.baseline_dataset, - period=args.period, - policyengine_us_data_python=args.policyengine_us_data_python, - policyengine_us_data_repo=args.policyengine_us_data_repo, - ) - print(json.dumps(score.to_dict(), indent=2, sort_keys=True)) - return 0 - - -def main_target_diagnostics(argv: list[str] | None = None) -> int: - """CLI for full PE-native per-target diagnostics.""" - - parser = argparse.ArgumentParser( - description=( - "Write a full per-target PE-native diagnostic JSON comparing a " - "baseline dataset to a Microplex candidate." 
- ) - ) - parser.add_argument("--from-dataset", required=True) - parser.add_argument("--to-dataset", required=True) - parser.add_argument("--output-path", required=True) - parser.add_argument("--period", type=int, default=2024) - parser.add_argument("--top-k", type=int, default=50) - parser.add_argument("--from-label", default="policyengine-us-data") - parser.add_argument("--to-label", default="microplex-us") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-targets-db") - parser.add_argument("--artifact-id") - parser.add_argument("--run-id") - args = parser.parse_args(argv) - - path = write_us_pe_native_target_diagnostics( - args.output_path, - from_dataset_path=args.from_dataset, - to_dataset_path=args.to_dataset, - period=args.period, - top_k=args.top_k, - from_label=args.from_label, - to_label=args.to_label, - policyengine_us_data_python=args.policyengine_us_data_python, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_targets_db_path=args.policyengine_targets_db, - artifact_id=args.artifact_id, - run_id=args.run_id, - ) - print(str(path)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main(sys.argv[1:])) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild.py b/src/microplex_us/pipelines/pe_us_data_rebuild.py deleted file mode 100644 index 63c5298c..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild.py +++ /dev/null @@ -1,528 +0,0 @@ -"""Program spec for rebuilding the PE-US-data pipeline inside Microplex.""" - -from __future__ import annotations - -from dataclasses import asdict, dataclass, replace -from enum import Enum -from pathlib import Path -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from microplex.core import SourceProvider - - from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline - - -class PEUSDataRebuildStatus(str, Enum): - """Parity-rebuild status for 
one PE-US-data stage.""" - - NOT_STARTED = "not_started" - PARTIAL = "partial" - CLOSE = "close" - EXACT = "exact" - INTENTIONALLY_DIFFERENT = "intentionally_different" - - -@dataclass(frozen=True) -class PEUSDataRebuildStage: - """One stage in the PE-US-data rebuild program.""" - - stage_id: str - title: str - goal: str - pe_owner_modules: tuple[str, ...] - microplex_owner_modules: tuple[str, ...] - parity_contract: str - current_status: PEUSDataRebuildStatus - notes: str - next_steps: tuple[str, ...] = () - - def to_dict(self) -> dict[str, Any]: - payload = asdict(self) - payload["current_status"] = self.current_status.value - return payload - - -@dataclass(frozen=True) -class PEUSDataRebuildProgram: - """Durable spec for the architecture-first PE-US-data rebuild track.""" - - program_id: str - title: str - objective: str - principle: str - stages: tuple[PEUSDataRebuildStage, ...] - - def to_dict(self) -> dict[str, Any]: - return { - "program_id": self.program_id, - "title": self.title, - "objective": self.objective, - "principle": self.principle, - "stages": [stage.to_dict() for stage in self.stages], - } - - -def default_policyengine_us_data_rebuild_config( - **overrides: Any, -) -> USMicroplexBuildConfig: - """Return the incumbent-parity runtime config for the PE-US-data rebuild.""" - - from microplex_us.pipelines.us import USMicroplexBuildConfig - - defaults = USMicroplexBuildConfig( - synthesis_backend="seed", - calibration_backend="entropy", - policyengine_calibration_min_active_households=20, - policyengine_calibration_deferred_stage_min_active_households=(10, 1), - policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error=None, - policyengine_calibration_deferred_stage_top_family_count=7, - policyengine_calibration_deferred_stage_top_geography_count=4, - donor_imputer_backend="regime_aware", - donor_imputer_condition_selection="pe_prespecified", - donor_imputer_qrf_zero_threshold=0.05, - donor_imputer_excluded_variables=(), - 
puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - prefer_cached_cps_asec_source=False, - policyengine_direct_override_variables=( - "health_savings_account_ald", - "non_sch_d_capital_gains", - ), - policyengine_prefer_existing_tax_unit_ids=True, - ) - config = replace(defaults, **overrides) - if ( - config.puf_support_clone_enabled - and config.donor_imputer_backend != "regime_aware" - ): - raise ValueError( - "PE-US-data PUF support clone rebuilds require " - "donor_imputer_backend='regime_aware' so release candidates use " - "MicroImpute chained donor imputations. Set " - "puf_support_clone_enabled=False for legacy imputer experiments." - ) - return config - - -def default_policyengine_us_data_rebuild_source_providers( - *, - cps_source_year: int = 2023, - cps_cache_dir: str | Path | None = None, - cps_download: bool = True, - puf_target_year: int = 2024, - puf_cps_reference_year: int | None = None, - puf_cache_dir: str | Path | None = None, - puf_path: str | Path | None = None, - puf_demographics_path: str | Path | None = None, - puf_expand_persons: bool = True, - include_donor_surveys: bool = True, - include_sipp: bool | None = None, - include_scf: bool | None = None, - acs_year: int = 2024, - sipp_year: int = 2023, - scf_year: int = 2022, - donor_cache_dir: str | Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> tuple[SourceProvider, ...]: - """Return the canonical CPS+PUF provider bundle for the rebuild track.""" - - from microplex_us.data_sources.cps import CPSASECSourceProvider - from microplex_us.data_sources.donor_surveys import ( - ACSSourceProvider, - SCFSourceProvider, - SIPPSourceProvider, - ) - from microplex_us.data_sources.puf import ( - PUF_UPRATING_MODE_PE_SOI, - SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF, - PUFSourceProvider, - ) - - cps_cache = None if cps_cache_dir is None else Path(cps_cache_dir) - puf_cache = None if 
puf_cache_dir is None else Path(puf_cache_dir) - donor_cache = None if donor_cache_dir is None else Path(donor_cache_dir) - donor_target_year = int(puf_target_year) - providers: list[SourceProvider] = [ - CPSASECSourceProvider( - year=int(cps_source_year), - cache_dir=cps_cache, - download=bool(cps_download), - ), - PUFSourceProvider( - target_year=int(puf_target_year), - cache_dir=puf_cache, - puf_path=puf_path, - demographics_path=puf_demographics_path, - expand_persons=bool(puf_expand_persons), - uprating_mode=PUF_UPRATING_MODE_PE_SOI, - cps_reference_year=( - int(puf_cps_reference_year) - if puf_cps_reference_year is not None - else int(cps_source_year) - ), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - social_security_split_strategy=SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF, - ), - ] - resolved_include_sipp = ( - include_donor_surveys if include_sipp is None else include_sipp - ) - resolved_include_scf = include_donor_surveys if include_scf is None else include_scf - # The ACS donor is always enabled. It supplies the rent and real_estate_taxes - # source imputation that eCPS also draws from ACS, so omitting it leaves those - # variables at zero. ACS as a population spine ("multispine") is a separate, - # independently controlled feature that is not enabled here. 
- providers.append( - ACSSourceProvider( - year=int(acs_year), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - if resolved_include_sipp: - providers.extend( - [ - SIPPSourceProvider( - block="tips", - year=int(sipp_year), - cache_dir=donor_cache, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=donor_target_year, - ), - SIPPSourceProvider( - block="assets", - year=int(sipp_year), - cache_dir=donor_cache, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=donor_target_year, - ), - ] - ) - if resolved_include_scf: - providers.append( - SCFSourceProvider( - year=int(scf_year), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - target_year=donor_target_year, - ) - ) - return tuple(providers) - - -def build_policyengine_us_data_rebuild_pipeline( - **config_overrides: Any, -) -> USMicroplexPipeline: - """Build a USMicroplexPipeline configured for the incumbent parity path.""" - - from microplex_us.pipelines.us import USMicroplexPipeline - - return USMicroplexPipeline( - config=default_policyengine_us_data_rebuild_config(**config_overrides) - ) - - -def default_policyengine_us_data_rebuild_program() -> PEUSDataRebuildProgram: - """Return the current PE-US-data rebuild program for `microplex-us`.""" - - return PEUSDataRebuildProgram( - program_id="pe-us-data-rebuild-v1", - title="Rebuild PE-US-data in Microplex", - objective=( - "Reproduce the incumbent PolicyEngine US-data pipeline using the same " - "sources, imputation families, and weighting backends where useful, " - "but in the cleaner Microplex runtime structure." 
- ), - principle=( - "Architecture-first replacement: first make the PE-US-data build path " - "explicit, modular, and parity-auditable inside Microplex; then change " - "models only once the incumbent pipeline is faithfully reproducible. " - "Structural improvements are allowed when they mainly improve " - "maintainability, provenance, or modularity and should change " - "results only on the margin." - ), - stages=( - PEUSDataRebuildStage( - stage_id="source-contracts", - title="Canonical source contracts", - goal=( - "Load the same incumbent public/source datasets through explicit " - "Microplex source descriptors and manifests." - ), - pe_owner_modules=( - "policyengine_us_data.datasets.cps", - "policyengine_us_data.datasets.puf", - ), - microplex_owner_modules=( - "microplex_us.data_sources.cps", - "microplex_us.data_sources.puf", - "microplex_us.source_manifests", - "microplex_us.source_registry", - ), - parity_contract=( - "Use the same raw sources and year conventions as PE-US-data, " - "but express them through Microplex source contracts." - ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "PUF already has an external manifest-backed contract. CPS is " - "partly descriptor-backed but still needs a cleaner source-level " - "parity contract." - ), - next_steps=( - "Externalize CPS source contracts to the same level as PUF.", - "Document year-by-year source selection and fallback policy.", - ), - ), - PEUSDataRebuildStage( - stage_id="cps-construction", - title="CPS construction parity", - goal=( - "Reproduce PE-US-data CPS variable construction, mappings, and " - "source-backed rules inside Microplex." - ), - pe_owner_modules=("policyengine_us_data.datasets.cps.cps",), - microplex_owner_modules=("microplex_us.data_sources.cps",), - parity_contract=( - "Same CPS mappings and rule-based derivations unless an " - "intentional difference is written down." 
- ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "Social Security reason-code logic is already close. Broader CPS " - "family-level parity is not yet fully audited." - ), - next_steps=( - "Audit dividends, interest, pensions, and transfer-income rules.", - "Back parity claims with focused tests where feasible.", - ), - ), - PEUSDataRebuildStage( - stage_id="puf-ingestion-uprating", - title="PUF ingestion and uprating parity", - goal=( - "Mirror PE-US-data's PUF ingest, demographics handling, and " - "uprating flow in a modular Microplex adapter." - ), - pe_owner_modules=( - "policyengine_us_data.datasets.puf.puf", - "policyengine_us_data.datasets.puf.uprate_puf", - ), - microplex_owner_modules=("microplex_us.data_sources.puf",), - parity_contract=( - "Same PUF source and uprating semantics before we intentionally " - "depart from the incumbent modeling choices." - ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "Microplex has a clean PUF adapter, but it is not yet a " - "line-by-line PE-US-data clone on demographics and uprating " - "behavior." - ), - next_steps=( - "Write explicit parity notes for demographics completion and uprating.", - "Decide which PE-data heuristics are copied versus retired.", - ), - ), - PEUSDataRebuildStage( - stage_id="extended-cps-qrf", - title="Extended CPS splice and CPS-only imputation", - goal=( - "Rebuild the PE-US-data CPS/PUF splice logic and the CPS-only " - "QRF imputation stages inside Microplex." - ), - pe_owner_modules=( - "policyengine_us_data.datasets.cps.extended_cps", - "policyengine_us_data.calibration.source_impute", - ), - microplex_owner_modules=( - "microplex_us.data_sources.family_imputation_benchmark", - "microplex_us.data_sources.puf", - "microplex_us.pe_source_impute_engine", - "microplex_us.pipelines.us", - ), - parity_contract=( - "Use the same model family and training/prediction split where " - "the intent is parity, even if the code is reorganized." 
- ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "The donor-survey side now has an explicit PE-style " - "prespecified predictor mode, real ACS/SIPP/SCF donor " - "providers, and one shared donor-block manifest for " - "provider specs, predictor surfaces, condition prep, SIPP " - "postprocessing rules, raw SIPP extraction details, " - "ACS/SCF subprocess dataset-loader mappings, and a " - "centralized PE source-impute block engine. The remaining " - "gap is the full extended CPS splice and line-by-line " - "stage parity." - ), - next_steps=( - "Isolate PE-data stage-1 and stage-2 QRF splice contracts.", - "Implement them behind Microplex method specs rather than inline scripts.", - "Audit annualization, sampling, and donor-row preparation details against PE-data.", - ), - ), - PEUSDataRebuildStage( - stage_id="family-imputation-parity", - title="Family imputation parity", - goal=( - "Recreate PE-US-data family imputations using the incumbent model " - "families and fallback heuristics before optimizing beyond them." - ), - pe_owner_modules=( - "policyengine_us_data.calibration.puf_impute", - "policyengine_us_data.calibration.source_impute", - ), - microplex_owner_modules=( - "microplex_us.data_sources.puf", - "microplex_us.data_sources.share_imputation", - "microplex_us.data_sources.family_imputation_benchmark", - ), - parity_contract=( - "Match PE-data on model class, feature surface, and fallback " - "rules unless a difference is intentional and benchmarked." - ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "Microplex currently has its own grouped-share / forest-family " - "search machinery. That is useful for later improvement, but not " - "yet the same as rebuilding the incumbent pipeline." 
- ), - next_steps=( - "Add explicit PE-style QRF family methods as first-class runtime options.", - "Separate 'incumbent rebuild' from 'challenger search' in method configs.", - ), - ), - PEUSDataRebuildStage( - stage_id="entity-export-parity", - title="PE-ingestable entity and export parity", - goal=( - "Build the same PE-ingestable entity tables and input surface, " - "with compatibility shims made explicit." - ), - pe_owner_modules=( - "policyengine_us_data datasets and H5 build path", - "policyengine_us.variables.gov.ssa.ss", - ), - microplex_owner_modules=( - "microplex_us.pipelines.us", - "microplex_us.policyengine.us", - "microplex_us.pipelines.pre_sim_parity", - ), - parity_contract=( - "PE should ingest the resulting dataset without relying on hidden " - "construction assumptions." - ), - current_status=PEUSDataRebuildStatus.PARTIAL, - notes=( - "Microplex export compatibility is fairly strong, but some " - "compatibility shims still exist, especially the Social Security " - "residual-to-retirement bridge." - ), - next_steps=( - "Retire or explicitly own the Social Security export shim.", - "Expand pre-sim parity audits over more critical input variables.", - ), - ), - PEUSDataRebuildStage( - stage_id="weighting-backend", - title="Weighting and calibration backend parity", - goal=( - "Use the same incumbent PE-US-data weighting/calibration backend " - "inside a Microplex-owned interface." - ), - pe_owner_modules=( - "policyengine_us_data.calibration.unified_calibration", - ), - microplex_owner_modules=( - "microplex_us.pipelines.pe_l0", - "microplex_us.unified_calibration", - "microplex_us.pipelines.local_reweighting", - ), - parity_contract=( - "Weight optimization should be callable through Microplex while " - "still allowing the incumbent PE optimizer when parity is desired." - ), - current_status=PEUSDataRebuildStatus.CLOSE, - notes=( - "The L0 adapter already wraps the PE-US-data optimizer, which is " - "the right structural direction." 
- ), - next_steps=( - "Make the incumbent optimizer path an explicit parity mode in the main build flow.", - ), - ), - PEUSDataRebuildStage( - stage_id="targets-and-eval", - title="Target DB and benchmark parity", - goal=( - "Keep the same PE target estate and measurement operator while " - "comparing incumbent and Microplex builds." - ), - pe_owner_modules=( - "policyengine_us_data.db", - "policyengine_us", - ), - microplex_owner_modules=( - "microplex_us.policyengine.harness", - "microplex_us.policyengine.comparison", - "microplex_us.pipelines.performance", - ), - parity_contract=( - "The target DB and PE formulas remain the shared truth/measurement layer." - ), - current_status=PEUSDataRebuildStatus.CLOSE, - notes=( - "This is already one of the strongest parts of the current " - "Microplex architecture." - ), - next_steps=( - "Use this layer for scheduled integrated parity checkpoints, not only for final validation.", - ), - ), - ), - ) - - -def build_policyengine_us_data_rebuild_markdown( - program: PEUSDataRebuildProgram | None = None, -) -> str: - """Render the rebuild program as Markdown.""" - - resolved = program or default_policyengine_us_data_rebuild_program() - lines = [ - f"# {resolved.title}", - "", - resolved.objective, - "", - f"Principle: {resolved.principle}", - "", - "## Stages", - "", - ] - for stage in resolved.stages: - lines.extend( - [ - f"### {stage.title}", - f"- `stage_id`: `{stage.stage_id}`", - f"- `status`: `{stage.current_status.value}`", - f"- goal: {stage.goal}", - f"- parity contract: {stage.parity_contract}", - "- PE owners:", - *[f" - `{module}`" for module in stage.pe_owner_modules], - "- Microplex owners:", - *[f" - `{module}`" for module in stage.microplex_owner_modules], - f"- notes: {stage.notes}", - ] - ) - if stage.next_steps: - lines.append("- next steps:") - lines.extend([f" - {step}" for step in stage.next_steps]) - lines.append("") - return "\n".join(lines).rstrip() + "\n" diff --git 
a/src/microplex_us/pipelines/pe_us_data_rebuild_audit.py b/src/microplex_us/pipelines/pe_us_data_rebuild_audit.py deleted file mode 100644 index 9b094630..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_audit.py +++ /dev/null @@ -1,356 +0,0 @@ -"""Native-loss-driven audit helpers for PE-US-data rebuild artifacts.""" - -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.pe_native_scores import ( - compare_us_pe_native_target_deltas, - compute_us_pe_native_support_audit, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_run import ( - resolve_us_manifest_or_contract_artifact_path, -) - - -def build_policyengine_us_data_rebuild_native_audit( - artifact_dir: str | Path, - *, - top_k: int = 15, - manifest_payload: dict[str, Any] | None = None, - native_scores_payload: dict[str, Any] | None = None, - imputation_ablation_payload: dict[str, Any] | None = None, - target_delta_payload: dict[str, Any] | None = None, - support_audit_payload: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> dict[str, Any]: - """Build a saved-artifact audit focused on native-loss regressions.""" - - artifact_root = Path(artifact_dir) - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - artifacts = dict(manifest.get("artifacts", {})) - native_scores = ( - dict(native_scores_payload) - if native_scores_payload is not None - else json.loads( - _resolve_stage_artifact_path( - artifact_root, - manifest, - "policyengine_native_scores", - stage_id="09_validation_benchmarking", - ).read_text() - ) - ) - imputation_ablation = ( - dict(imputation_ablation_payload) - if imputation_ablation_payload is not None - else 
_load_optional_json( - _resolve_stage_artifact_path( - artifact_root, - manifest, - "imputation_ablation", - stage_id="09_validation_benchmarking", - ) - ) - ) - config = dict(manifest.get("config", {})) - candidate_dataset_path = _resolve_candidate_dataset_path(artifact_root, artifacts) - baseline_dataset_path = _resolve_baseline_dataset_path(config) - period = int( - native_scores.get("period") - or config.get("policyengine_dataset_year") - or config.get("policyengine_target_period") - or 2024 - ) - - target_delta = ( - dict(target_delta_payload) - if target_delta_payload is not None - else compare_us_pe_native_target_deltas( - from_dataset_path=baseline_dataset_path, - to_dataset_path=candidate_dataset_path, - period=period, - top_k=top_k, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - support_audit = ( - dict(support_audit_payload) - if support_audit_payload is not None - else compute_us_pe_native_support_audit( - candidate_dataset_path=candidate_dataset_path, - baseline_dataset_path=baseline_dataset_path, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - - family_breakdown = list( - native_scores.get("family_breakdown") - or dict(native_scores.get("broad_loss", {})).get("family_breakdown", ()) - ) - top_family_regressions = sorted( - [ - row - for row in family_breakdown - if float(row.get("loss_contribution_delta", 0.0)) > 0.0 - ], - key=lambda row: float(row.get("loss_contribution_delta", 0.0)), - reverse=True, - )[:top_k] - top_family_improvements = [ - row - for row in sorted( - family_breakdown, - key=lambda row: float(row.get("loss_contribution_delta", 0.0)), - ) - if float(row.get("loss_contribution_delta", 0.0)) < 0.0 - ][:top_k] - - support_summary = _build_support_summary(support_audit, top_k=top_k) - imputation_summary = ( - dict(imputation_ablation.get("summary", {})) - if 
imputation_ablation is not None - else None - ) - production_variant = ( - imputation_summary.get("production_variant") - if imputation_summary is not None - else None - ) - - return { - "schemaVersion": 1, - "artifactId": artifact_root.name, - "artifactDir": str(artifact_root.resolve()), - "period": period, - "candidateDatasetPath": str(candidate_dataset_path), - "baselineDatasetPath": str(baseline_dataset_path), - "nativeBroadLossSummary": dict(native_scores.get("summary", {})), - "topFamilyRegressions": top_family_regressions, - "topFamilyImprovements": top_family_improvements, - "topTargetRegressions": list(target_delta.get("top_regressions", ())), - "topTargetImprovements": list(target_delta.get("top_improvements", ())), - "supportAuditSummary": support_summary, - "imputationAblationSummary": imputation_summary, - "supportAudit": support_audit, - "targetDelta": target_delta, - "verdictHints": { - "largestRegressingFamily": ( - top_family_regressions[0]["family"] if top_family_regressions else None - ), - "largestRegressingTarget": ( - target_delta.get("top_regressions", [{}])[0].get("target_name") - if target_delta.get("top_regressions") - else None - ), - "missingStoredCriticalInputs": support_summary[ - "missingStoredCriticalInputs" - ], - "productionImputationVariant": production_variant, - "productionImputationVariantIsMaeWinner": ( - production_variant - == imputation_summary.get("best_mean_weighted_mae_variant") - if imputation_summary is not None - else None - ), - "productionImputationVariantIsSupportWinner": ( - production_variant - == imputation_summary.get("best_mean_support_f1_variant") - if imputation_summary is not None - else None - ), - }, - } - - -def write_policyengine_us_data_rebuild_native_audit( - artifact_dir: str | Path, - output_path: str | Path | None = None, - *, - top_k: int = 15, - manifest_payload: dict[str, Any] | None = None, - native_scores_payload: dict[str, Any] | None = None, - imputation_ablation_payload: dict[str, Any] | 
None = None, - target_delta_payload: dict[str, Any] | None = None, - support_audit_payload: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, -) -> Path: - """Write the native-loss-driven rebuild audit sidecar for one artifact bundle.""" - - artifact_root = Path(artifact_dir) - destination = ( - Path(output_path) - if output_path is not None - else resolve_us_stage_artifact_contract_path( - artifact_root, - "09_validation_benchmarking", - "policyengine_native_audit", - ) - ) - payload = build_policyengine_us_data_rebuild_native_audit( - artifact_root, - top_k=top_k, - manifest_payload=manifest_payload, - native_scores_payload=native_scores_payload, - imputation_ablation_payload=imputation_ablation_payload, - target_delta_payload=target_delta_payload, - support_audit_payload=support_audit_payload, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def _resolve_stage_artifact_path( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> Path: - return resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - artifact_key, - stage_id=stage_id, - ) - - -def _resolve_candidate_dataset_path( - artifact_root: Path, - artifacts: dict[str, Any], -) -> Path: - dataset_name = artifacts.get("policyengine_dataset") - if not isinstance(dataset_name, str) or not dataset_name: - raise FileNotFoundError( - "Artifact bundle is missing artifacts.policyengine_dataset in manifest.json" - ) - dataset_path = Path(dataset_name) - if not dataset_path.is_absolute(): - dataset_path = artifact_root / dataset_path - if not dataset_path.exists(): - raise FileNotFoundError( - f"Artifact bundle is missing saved policyengine dataset: {dataset_path}" - ) - return dataset_path 
- - -def _load_optional_json(path: Path) -> dict[str, Any] | None: - if not path.exists(): - return None - return json.loads(path.read_text()) - - -def _resolve_baseline_dataset_path(config: dict[str, Any]) -> Path: - baseline_dataset = config.get("policyengine_baseline_dataset") - if not isinstance(baseline_dataset, str) or not baseline_dataset: - raise ValueError( - "Artifact config is missing policyengine_baseline_dataset for rebuild audit" - ) - return Path(baseline_dataset).expanduser().resolve() - - -def _build_support_summary( - support_audit: dict[str, Any], - *, - top_k: int, -) -> dict[str, Any]: - comparisons = dict(support_audit.get("comparisons", {})) - critical_rows = list(comparisons.get("critical_input_support", ())) - missing_stored = [ - row["variable"] - for row in critical_rows - if bool(row.get("baseline_stored")) and not bool(row.get("candidate_stored")) - ] - critical_support_gaps = sorted( - critical_rows, - key=lambda row: float(row.get("weighted_nonzero_delta", 0.0)), - )[:top_k] - filing_status_gaps = sorted( - list(comparisons.get("filing_status_weighted_delta", ())), - key=lambda row: abs(float(row.get("weighted_count_delta", 0.0))), - reverse=True, - )[:top_k] - mfs_high_agi_gaps = sorted( - list(comparisons.get("mfs_high_agi_delta", ())), - key=lambda row: abs(float(row.get("weighted_count_delta", 0.0))), - reverse=True, - )[:top_k] - hoh_agi_gaps = sorted( - list(comparisons.get("hoh_agi_delta", ())), - key=lambda row: abs(float(row.get("weighted_count_delta", 0.0))), - reverse=True, - )[:top_k] - ssi_by_age_gaps = sorted( - list(comparisons.get("ssi_by_age_delta", ())), - key=lambda row: abs(float(row.get("weighted_recipient_delta", 0.0))), - reverse=True, - )[:top_k] - medicare_part_b_by_age_gaps = sorted( - list(comparisons.get("medicare_part_b_premiums_by_age_delta", ())), - key=lambda row: abs(float(row.get("weighted_positive_delta", 0.0))), - reverse=True, - )[:top_k] - - return { - "missingStoredCriticalInputs": 
missing_stored, - "topCriticalInputSupportGaps": critical_support_gaps, - "topFilingStatusGaps": filing_status_gaps, - "topMFSAgiGaps": mfs_high_agi_gaps, - "topHoHAgiGaps": hoh_agi_gaps, - "topSSIByAgeGaps": ssi_by_age_gaps, - "topMedicarePartBByAgeGaps": medicare_part_b_by_age_gaps, - "topAcaPtcSpendingGaps": list( - comparisons.get("state_aca_ptc_spending_top_gaps", ()) - )[:top_k], - "topMarketplaceEnrollmentGaps": list( - comparisons.get("state_marketplace_enrollment_top_gaps", ()) - )[:top_k], - "topAgeBucketGaps": list( - comparisons.get("state_age_bucket_top_gaps", ()) - )[:top_k], - } - - -def main(argv: list[str] | None = None) -> int: - """CLI for writing one native-loss rebuild audit sidecar.""" - - parser = argparse.ArgumentParser( - description="Write a native-loss-driven audit sidecar for one PE rebuild artifact.", - ) - parser.add_argument("artifact_dir") - parser.add_argument("--output-path") - parser.add_argument("--top-k", type=int, default=15) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - args = parser.parse_args(argv) - - destination = write_policyengine_us_data_rebuild_native_audit( - args.artifact_dir, - output_path=args.output_path, - top_k=args.top_k, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - ) - print(destination) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py deleted file mode 100644 index 20b5823a..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py +++ /dev/null @@ -1,87 +0,0 @@ -# ruff: noqa: F401 -"""Concrete checkpoint runner for the PE-US-data rebuild profile. - -This module is intentionally thin. 
Implementation lives in thematic -``pe_us_data_rebuild_checkpoint_*`` modules while this file preserves the public -import path and ``python -m`` entry point. -""" - -from __future__ import annotations - -from microplex_us.pipelines.artifacts import ( - build_and_save_versioned_us_microplex_from_source_providers, -) -from microplex_us.pipelines.pe_us_data_rebuild import ( - default_policyengine_us_data_rebuild_source_providers, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_ablation import ( - _build_checkpoint_imputation_ablation_payload, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_artifacts import ( - _load_checkpoint_manifest, - _load_checkpoint_manifest_if_available, - _load_checkpoint_versioned_artifacts, - _load_resume_dataframe_artifact, - _load_resume_json_artifact, - _load_resume_policyengine_tables, - _load_resume_targets, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_cli import main -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_common import ( - LOGGER, - _emit_checkpoint_progress, - _resolve_policyengine_us_runtime_version, - _root_logger_has_handlers, - _write_json_atomically, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config import ( - _infer_policyengine_baseline_household_weight_sum, - _normalize_arch_targets_db_value, - _normalize_path_value, - _resolve_checkpoint_calibration_target_variables, - _validate_checkpoint_config_context, - _validate_query_keys, - default_policyengine_us_data_rebuild_checkpoint_config, - default_policyengine_us_data_rebuild_queries, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence import ( - _refresh_checkpoint_data_flow_snapshot, - attach_policyengine_us_data_rebuild_checkpoint_evidence, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_resume import ( - _checkpoint_resume_extra_artifact_requirements, - _complete_resume_run_profile_stage, - _is_artifact_backed_checkpoint_resume_stage, - 
_load_checkpoint_source_frames, - _resolve_checkpoint_resume_artifact_root, - _resume_checkpoint_build_from_saved_stage, - _resume_checkpoint_build_from_source_stage, - _resume_provider_context_from_manifest, - _run_checkpoint_calibration_resume_stage, - _run_checkpoint_policyengine_entity_resume_stage, - _run_policyengine_us_data_rebuild_checkpoint_resume, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_runner import ( - run_policyengine_us_data_rebuild_checkpoint, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_types import ( - PEUSDataRebuildCheckpointEvidenceResult, - PEUSDataRebuildCheckpointResult, -) -from microplex_us.pipelines.versioned_artifacts import ( - _finalize_versioned_build_artifacts, -) - -__all__ = [ - "PEUSDataRebuildCheckpointEvidenceResult", - "PEUSDataRebuildCheckpointResult", - "attach_policyengine_us_data_rebuild_checkpoint_evidence", - "default_policyengine_us_data_rebuild_checkpoint_config", - "default_policyengine_us_data_rebuild_queries", - "main", - "run_policyengine_us_data_rebuild_checkpoint", -] - - -if __name__ == "__main__": - main() diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_ablation.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_ablation.py deleted file mode 100644 index 4a8a30c9..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_ablation.py +++ /dev/null @@ -1,738 +0,0 @@ -"""Checkpoint-specific imputation ablation evidence helpers.""" - -from __future__ import annotations - -from dataclasses import replace -from datetime import UTC, datetime -from typing import TYPE_CHECKING, Any - -import numpy as np -import pandas as pd -from microplex.core import ( - EntityObservation, - EntityType, - ObservationFrame, - SourceDescriptor, -) - -from microplex_us.pipelines.imputation_ablation import ( - ImputationAblationSliceSpec, - ImputationAblationVariant, - score_imputation_ablation_variants, -) -from microplex_us.variables import 
prune_redundant_variables - -if TYPE_CHECKING: - pass - -DEFAULT_CHECKPOINT_IMPUTATION_ABLATION_EVAL_FRACTION = 0.25 -MIN_CHECKPOINT_IMPUTATION_ABLATION_HOUSEHOLDS = 8 - - -def _checkpoint_imputation_ablation_variants() -> tuple[ImputationAblationVariant, ...]: - return ( - ImputationAblationVariant( - name="broad_common_qrf", - description="QRF with every compatible shared predictor.", - condition_selection="all_shared", - ), - ImputationAblationVariant( - name="top_correlated_qrf", - description="QRF with the production top-correlated predictor selection.", - condition_selection="top_correlated", - ), - ImputationAblationVariant( - name="structured_pe_conditioning", - description="PolicyEngine-style structural conditioning and preferred predictors.", - condition_selection="pe_prespecified", - ), - ) - - -def _checkpoint_imputation_ablation_slice_specs() -> tuple[ - ImputationAblationSliceSpec, ... -]: - return ( - ImputationAblationSliceSpec( - name="state_by_age", - columns=("state_fips", "age_group"), - ), - ImputationAblationSliceSpec( - name="sex_by_age", - columns=("sex", "age_group"), - ), - ImputationAblationSliceSpec( - name="employment_by_income", - columns=("employment_status", "income_bracket"), - ), - ) - - -def _production_imputation_ablation_variant_name(config: Any) -> str: - condition_selection = getattr(config, "donor_imputer_condition_selection", None) - if condition_selection == "all_shared": - return "broad_common_qrf" - if condition_selection == "top_correlated": - return "top_correlated_qrf" - return "structured_pe_conditioning" - - -def _checkpoint_post_calibration_metrics( - manifest: dict[str, Any], - *, - production_variant: str, -) -> dict[str, dict[str, float]]: - calibration_summary = dict(manifest.get("calibration", {})) - harness_summary = dict(manifest.get("policyengine_harness", {})) - native_scores_summary = dict(manifest.get("policyengine_native_scores", {})) - metrics: dict[str, float] = {} - for key in ( - 
"full_oracle_capped_mean_abs_relative_error", - "full_oracle_mean_abs_relative_error", - "active_solve_capped_mean_abs_relative_error", - "active_solve_mean_abs_relative_error", - ): - value = calibration_summary.get(key) - if value is not None: - metrics[key] = float(value) - for key in ( - "candidate_mean_abs_relative_error", - "mean_abs_relative_error_delta", - "candidate_composite_parity_loss", - "composite_parity_loss_delta", - "target_win_rate", - ): - value = harness_summary.get(key) - if value is not None: - metrics[key] = float(value) - for key in ( - "candidate_enhanced_cps_native_loss", - "enhanced_cps_native_loss_delta", - ): - value = native_scores_summary.get(key) - if value is not None: - metrics[key] = float(value) - return {production_variant: metrics} if metrics else {} - - -def _build_checkpoint_source_descriptor( - *, - base_source: SourceDescriptor, - household_table: pd.DataFrame, - person_table: pd.DataFrame, - household_variables: set[str] | None = None, - person_variables: set[str] | None = None, - name: str | None = None, -) -> SourceDescriptor | None: - def _build_observation( - entity: EntityType, - table: pd.DataFrame, - allowed_variables: set[str] | None, - ) -> EntityObservation | None: - observation = base_source.observation_for(entity) - available_columns = set(table.columns) - if observation.key_column not in available_columns: - return None - variable_names = tuple( - variable - for variable in observation.variable_names - if variable in available_columns - and (allowed_variables is None or variable in allowed_variables) - ) - if not variable_names: - return None - return EntityObservation( - entity=entity, - key_column=observation.key_column, - variable_names=variable_names, - weight_column=( - observation.weight_column - if observation.weight_column in available_columns - else None - ), - period_column=( - observation.period_column - if observation.period_column in available_columns - else None - ), - ) - - household_observation 
= _build_observation( - EntityType.HOUSEHOLD, - household_table, - household_variables, - ) - person_observation = _build_observation( - EntityType.PERSON, - person_table, - person_variables, - ) - if household_observation is None or person_observation is None: - return None - - included_variables = set(household_observation.variable_names) | set( - person_observation.variable_names - ) - return SourceDescriptor( - name=name or base_source.name, - shareability=base_source.shareability, - time_structure=base_source.time_structure, - observations=(household_observation, person_observation), - archetype=base_source.archetype, - population=base_source.population, - description=base_source.description, - variable_capabilities={ - variable: capability - for variable, capability in base_source.variable_capabilities.items() - if variable in included_variables - }, - ) - - -def _household_person_relationship(frame: ObservationFrame) -> Any: - relationship = next( - ( - candidate - for candidate in frame.relationships - if candidate.parent_entity == EntityType.HOUSEHOLD - and candidate.child_entity == EntityType.PERSON - ), - None, - ) - if relationship is None: - raise ValueError( - "Checkpoint imputation ablation requires a household-to-person relationship" - ) - return relationship - - -def _project_checkpoint_table_to_source_schema( - table: pd.DataFrame, - observation: EntityObservation, - *, - relationship_columns: tuple[str, ...] 
= (), -) -> pd.DataFrame: - columns = [ - observation.key_column, - *relationship_columns, - *observation.variable_names, - ] - if observation.weight_column is not None: - columns.append(observation.weight_column) - if observation.period_column is not None: - columns.append(observation.period_column) - resolved_columns = [ - column for column in dict.fromkeys(columns) if column in table.columns - ] - return table.loc[:, resolved_columns].copy() - - -def _subset_checkpoint_frame_to_households( - frame: ObservationFrame, - household_ids: tuple[Any, ...], - *, - source: SourceDescriptor, -) -> ObservationFrame | None: - relationship = _household_person_relationship(frame) - households = frame.tables[EntityType.HOUSEHOLD] - persons = frame.tables[EntityType.PERSON] - household_subset = households.loc[ - households[relationship.parent_key].isin(household_ids) - ].copy() - if household_subset.empty: - return None - household_id_index = tuple(household_subset[relationship.parent_key].tolist()) - person_subset = persons.loc[ - persons[relationship.child_key].isin(household_id_index) - ].copy() - if person_subset.empty: - return None - household_observation = source.observation_for(EntityType.HOUSEHOLD) - person_observation = source.observation_for(EntityType.PERSON) - subset_frame = ObservationFrame( - source=source, - tables={ - EntityType.HOUSEHOLD: _project_checkpoint_table_to_source_schema( - household_subset, - household_observation, - relationship_columns=(relationship.parent_key,), - ), - EntityType.PERSON: _project_checkpoint_table_to_source_schema( - person_subset, - person_observation, - relationship_columns=(relationship.child_key,), - ), - }, - relationships=(relationship,), - ) - subset_frame.validate() - return subset_frame - - -def _split_checkpoint_household_ids( - frame: ObservationFrame, - *, - eval_fraction: float, - random_seed: int, -) -> tuple[tuple[Any, ...], tuple[Any, ...]] | None: - relationship = _household_person_relationship(frame) - 
household_ids = ( - frame.tables[EntityType.HOUSEHOLD][relationship.parent_key] - .drop_duplicates() - .tolist() - ) - if len(household_ids) < MIN_CHECKPOINT_IMPUTATION_ABLATION_HOUSEHOLDS: - return None - shuffled = np.asarray(household_ids, dtype=object) - np.random.default_rng(random_seed).shuffle(shuffled) - eval_count = int(np.ceil(len(shuffled) * float(eval_fraction))) - eval_count = max(1, min(eval_count, len(shuffled) - 1)) - eval_ids = tuple(shuffled[:eval_count].tolist()) - train_ids = tuple(shuffled[eval_count:].tolist()) - if not train_ids or not eval_ids: - return None - return train_ids, eval_ids - - -def _build_checkpoint_holdout_scaffold_source( - scaffold_source: SourceDescriptor, - donor_frame: ObservationFrame, - *, - masked_target_variables: set[str] | None = None, -) -> SourceDescriptor | None: - excluded_variables = set(masked_target_variables or ()) - return _build_checkpoint_source_descriptor( - base_source=scaffold_source, - household_table=donor_frame.tables[EntityType.HOUSEHOLD], - person_table=donor_frame.tables[EntityType.PERSON], - household_variables=set(scaffold_source.variables_for(EntityType.HOUSEHOLD)) - - excluded_variables, - person_variables=set(scaffold_source.variables_for(EntityType.PERSON)) - - excluded_variables, - name=f"{donor_frame.source.name}_checkpoint_scaffold", - ) - - -def _resolve_checkpoint_imputation_targets( - pipeline: Any, - *, - scaffold_input: Any, - donor_input: Any, - current_seed: pd.DataFrame, -) -> tuple[list[str], list[str]]: - scaffold_observed = prune_redundant_variables( - scaffold_input.fusion_plan.variables_for(EntityType.HOUSEHOLD) - | scaffold_input.fusion_plan.variables_for(EntityType.PERSON) - ) - donor_seed = pipeline.prepare_seed_data_from_source(donor_input) - donor_observed = prune_redundant_variables( - donor_input.fusion_plan.variables_for(EntityType.HOUSEHOLD) - | donor_input.fusion_plan.variables_for(EntityType.PERSON) - ) - excluded = { - "person_id", - "household_id", - 
"hh_weight", - "weight", - "household_weight", - "tax_unit_id", - "family_id", - "spm_unit_id", - "marital_unit_id", - "state", - "age_group", - "income_bracket", - "is_head", - "is_spouse", - "is_dependent", - } - numeric_current = { - column - for column in current_seed.columns - if pd.api.types.is_numeric_dtype(current_seed[column]) - } - numeric_donor = { - column - for column in donor_seed.columns - if pd.api.types.is_numeric_dtype(donor_seed[column]) - } - shared_vars = sorted( - variable - for variable in scaffold_observed & donor_observed - if variable not in excluded - and variable in current_seed.columns - and variable in donor_seed.columns - and variable in numeric_current - and variable in numeric_donor - and scaffold_input.frame.source.allows_conditioning_on(variable) - and donor_input.frame.source.allows_conditioning_on(variable) - and pipeline._is_compatible_donor_condition( - current_seed[variable], - donor_seed[variable], - ) - ) - donor_only_vars = sorted( - variable - for variable in donor_observed - scaffold_observed - if variable not in excluded - and variable not in pipeline.config.donor_imputer_excluded_variables - and variable in donor_seed.columns - and variable in numeric_donor - and donor_input.frame.source.is_authoritative_for(variable) - and pipeline._should_integrate_donor_variable(current_seed, variable) - and pipeline._is_compatible_donor_target(donor_seed[variable]) - ) - donor_override_vars = sorted( - variable - for variable in scaffold_observed & donor_observed - if variable not in excluded - and variable not in pipeline.config.donor_imputer_excluded_variables - and variable in pipeline.config.donor_imputer_authoritative_override_variables - and variable in current_seed.columns - and variable in donor_seed.columns - and variable in numeric_current - and variable in numeric_donor - and donor_input.frame.source.is_authoritative_for(variable) - and pipeline._is_compatible_donor_target(donor_seed[variable]) - ) - return shared_vars, 
sorted(set(donor_only_vars) | set(donor_override_vars)) - - -def _checkpoint_variant_config( - config: Any, - variant: ImputationAblationVariant, -) -> Any: - return replace( - config, - donor_imputer_condition_selection=variant.condition_selection, - donor_imputer_max_condition_vars=( - None - if variant.condition_selection == "all_shared" - else config.donor_imputer_max_condition_vars - ), - ) - - -def _prepare_checkpoint_imputation_score_frame(frame: pd.DataFrame) -> pd.DataFrame: - result = frame.copy() - ages = ( - pd.to_numeric(result["age"], errors="coerce").replace([np.inf, -np.inf], np.nan) - if "age" in result.columns - else pd.Series(np.nan, index=result.index, dtype=float) - ) - age_groups = pd.cut( - ages, - bins=[-np.inf, 18.0, 35.0, 55.0, 65.0, np.inf], - labels=False, - right=False, - ) - result["age_group"] = ( - pd.Series(age_groups, index=result.index).fillna(-1).astype(int) - ) - incomes = ( - pd.to_numeric(result["income"], errors="coerce").replace( - [np.inf, -np.inf], np.nan - ) - if "income" in result.columns - else pd.Series(np.nan, index=result.index, dtype=float) - ) - income_brackets = pd.cut( - incomes, - bins=[-np.inf, 0.0, 25_000.0, 50_000.0, 100_000.0, np.inf], - labels=False, - right=False, - ) - result["income_bracket"] = ( - pd.Series( - income_brackets, - index=result.index, - ) - .fillna(-1) - .astype(int) - ) - return result - - -def _ensure_checkpoint_target_columns( - frame: pd.DataFrame, - *, - target_variables: list[str], -) -> pd.DataFrame: - result = frame.copy() - for variable in target_variables: - if variable not in result.columns: - result[variable] = 0.0 - return result - - -def _mean_checkpoint_metric(values: list[float]) -> float | None: - if not values: - return None - return float(np.mean(values)) - - -def _summarize_checkpoint_imputation_ablation( - *, - source_reports: dict[str, dict[str, Any]], - skipped_sources: list[dict[str, Any]], - production_variant: str, -) -> dict[str, Any]: - metric_names = ( - 
"mean_weighted_mae", - "mean_total_relative_error", - "mean_support_f1", - "mean_slice_total_js_divergence", - "mean_slice_support_js_divergence", - "mean_slice_positive_rate_delta", - ) - variant_metrics: dict[str, dict[str, list[float]]] = {} - variant_source_counts: dict[str, int] = {} - target_count = 0 - for source_report in source_reports.values(): - target_count += len(source_report.get("target_variables", ())) - report_payload = dict(source_report.get("report", {})) - for variant_name, variant_payload in dict( - report_payload.get("variants", {}) - ).items(): - aggregate_metrics = dict(variant_payload.get("aggregate_metrics", {})) - variant_source_counts[variant_name] = ( - variant_source_counts.get(variant_name, 0) + 1 - ) - metric_buckets = variant_metrics.setdefault( - variant_name, - {metric_name: [] for metric_name in metric_names}, - ) - for metric_name in metric_names: - value = aggregate_metrics.get(metric_name) - if value is not None: - metric_buckets[metric_name].append(float(value)) - variant_scorecard: dict[str, dict[str, Any]] = {} - for variant_name, metric_buckets in variant_metrics.items(): - variant_scorecard[variant_name] = { - "source_count": variant_source_counts.get(variant_name, 0), - **{ - metric_name: _mean_checkpoint_metric(metric_values) - for metric_name, metric_values in metric_buckets.items() - }, - } - - best_mean_weighted_mae_variant = None - mae_candidates = [ - (payload.get("mean_weighted_mae"), variant_name) - for variant_name, payload in variant_scorecard.items() - if payload.get("mean_weighted_mae") is not None - ] - if mae_candidates: - best_mean_weighted_mae_variant = min(mae_candidates)[1] - - best_mean_support_f1_variant = None - f1_candidates = [ - (payload.get("mean_support_f1"), variant_name) - for variant_name, payload in variant_scorecard.items() - if payload.get("mean_support_f1") is not None - ] - if f1_candidates: - best_mean_support_f1_variant = max(f1_candidates)[1] - - production_scorecard = 
variant_scorecard.get(production_variant, {}) - return { - "source_count": len(source_reports), - "skipped_source_count": len(skipped_sources), - "target_count": target_count, - "production_variant": production_variant, - "production_mean_weighted_mae": production_scorecard.get("mean_weighted_mae"), - "production_mean_support_f1": production_scorecard.get("mean_support_f1"), - "best_mean_weighted_mae_variant": best_mean_weighted_mae_variant, - "best_mean_support_f1_variant": best_mean_support_f1_variant, - "variant_scorecard": variant_scorecard, - } - - -def _build_checkpoint_imputation_ablation_payload( - build_result: Any, - *, - artifact_id: str, - manifest: dict[str, Any], -) -> dict[str, Any] | None: - if build_result.source_frame is None or not build_result.source_frames: - return None - - from microplex_us.pipelines.us import USMicroplexPipeline - - pipeline = USMicroplexPipeline(build_result.config) - scaffold_input = pipeline.prepare_source_input(build_result.source_frame) - scaffold_seed = pipeline.prepare_seed_data_from_source(scaffold_input) - production_variant = _production_imputation_ablation_variant_name( - build_result.config - ) - variants = _checkpoint_imputation_ablation_variants() - slice_specs = _checkpoint_imputation_ablation_slice_specs() - source_reports: dict[str, dict[str, Any]] = {} - skipped_sources: list[dict[str, Any]] = [] - - for source_index, donor_frame in enumerate(build_result.source_frames): - if donor_frame.source.name == build_result.source_frame.source.name: - continue - donor_name = donor_frame.source.name - try: - donor_input = pipeline.prepare_source_input(donor_frame) - shared_vars, target_vars = _resolve_checkpoint_imputation_targets( - pipeline, - scaffold_input=scaffold_input, - donor_input=donor_input, - current_seed=scaffold_seed, - ) - if not shared_vars: - skipped_sources.append( - {"source_name": donor_name, "reason": "no_shared_condition_vars"} - ) - continue - if not target_vars: - skipped_sources.append( - 
{"source_name": donor_name, "reason": "no_imputable_target_vars"} - ) - continue - - donor_subset_source = _build_checkpoint_source_descriptor( - base_source=donor_frame.source, - household_table=donor_frame.tables[EntityType.HOUSEHOLD], - person_table=donor_frame.tables[EntityType.PERSON], - name=donor_name, - ) - if donor_subset_source is None: - skipped_sources.append( - { - "source_name": donor_name, - "reason": "missing_household_or_person_observations", - } - ) - continue - - household_split = _split_checkpoint_household_ids( - donor_frame, - eval_fraction=DEFAULT_CHECKPOINT_IMPUTATION_ABLATION_EVAL_FRACTION, - random_seed=int(build_result.config.random_seed) + source_index, - ) - if household_split is None: - skipped_sources.append( - {"source_name": donor_name, "reason": "insufficient_households"} - ) - continue - train_households, eval_households = household_split - - train_frame = _subset_checkpoint_frame_to_households( - donor_frame, - train_households, - source=donor_subset_source, - ) - observed_eval_frame = _subset_checkpoint_frame_to_households( - donor_frame, - eval_households, - source=donor_subset_source, - ) - holdout_scaffold_source = _build_checkpoint_holdout_scaffold_source( - build_result.source_frame.source, - donor_frame, - masked_target_variables=set(target_vars), - ) - if holdout_scaffold_source is None: - skipped_sources.append( - { - "source_name": donor_name, - "reason": "no_overlap_with_scaffold_schema", - } - ) - continue - scaffold_eval_frame = _subset_checkpoint_frame_to_households( - donor_frame, - eval_households, - source=holdout_scaffold_source, - ) - if ( - train_frame is None - or observed_eval_frame is None - or scaffold_eval_frame is None - ): - skipped_sources.append( - {"source_name": donor_name, "reason": "empty_train_or_eval_split"} - ) - continue - - observed_eval_seed = _prepare_checkpoint_imputation_score_frame( - pipeline.prepare_seed_data_from_source( - pipeline.prepare_source_input(observed_eval_frame) - ) - ) - 
imputed_frames: dict[str, pd.DataFrame] = {} - for variant in variants: - variant_pipeline = USMicroplexPipeline( - _checkpoint_variant_config(build_result.config, variant) - ) - scaffold_eval_input = variant_pipeline.prepare_source_input( - scaffold_eval_frame - ) - donor_train_input = variant_pipeline.prepare_source_input(train_frame) - masked_seed = variant_pipeline.prepare_seed_data_from_source( - scaffold_eval_input - ) - integrated = variant_pipeline._integrate_donor_sources( - masked_seed, - scaffold_input=scaffold_eval_input, - donor_inputs=[donor_train_input], - )["seed_data"] - imputed_frames[variant.name] = ( - _prepare_checkpoint_imputation_score_frame( - _ensure_checkpoint_target_columns( - integrated, - target_variables=target_vars, - ) - ) - ) - - report = score_imputation_ablation_variants( - observed_frame=observed_eval_seed, - imputed_frames=imputed_frames, - target_variables=target_vars, - slice_specs=slice_specs, - variants=variants, - weight_column="hh_weight" - if "hh_weight" in observed_eval_seed.columns - else None, - post_calibration_metrics=_checkpoint_post_calibration_metrics( - manifest, - production_variant=production_variant, - ), - ) - source_reports[donor_name] = { - "source_name": donor_name, - "shared_variables": shared_vars, - "target_variables": target_vars, - "train_household_count": len(train_households), - "eval_household_count": len(eval_households), - "report": report.to_dict(), - } - except (KeyError, ValueError) as exc: - skipped_sources.append( - { - "source_name": donor_name, - "reason": "source_evaluation_failed", - "detail": str(exc), - } - ) - - if not source_reports: - return None - - return { - "schema_version": 1, - "generated_at": datetime.now(UTC).isoformat(), - "artifact_id": artifact_id, - "production_variant": production_variant, - "summary": _summarize_checkpoint_imputation_ablation( - source_reports=source_reports, - skipped_sources=skipped_sources, - production_variant=production_variant, - ), - 
"source_reports": source_reports, - "skipped_sources": skipped_sources, - } diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_artifacts.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_artifacts.py deleted file mode 100644 index ba3ea648..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_artifacts.py +++ /dev/null @@ -1,378 +0,0 @@ -"""Saved artifact loading helpers for PE-US-data checkpoint rebuilds.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import pandas as pd - -from microplex_us.pipelines.artifacts import ( - USMicroplexArtifactPaths, - USMicroplexVersionedBuildArtifacts, -) -from microplex_us.pipelines.registry import ( - load_us_microplex_run_registry, - select_us_microplex_frontier_entry, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_policyengine_artifacts import ( - load_us_policyengine_entity_stage_artifact, -) -from microplex_us.pipelines.stage_run import ( - resolve_us_manifest_or_contract_artifact_path, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexPipeline, - USMicroplexTargets, -) -from microplex_us.policyengine.us import PolicyEngineUSEntityTableBundle - -if TYPE_CHECKING: - from microplex_us.pipelines.registry import FrontierMetric - - -def _registry_metric_value(entry: Any | None, metric: FrontierMetric) -> float | None: - if entry is None: - return None - return getattr(entry, metric, None) - - -def _resolve_saved_artifact_path( - artifact_root: Path, - relative_or_absolute: str | Path | None, -) -> Path | None: - if relative_or_absolute is None: - return None - candidate = Path(relative_or_absolute) - if not candidate.is_absolute(): - artifact_relative = artifact_root / candidate - if artifact_relative.exists(): - return artifact_relative - cwd_relative = candidate.resolve() - if 
cwd_relative.exists(): - return cwd_relative - candidate = artifact_relative - return candidate - - -def _resolve_required_saved_artifact_path( - artifact_root: Path, - artifacts: dict[str, Any], - artifact_key: str, -) -> Path: - path = _resolve_saved_artifact_path(artifact_root, artifacts.get(artifact_key)) - if path is None: - raise KeyError(f"Saved artifact manifest does not declare {artifact_key!r}") - return path - - -def _resolve_saved_stage_artifact_path( - artifact_root: Path, - artifacts: dict[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> Path | None: - declared_path = _resolve_saved_artifact_path( - artifact_root, artifacts.get(artifact_key) - ) - if declared_path is not None: - return declared_path - contract_path = resolve_us_stage_artifact_contract_path( - artifact_root, - stage_id, - artifact_key, - ) - return contract_path if contract_path.exists() else None - - -def _load_checkpoint_manifest_if_available(artifact_root: Path) -> dict[str, Any]: - path = artifact_root / "manifest.json" - if not path.exists(): - return {} - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return {} - return payload if isinstance(payload, dict) else {} - - -def _load_checkpoint_manifest(artifact_root: Path) -> dict[str, Any]: - manifest_path = artifact_root / "manifest.json" - if not manifest_path.exists(): - raise FileNotFoundError(f"Saved artifact manifest not found: {manifest_path}") - payload = json.loads(manifest_path.read_text()) - if not isinstance(payload, dict): - raise ValueError(f"Saved artifact manifest is not an object: {manifest_path}") - return payload - - -def _load_resume_dataframe_artifact( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> pd.DataFrame: - path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - artifact_key, - stage_id=stage_id, - ) - if not path.exists(): - raise FileNotFoundError(f"Resume artifact not 
found: {path}") - return pd.read_parquet(path) - - -def _load_resume_json_artifact( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> dict[str, Any]: - path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - artifact_key, - stage_id=stage_id, - ) - if not path.exists(): - raise FileNotFoundError(f"Resume artifact not found: {path}") - payload = json.loads(path.read_text()) - if not isinstance(payload, dict): - raise ValueError(f"Resume JSON artifact is not an object: {path}") - return payload - - -def _load_resume_targets( - artifact_root: Path, - manifest: dict[str, Any], - *, - config: USMicroplexBuildConfig, - seed_data: pd.DataFrame, -) -> USMicroplexTargets: - path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - "targets", - stage_id="07_calibration", - ) - if path.exists(): - payload = json.loads(path.read_text()) - if not isinstance(payload, dict): - raise ValueError(f"Resume targets artifact is not an object: {path}") - return USMicroplexTargets( - marginal=dict(payload.get("marginal", {})), - continuous=dict(payload.get("continuous", {})), - ) - return USMicroplexPipeline(config).build_targets(seed_data) - - -def _load_resume_policyengine_tables( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, - *, - stage_id: str, - expected_stage: str | None, -) -> PolicyEngineUSEntityTableBundle: - path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - artifact_key, - stage_id=stage_id, - ) - if not path.exists(): - raise FileNotFoundError(f"Resume PE entity artifact not found: {path}") - bundle, _metadata = load_us_policyengine_entity_stage_artifact( - path, - expected_stage=expected_stage, # type: ignore[arg-type] - ) - return bundle - - -def _load_checkpoint_versioned_artifacts( - *, - build_result: Any, - artifact_root: Path, - frontier_metric: FrontierMetric, -) -> USMicroplexVersionedBuildArtifacts: - 
manifest_path = artifact_root / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - artifacts = dict(manifest.get("artifacts", {})) - artifact_paths = USMicroplexArtifactPaths( - output_dir=artifact_root, - version_id=artifact_root.name, - seed_data=_resolve_required_saved_artifact_path( - artifact_root, - artifacts, - "seed_data", - ), - synthetic_data=_resolve_required_saved_artifact_path( - artifact_root, - artifacts, - "synthetic_data", - ), - calibrated_data=_resolve_required_saved_artifact_path( - artifact_root, - artifacts, - "calibrated_data", - ), - targets=_resolve_required_saved_artifact_path( - artifact_root, - artifacts, - "targets", - ), - manifest=manifest_path, - scaffold_seed_data=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "scaffold_seed_data", - stage_id="04_seed_scaffold", - ), - synthesizer=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "synthesizer", - stage_id="05_donor_integration_synthesis", - ), - policyengine_dataset=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "policyengine_dataset", - stage_id="08_dataset_assembly", - ), - data_flow_snapshot=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "data_flow_snapshot", - stage_id="08_dataset_assembly", - ), - stage_manifest=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "stage_manifest", - stage_id="08_dataset_assembly", - ), - artifact_inventory=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "artifact_inventory", - stage_id="08_dataset_assembly", - ), - conditional_readiness=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "conditional_readiness", - stage_id="08_dataset_assembly", - ), - source_plan=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "source_plan", - stage_id="03_source_planning", - ), - policyengine_entity_tables=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - 
"policyengine_entity_tables", - stage_id="07_calibration", - ), - calibration_summary=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "calibration_summary", - stage_id="07_calibration", - ), - validation_evidence=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "validation_evidence", - stage_id="09_validation_benchmarking", - ), - policyengine_harness=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "policyengine_harness", - stage_id="09_validation_benchmarking", - ), - policyengine_native_scores=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "policyengine_native_scores", - stage_id="09_validation_benchmarking", - ), - policyengine_native_audit=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "policyengine_native_audit", - stage_id="09_validation_benchmarking", - ), - policyengine_native_target_diagnostics=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "policyengine_native_target_diagnostics", - stage_id="09_validation_benchmarking", - ), - child_tax_unit_agi_drift=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "child_tax_unit_agi_drift", - stage_id="09_validation_benchmarking", - ), - capital_gains_lots=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "capital_gains_lots", - stage_id="08_dataset_assembly", - ), - source_weight_diagnostics=_resolve_saved_stage_artifact_path( - artifact_root, - artifacts, - "source_weight_diagnostics", - stage_id="05_donor_integration_synthesis", - ), - run_registry=_resolve_saved_artifact_path( - artifact_root, - dict(manifest.get("run_registry", {})).get("path"), - ), - run_index_db=_resolve_saved_artifact_path( - artifact_root, - dict(manifest.get("run_index", {})).get("path"), - ), - ) - current_entry = None - frontier_entry = None - frontier_delta = None - if artifact_paths.run_registry is not None: - registry_entries = 
load_us_microplex_run_registry(artifact_paths.run_registry) - current_entry = next( - ( - entry - for entry in reversed(registry_entries) - if entry.artifact_id == artifact_root.name - ), - None, - ) - frontier_entry = select_us_microplex_frontier_entry( - artifact_paths.run_registry, - metric=frontier_metric, - ) - current_value = _registry_metric_value(current_entry, frontier_metric) - frontier_value = _registry_metric_value(frontier_entry, frontier_metric) - if current_value is not None and frontier_value is not None: - frontier_delta = current_value - frontier_value - return USMicroplexVersionedBuildArtifacts( - build_result=build_result, - artifact_paths=artifact_paths, - current_entry=current_entry, - frontier_entry=frontier_entry, - frontier_delta=frontier_delta, - ) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_cli.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_cli.py deleted file mode 100644 index 060df33e..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_cli.py +++ /dev/null @@ -1,317 +0,0 @@ -"""CLI for the PE-US-data checkpoint rebuild runner.""" - -from __future__ import annotations - -import argparse -import json - -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_runner import ( - run_policyengine_us_data_rebuild_checkpoint, -) -from microplex_us.pipelines.stage_contracts import US_CANONICAL_STAGE_IDS -from microplex_us.pipelines.stage_run import parse_us_stage_input_override - - -def main(argv: list[str] | None = None) -> None: - """CLI entry point for one PE-US-data rebuild checkpoint.""" - - parser = argparse.ArgumentParser( - description="Run a versioned PE-US-data rebuild checkpoint in microplex-us." 
- ) - parser.add_argument("--output-root", required=True) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--targets-db", required=True) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--policyengine-us-data-python") - parser.add_argument("--version-id") - parser.add_argument("--target-period", type=int, default=2024) - parser.add_argument("--target-profile", default="pe_native_broad") - parser.add_argument("--calibration-target-profile") - parser.add_argument( - "--calibration-target-source", - choices=["policyengine", "arch"], - default="policyengine", - help=( - "Target provider used for calibration. Use 'arch' with " - "--arch-targets-db for MP production calibration while keeping " - "--target-profile on the PE/eCPS comparison surface." - ), - ) - parser.add_argument( - "--arch-targets-db", - action="append", - default=[], - help=( - "Arch targets SQLite DB or consumer_facts.jsonl path for " - "calibration. May be supplied more than once." - ), - ) - parser.add_argument("--n-synthetic", type=int, default=100_000) - parser.add_argument("--random-seed", type=int, default=42) - parser.add_argument("--donor-imputer-condition-selection") - parser.add_argument( - "--donor-imputer-backend", - choices=["maf", "qrf", "zi_qrf", "regime_aware"], - default=None, - help=( - "Donor imputer backend. `zi_qrf` activates the zero-inflated " - "QRF path that skips predict() on gate-predicted-zero rows, " - "which is a large wall-clock win on heavy-zero PUF tax " - "variables. See docs/next-run-plan.md." 
- ), - ) - parser.add_argument("--cps-source-year", type=int, default=2023) - parser.add_argument("--puf-target-year", type=int) - parser.add_argument("--puf-cps-reference-year", type=int) - parser.add_argument("--acs-year", type=int, default=2024) - parser.add_argument("--sipp-year", type=int, default=2023) - parser.add_argument("--scf-year", type=int, default=2022) - parser.add_argument("--cps-cache-dir") - parser.add_argument("--puf-cache-dir") - parser.add_argument("--donor-cache-dir") - parser.add_argument("--puf-path") - parser.add_argument("--puf-demographics-path") - parser.add_argument("--cps-sample-n", type=int) - parser.add_argument("--puf-sample-n", type=int) - parser.add_argument("--donor-sample-n", type=int) - parser.add_argument("--query-random-seed", type=int, default=0) - parser.add_argument("--target-variable", action="append", default=[]) - parser.add_argument("--target-domain", action="append", default=[]) - parser.add_argument("--target-geo-level", action="append", default=[]) - parser.add_argument("--calibration-target-variable", action="append", default=[]) - parser.add_argument("--calibration-target-domain", action="append", default=[]) - parser.add_argument("--calibration-target-geo-level", action="append", default=[]) - parser.add_argument( - "--include-donor-surveys", - action=argparse.BooleanOptionalAction, - default=True, - ) - parser.add_argument( - "--include-sipp", - action=argparse.BooleanOptionalAction, - default=None, - help="Include SIPP donor providers. Defaults to --include-donor-surveys.", - ) - parser.add_argument( - "--include-scf", - action=argparse.BooleanOptionalAction, - default=None, - help="Include the SCF donor provider. 
Defaults to --include-donor-surveys.", - ) - parser.add_argument("--no-cps-download", action="store_true") - parser.add_argument("--no-puf-expand-persons", action="store_true") - parser.add_argument("--defer-policyengine-harness", action="store_true") - parser.add_argument("--defer-policyengine-native-score", action="store_true") - parser.add_argument("--defer-native-audit", action="store_true") - parser.add_argument("--defer-imputation-ablation", action="store_true") - parser.add_argument("--require-policyengine-native-score", action="store_true") - parser.add_argument( - "--calibration-backend", - choices=[ - "entropy", - "ipf", - "chi2", - "sparse", - "hardconcrete", - "pe_l0", - "microcalibrate", - "none", - ], - default=None, - help=( - "Weighting/calibration backend. Default is the config default " - "(entropy). Use `microcalibrate` for the identity-preserving " - "gradient-descent chi-squared backend that survived the v6 OOM." - ), - ) - parser.add_argument( - "--calibration-max-iter", - type=int, - default=None, - help=( - "Max iterations / epochs for the calibration solver. Passed " - "through to USMicroplexBuildConfig.calibration_max_iter." - ), - ) - parser.add_argument( - "--policyengine-materialize-batch-size", - type=int, - default=None, - help=( - "If set, splits PolicyEngine variable materialization into " - "household chunks of this size. At 1.5M-household scale a " - "single Microsimulation is 25-35 GB; batch_size=100_000 " - "drops peak to a few GB. Required for workstation runs; " - "unset (full-dataset) path targeted Modal GPU." - ), - ) - parser.add_argument( - "--pipeline-checkpoint-save-post-imputation-path", - type=str, - default=None, - help=( - "If set, save a post-imputation pipeline checkpoint to this " - "directory (right after donor imputation + PE tables build, " - "before microsim). A rerun can resume from this checkpoint " - "to skip the ~11 h synthesis stage." 
- ), - ) - parser.add_argument( - "--policyengine-export-column-contract-path", - type=str, - default=None, - help=( - "If set, check the eCPS export-column contract from the " - "post-imputation PE entity tables before microsimulation and " - "calibration." - ), - ) - parser.add_argument( - "--pipeline-checkpoint-save-post-microsim-path", - type=str, - default=None, - help=( - "If set, save a post-microsim pipeline checkpoint to this " - "directory (after target variables are materialized, before " - "the calibration fit loop). A rerun can resume from this " - "checkpoint to skip both synthesis and microsim, leaving " - "only the calibration fit." - ), - ) - parser.add_argument( - "--resume-from-stage", - choices=US_CANONICAL_STAGE_IDS, - default=None, - help=( - "Resume an existing saved run from this canonical stage. Requires " - "--version-id unless --output-root points directly at a saved artifact " - "directory. The runner validates required durable artifacts before " - "starting." - ), - ) - parser.add_argument( - "--capital-gains-lots", - action=argparse.BooleanOptionalAction, - default=None, - help=( - "Write an anchor-preserving synthetic capital-gains lot SQLite " - "sidecar artifact from PolicyEngine person tables." - ), - ) - parser.add_argument("--capital-gains-lots-max-lots-per-person", type=int) - parser.add_argument("--capital-gains-lots-random-seed", type=int) - parser.add_argument( - "--allow-stage-input-overrides", - action="store_true", - help=( - "Allow typed stage manifests to consume explicit CLI input overrides " - "instead of the immediately previous stage manifest." - ), - ) - parser.add_argument( - "--stage-input-override", - action="append", - default=[], - metavar="STAGE_ID.KEY=PATH", - help=("Explicit stage input override. 
Requires --allow-stage-input-overrides."), - ) - args = parser.parse_args(argv) - stage_input_overrides = tuple( - parse_us_stage_input_override(value) for value in args.stage_input_override - ) - if stage_input_overrides and not args.allow_stage_input_overrides: - parser.error("--stage-input-override requires --allow-stage-input-overrides") - - config_overrides = { - "n_synthetic": int(args.n_synthetic), - "random_seed": int(args.random_seed), - } - if args.donor_imputer_condition_selection is not None: - config_overrides["donor_imputer_condition_selection"] = ( - args.donor_imputer_condition_selection - ) - if args.donor_imputer_backend is not None: - config_overrides["donor_imputer_backend"] = args.donor_imputer_backend - if args.calibration_backend is not None: - config_overrides["calibration_backend"] = args.calibration_backend - if args.calibration_max_iter is not None: - config_overrides["calibration_max_iter"] = int(args.calibration_max_iter) - if args.policyengine_materialize_batch_size is not None: - config_overrides["policyengine_materialize_batch_size"] = int( - args.policyengine_materialize_batch_size - ) - if args.pipeline_checkpoint_save_post_imputation_path is not None: - config_overrides["pipeline_checkpoint_save_post_imputation_path"] = ( - args.pipeline_checkpoint_save_post_imputation_path - ) - if args.policyengine_export_column_contract_path is not None: - config_overrides["policyengine_export_column_contract_path"] = ( - args.policyengine_export_column_contract_path - ) - if args.pipeline_checkpoint_save_post_microsim_path is not None: - config_overrides["pipeline_checkpoint_save_post_microsim_path"] = ( - args.pipeline_checkpoint_save_post_microsim_path - ) - if args.capital_gains_lots is not None: - config_overrides["capital_gains_lots_enabled"] = bool(args.capital_gains_lots) - if args.capital_gains_lots_max_lots_per_person is not None: - config_overrides["capital_gains_lots_max_lots_per_person"] = int( - 
args.capital_gains_lots_max_lots_per_person - ) - if args.capital_gains_lots_random_seed is not None: - config_overrides["capital_gains_lots_random_seed"] = int( - args.capital_gains_lots_random_seed - ) - - result = run_policyengine_us_data_rebuild_checkpoint( - output_root=args.output_root, - policyengine_baseline_dataset=args.baseline_dataset, - policyengine_targets_db=args.targets_db, - arch_targets_db=(tuple(args.arch_targets_db) if args.arch_targets_db else None), - calibration_target_source=args.calibration_target_source, - target_period=args.target_period, - target_profile=args.target_profile, - calibration_target_profile=args.calibration_target_profile, - target_variables=tuple(args.target_variable), - target_domains=tuple(args.target_domain), - target_geo_levels=tuple(args.target_geo_level), - calibration_target_variables=tuple(args.calibration_target_variable), - calibration_target_domains=tuple(args.calibration_target_domain), - calibration_target_geo_levels=tuple(args.calibration_target_geo_level), - config_overrides=config_overrides, - cps_source_year=args.cps_source_year, - cps_cache_dir=args.cps_cache_dir, - cps_download=not args.no_cps_download, - puf_target_year=args.puf_target_year, - puf_cps_reference_year=args.puf_cps_reference_year, - puf_cache_dir=args.puf_cache_dir, - puf_path=args.puf_path, - puf_demographics_path=args.puf_demographics_path, - puf_expand_persons=not args.no_puf_expand_persons, - include_donor_surveys=args.include_donor_surveys, - include_sipp=args.include_sipp, - include_scf=args.include_scf, - acs_year=args.acs_year, - sipp_year=args.sipp_year, - scf_year=args.scf_year, - donor_cache_dir=args.donor_cache_dir, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - cps_sample_n=args.cps_sample_n, - puf_sample_n=args.puf_sample_n, - donor_sample_n=args.donor_sample_n, - query_random_seed=args.query_random_seed, - version_id=args.version_id, - 
defer_policyengine_harness=args.defer_policyengine_harness, - require_policyengine_native_score=args.require_policyengine_native_score, - defer_policyengine_native_score=args.defer_policyengine_native_score, - defer_native_audit=args.defer_native_audit, - defer_imputation_ablation=args.defer_imputation_ablation, - allow_stage_input_overrides=args.allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - resume_from_stage=args.resume_from_stage, - ) - - print(result.artifacts.artifact_paths.output_dir) - print(result.parity_path) - print(json.dumps(result.parity_payload["verdict"], indent=2, sort_keys=True)) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_common.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_common.py deleted file mode 100644 index 019737e0..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_common.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Shared utilities for the PE-US-data checkpoint runner.""" - -from __future__ import annotations - -import json -import logging -import sys -from pathlib import Path -from typing import Any - -LOGGER = logging.getLogger("microplex_us.pipelines.pe_us_data_rebuild_checkpoint") - - -def _root_logger_has_handlers() -> bool: - return bool(logging.getLogger().handlers) - - -def _emit_checkpoint_progress(message: str, /, **context: object) -> None: - details = ", ".join( - f"{key}={value}" - for key, value in context.items() - if value is not None and value != "" - ) - line = f"{message} [{details}]" if details else message - LOGGER.info(line) - if not LOGGER.handlers and not _root_logger_has_handlers(): - print(line, file=sys.stderr, flush=True) - - -def _write_json_atomically(path: Path, payload: dict[str, Any]) -> None: - temp_path = path.with_name(f".{path.name}.tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temp_path.replace(path) - - -def _resolve_policyengine_us_runtime_version() -> str | None: - from 
importlib.metadata import PackageNotFoundError, version - - try: - return version("policyengine-us") - except PackageNotFoundError: - return None diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_config.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_config.py deleted file mode 100644 index 679de1e7..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_config.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Configuration and query helpers for PE-US-data checkpoint rebuilds.""" - -from __future__ import annotations - -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal - -import h5py -from microplex.core import SourceQuery - -from microplex_us.pipelines.pe_us_data_rebuild import ( - default_policyengine_us_data_rebuild_config, -) -from microplex_us.pipelines.us import USMicroplexBuildConfig - -if TYPE_CHECKING: - from microplex.core import SourceProvider - -DEFAULT_ARCH_CALIBRATION_TARGET_PROFILE = "pe_native_broad_source_backed" - - -def _resolve_checkpoint_calibration_target_variables( - calibration_target_variables: tuple[str, ...], -) -> tuple[str, ...]: - return tuple(calibration_target_variables) - - -def _normalize_path_value(value: str | Path | None) -> str | None: - if value is None: - return None - return str(Path(value).expanduser()) - - -def _normalize_arch_targets_db_value( - value: str | Path | tuple[str | Path, ...] | None, -) -> str | tuple[str, ...] | None: - if value is None: - return None - if isinstance(value, (str, Path)): - return str(Path(value).expanduser()) - return tuple(str(Path(path).expanduser()) for path in value) - - -def _validate_checkpoint_config_context( - config: USMicroplexBuildConfig, - *, - policyengine_baseline_dataset: str | Path, - policyengine_targets_db: str | Path, - arch_targets_db: str | Path | tuple[str | Path, ...] 
| None, - calibration_target_source: Literal["policyengine", "arch"], - target_period: int, - target_profile: str, - calibration_target_profile: str | None, - target_variables: tuple[str, ...], - target_domains: tuple[str, ...], - target_geo_levels: tuple[str, ...], - calibration_target_variables: tuple[str, ...], - calibration_target_domains: tuple[str, ...], - calibration_target_geo_levels: tuple[str, ...], -) -> None: - expected_pairs = { - "policyengine_baseline_dataset": _normalize_path_value( - policyengine_baseline_dataset - ), - "policyengine_targets_db": _normalize_path_value(policyengine_targets_db), - "arch_targets_db": _normalize_arch_targets_db_value(arch_targets_db), - "calibration_target_source": calibration_target_source, - "policyengine_dataset_year": int(target_period), - "policyengine_target_period": int(target_period), - "policyengine_target_profile": target_profile, - "policyengine_calibration_target_profile": ( - calibration_target_profile - or ( - DEFAULT_ARCH_CALIBRATION_TARGET_PROFILE - if calibration_target_source == "arch" - else target_profile - ) - ), - "policyengine_target_variables": tuple(target_variables), - "policyengine_target_domains": tuple(target_domains), - "policyengine_target_geo_levels": tuple(target_geo_levels), - "policyengine_calibration_target_variables": ( - _resolve_checkpoint_calibration_target_variables( - calibration_target_variables - ) - ), - "policyengine_calibration_target_domains": tuple(calibration_target_domains), - "policyengine_calibration_target_geo_levels": tuple( - calibration_target_geo_levels - ), - } - for key, expected in expected_pairs.items(): - observed = getattr(config, key) - if observed != expected: - raise ValueError( - "Explicit config does not match the requested PE rebuild context for " - f"{key}: expected {expected!r}, observed {observed!r}" - ) - - -def _validate_query_keys( - provider_names: tuple[str, ...], - queries: dict[str, SourceQuery], -) -> None: - unexpected = 
sorted(set(queries) - set(provider_names)) - if unexpected: - allowed = ", ".join(provider_names) - unexpected_text = ", ".join(unexpected) - raise ValueError( - "Checkpoint queries include unknown provider keys: " - f"{unexpected_text}. Expected one of: {allowed}" - ) - - -def _infer_policyengine_baseline_household_weight_sum( - baseline_dataset: str | Path, - *, - target_period: int, -) -> float | None: - """Best-effort household-weight target inferred from the PE baseline dataset.""" - - dataset_path = Path(baseline_dataset).expanduser() - if not dataset_path.exists(): - return None - try: - with h5py.File(dataset_path, "r") as handle: - weights = handle.get("household_weight") - if weights is None: - return None - period_key = str(int(target_period)) - if period_key not in weights: - return None - weight_sum = float(weights[period_key][...].sum()) - except (FileNotFoundError, OSError, ValueError): - return None - return weight_sum if weight_sum > 0.0 else None - - -def default_policyengine_us_data_rebuild_checkpoint_config( - *, - policyengine_baseline_dataset: str | Path, - policyengine_targets_db: str | Path, - arch_targets_db: str | Path | tuple[str | Path, ...] | None = None, - calibration_target_source: Literal["policyengine", "arch"] = "policyengine", - target_period: int = 2024, - target_profile: str = "pe_native_broad", - calibration_target_profile: str | None = None, - target_variables: tuple[str, ...] = (), - target_domains: tuple[str, ...] = (), - target_geo_levels: tuple[str, ...] = (), - calibration_target_variables: tuple[str, ...] = (), - calibration_target_domains: tuple[str, ...] = (), - calibration_target_geo_levels: tuple[str, ...] 
= (), - **overrides: Any, -) -> USMicroplexBuildConfig: - """Return the canonical rebuild config with required PE comparison context.""" - - resolved_target_period = int(target_period) - if calibration_target_source not in {"policyengine", "arch"}: - raise ValueError( - "calibration_target_source must be 'policyengine' or 'arch', " - f"got {calibration_target_source!r}" - ) - resolved_arch_targets_db = _normalize_arch_targets_db_value(arch_targets_db) - if calibration_target_source == "arch" and resolved_arch_targets_db is None: - raise ValueError( - "arch_targets_db is required when calibration_target_source='arch'" - ) - resolved_calibration_target_profile = calibration_target_profile or ( - DEFAULT_ARCH_CALIBRATION_TARGET_PROFILE - if calibration_target_source == "arch" - else target_profile - ) - resolved_baseline_weight_sum = _infer_policyengine_baseline_household_weight_sum( - policyengine_baseline_dataset, - target_period=resolved_target_period, - ) - resolved_overrides = dict(overrides) - infer_total_weight_targets = ( - resolved_baseline_weight_sum is not None - and resolved_overrides.get("calibration_backend") != "none" - ) - if infer_total_weight_targets: - resolved_overrides.setdefault( - "policyengine_selection_target_total_weight", - resolved_baseline_weight_sum, - ) - if not resolved_overrides.get( - "policyengine_calibration_rescale_to_input_weight_sum", - False, - ): - resolved_overrides.setdefault( - "policyengine_calibration_target_total_weight", - resolved_baseline_weight_sum, - ) - resolved_overrides.setdefault( - "policyengine_calibration_rescale_to_target_total_weight", - True, - ) - return default_policyengine_us_data_rebuild_config( - policyengine_baseline_dataset=str(policyengine_baseline_dataset), - policyengine_targets_db=str(policyengine_targets_db), - arch_targets_db=resolved_arch_targets_db, - calibration_target_source=calibration_target_source, - policyengine_dataset_year=resolved_target_period, - 
policyengine_target_period=resolved_target_period, - policyengine_target_profile=target_profile, - policyengine_calibration_target_profile=resolved_calibration_target_profile, - policyengine_target_variables=tuple(target_variables), - policyengine_target_domains=tuple(target_domains), - policyengine_target_geo_levels=tuple(target_geo_levels), - policyengine_calibration_target_variables=( - _resolve_checkpoint_calibration_target_variables( - calibration_target_variables - ) - ), - policyengine_calibration_target_domains=tuple(calibration_target_domains), - policyengine_calibration_target_geo_levels=tuple(calibration_target_geo_levels), - **resolved_overrides, - ) - - -def default_policyengine_us_data_rebuild_queries( - providers: tuple[SourceProvider, ...] | list[SourceProvider], - *, - cps_sample_n: int | None = None, - puf_sample_n: int | None = None, - donor_sample_n: int | None = None, - cps_state_age_floor: int | None = 1, - donor_state_age_floor: int | None = 1, - random_seed: int = 0, -) -> dict[str, SourceQuery]: - """Return default provider queries for a rebuild checkpoint smoke run.""" - - from microplex_us.data_sources.cps import CPSASECSourceProvider - from microplex_us.data_sources.donor_surveys import DonorSurveySourceProvider - from microplex_us.data_sources.puf import PUFSourceProvider - - resolved_donor_sample_n = donor_sample_n - if resolved_donor_sample_n is None: - source_sample_sizes = tuple( - int(sample_n) - for sample_n in (cps_sample_n, puf_sample_n) - if sample_n is not None - ) - if source_sample_sizes: - resolved_donor_sample_n = max(source_sample_sizes) - - queries: dict[str, SourceQuery] = {} - for provider in providers: - sample_n: int | None = None - if isinstance(provider, CPSASECSourceProvider): - sample_n = cps_sample_n - elif isinstance(provider, PUFSourceProvider): - sample_n = puf_sample_n - elif isinstance(provider, DonorSurveySourceProvider): - sample_n = resolved_donor_sample_n - if sample_n is None: - continue - 
provider_filters = { - "sample_n": int(sample_n), - "random_seed": int(random_seed), - } - if ( - isinstance(provider, CPSASECSourceProvider) - and cps_state_age_floor is not None - ): - provider_filters["state_age_floor"] = int(cps_state_age_floor) - elif ( - isinstance(provider, DonorSurveySourceProvider) - and donor_state_age_floor is not None - ): - provider_filters["state_age_floor"] = int(donor_state_age_floor) - queries[provider.descriptor.name] = SourceQuery( - provider_filters=provider_filters - ) - return queries diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_evidence.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_evidence.py deleted file mode 100644 index bbe235ce..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_evidence.py +++ /dev/null @@ -1,690 +0,0 @@ -"""Evidence attachment for PE-US-data checkpoint rebuilds.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import TYPE_CHECKING, Any - -from microplex.targets import assert_valid_benchmark_artifact_manifest - -from microplex_us.pipelines.index_db import append_us_microplex_run_index_entry -from microplex_us.pipelines.pe_us_data_rebuild import ( - PEUSDataRebuildProgram, - default_policyengine_us_data_rebuild_program, -) -from microplex_us.pipelines.pe_us_data_rebuild_audit import ( - build_policyengine_us_data_rebuild_native_audit, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_ablation import ( - _build_checkpoint_imputation_ablation_payload, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_common import ( - _resolve_policyengine_us_runtime_version, - _write_json_atomically, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_types import ( - PEUSDataRebuildCheckpointEvidenceResult, -) -from microplex_us.pipelines.pe_us_data_rebuild_parity import ( - build_policyengine_us_data_rebuild_parity_artifact, - write_policyengine_us_data_rebuild_parity_artifact, -) 
-from microplex_us.pipelines.registry import ( - append_us_microplex_run_registry_entry, - build_us_microplex_run_registry_entry, - load_us_microplex_run_registry, -) -from microplex_us.pipelines.stage_contracts import ( - canonicalize_us_pipeline_stage_id, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_metrics import stage_metrics -from microplex_us.pipelines.stage_run import ( - write_us_stage_run_manifests_from_artifact_manifest, -) - -if TYPE_CHECKING: - from microplex.targets import TargetProvider - - from microplex_us.pipelines.registry import FrontierMetric - from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, - ) - - -def _refresh_checkpoint_data_flow_snapshot( - artifact_root: Path, - manifest: dict[str, Any], - *, - extra_outputs: tuple[str, ...] = (), -) -> Path | None: - if extra_outputs: - manifest.setdefault("diagnostics", {}).setdefault( - "checkpoint_extra_outputs", - list(extra_outputs), - ) - try: - updated_manifest = write_us_stage_run_manifests_from_artifact_manifest( - artifact_root, - manifest, - ) - except ValueError as exc: - manifest.setdefault("diagnostics", {})["checkpoint_stage_refresh_error"] = ( - f"{type(exc).__name__}: {exc}" - ) - return _patch_checkpoint_data_flow_snapshot_outputs( - artifact_root, - manifest=manifest, - extra_outputs=extra_outputs, - ) - manifest.clear() - manifest.update(updated_manifest) - snapshot_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ) - if extra_outputs: - return _patch_checkpoint_data_flow_snapshot_outputs( - artifact_root, - manifest=manifest, - extra_outputs=extra_outputs, - ) - return snapshot_path if snapshot_path.exists() else None - - -def _patch_checkpoint_data_flow_snapshot_outputs( - artifact_root: Path, - *, - manifest: dict[str, Any], - extra_outputs: tuple[str, ...], -) -> Path | None: - snapshot_path = 
resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ) - if not snapshot_path.exists(): - return None - snapshot = json.loads(snapshot_path.read_text()) - stages = snapshot.get("stages") - if not isinstance(stages, list): - return snapshot_path - validation_stage = None - for stage in stages: - if not isinstance(stage, dict): - continue - stage_id = str(stage.get("id", "")) - if canonicalize_us_pipeline_stage_id(stage_id) == "09_validation_benchmarking": - validation_stage = stage - stage["id"] = "09_validation_benchmarking" - break - if validation_stage is None: - validation_stage = { - "id": "09_validation_benchmarking", - "outputs": [], - "metrics": [], - "status": "ready", - } - stages.append(validation_stage) - existing_outputs = list(validation_stage.get("outputs") or ()) - validation_stage["outputs"] = list( - dict.fromkeys( - [ - *existing_outputs, - *_checkpoint_validation_output_names(manifest), - *extra_outputs, - ] - ) - ) - if not validation_stage.get("metrics"): - validation_stage["metrics"] = stage_metrics( - "09_validation_benchmarking", - manifest=manifest, - ) - if extra_outputs: - validation_stage["status"] = "ready" - else: - validation_stage.setdefault("status", "ready") - _write_json_atomically(snapshot_path, snapshot) - return snapshot_path - - -def _checkpoint_validation_output_names(manifest: dict[str, Any]) -> tuple[str, ...]: - artifacts = dict(manifest.get("artifacts", {})) - ordered_keys = ( - "policyengine_harness", - "policyengine_native_scores", - "imputation_ablation", - "policyengine_native_audit", - "policyengine_native_target_diagnostics", - "child_tax_unit_agi_drift", - ) - return tuple( - str(artifacts[key]) - for key in ordered_keys - if isinstance(artifacts.get(key), str) - ) - - -def _attach_checkpoint_registry_and_index( - artifact_root: Path, - manifest: dict[str, Any], - *, - harness_path: Path | None, - harness_payload: dict[str, Any] | None, - run_registry_path: 
str | Path | None, - run_index_path: str | Path | None, - run_registry_metadata: dict[str, Any] | None, -) -> tuple[Path | None, Path | None]: - if ( - manifest.get("calibration", {}).get( - "full_oracle_capped_mean_abs_relative_error" - ) - is None - and manifest.get("calibration", {}).get("full_oracle_mean_abs_relative_error") - is None - and "policyengine_harness" not in manifest - and "policyengine_native_scores" not in manifest - ): - return None, None - if ( - "policyengine_harness" not in manifest - and "policyengine_native_scores" not in manifest - ): - resolved_harness_payload = None - else: - resolved_harness_payload = ( - dict(harness_payload) - if harness_payload is not None - else ( - json.loads(harness_path.read_text()) - if harness_path is not None and harness_path.exists() - else None - ) - ) - resolved_run_registry_path = Path( - run_registry_path or artifact_root.parent / "run_registry.jsonl" - ) - existing_entry = next( - ( - entry - for entry in reversed( - load_us_microplex_run_registry(resolved_run_registry_path) - ) - if entry.artifact_id == artifact_root.name - ), - None, - ) - if existing_entry is None: - run_entry = build_us_microplex_run_registry_entry( - artifact_dir=artifact_root, - manifest_path=artifact_root / "manifest.json", - manifest=manifest, - policyengine_harness_path=harness_path, - policyengine_harness_payload=resolved_harness_payload, - metadata=dict(run_registry_metadata or {}), - ) - recorded_entry = append_us_microplex_run_registry_entry( - resolved_run_registry_path, - run_entry, - ) - else: - recorded_entry = existing_entry - resolved_run_index_path = append_us_microplex_run_index_entry( - run_index_path or artifact_root.parent, - recorded_entry, - policyengine_harness_payload=resolved_harness_payload, - ) - manifest["run_registry"] = { - "path": str(resolved_run_registry_path), - "artifact_id": recorded_entry.artifact_id, - "improved_candidate_frontier": recorded_entry.improved_candidate_frontier, - 
"improved_delta_frontier": recorded_entry.improved_delta_frontier, - "improved_composite_frontier": recorded_entry.improved_composite_frontier, - "improved_native_frontier": recorded_entry.improved_native_frontier, - "default_frontier_metric": _checkpoint_default_frontier_metric(manifest), - } - manifest["run_index"] = { - "path": str(resolved_run_index_path), - "artifact_id": recorded_entry.artifact_id, - } - return resolved_run_registry_path, resolved_run_index_path - - -def _checkpoint_default_frontier_metric(manifest: dict[str, Any]) -> FrontierMetric: - if ( - dict(manifest.get("calibration", {})).get( - "full_oracle_capped_mean_abs_relative_error" - ) - is not None - ): - return "full_oracle_capped_mean_abs_relative_error" - if ( - dict(manifest.get("calibration", {})).get("full_oracle_mean_abs_relative_error") - is not None - ): - return "full_oracle_mean_abs_relative_error" - if "policyengine_native_scores" in manifest: - return "enhanced_cps_native_loss_delta" - return "candidate_composite_parity_loss" - - -def _build_checkpoint_harness_context( - *, - manifest: dict[str, Any], - policyengine_target_provider: TargetProvider | None, - policyengine_baseline_dataset: str | Path | None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ), - policyengine_harness_metadata: dict[str, Any] | None, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None, -) -> tuple[ - TargetProvider | None, - str | Path | None, - tuple[PolicyEngineUSHarnessSlice, ...], - dict[str, Any], -]: - from microplex_us.policyengine.harness import ( - default_policyengine_us_db_all_target_slices, - default_policyengine_us_harness_slices, - filter_nonempty_policyengine_us_harness_slices, - ) - from microplex_us.policyengine.us import PolicyEngineUSDBTargetProvider - - config = dict(manifest.get("config", {})) - resolved_target_provider = policyengine_target_provider - if ( - resolved_target_provider is None - and config.get("policyengine_targets_db") is not None - ): - resolved_target_provider = PolicyEngineUSDBTargetProvider( - config["policyengine_targets_db"] - ) - resolved_baseline_dataset = policyengine_baseline_dataset or config.get( - "policyengine_baseline_dataset" - ) - harness_period = ( - config.get("policyengine_dataset_year") - or config.get("policyengine_target_period") - or 2024 - ) - if policyengine_harness_slices is not None: - resolved_harness_slices = tuple(policyengine_harness_slices) - elif config.get("policyengine_targets_db") is not None: - resolved_harness_slices = default_policyengine_us_db_all_target_slices( - period=int(harness_period), - reform_id=int(config.get("policyengine_target_reform_id", 0) or 0), - ) - else: - resolved_harness_slices = default_policyengine_us_harness_slices( - period=int(harness_period) - ) - if resolved_target_provider is not None and resolved_harness_slices: - resolved_harness_slices = filter_nonempty_policyengine_us_harness_slices( - resolved_target_provider, - resolved_harness_slices, - cache=policyengine_comparison_cache, - ) - resolved_harness_metadata = { - "baseline_dataset": ( - Path(resolved_baseline_dataset).name - if resolved_baseline_dataset is not None - else None - ), - "targets_db": ( - 
Path(config["policyengine_targets_db"]).name - if config.get("policyengine_targets_db") is not None - else None - ), - "target_period": config.get("policyengine_target_period"), - "target_variables": list(config.get("policyengine_target_variables", ())), - "target_domains": list(config.get("policyengine_target_domains", ())), - "target_geo_levels": list(config.get("policyengine_target_geo_levels", ())), - "target_profile": config.get("policyengine_target_profile"), - "calibration_target_profile": config.get( - "policyengine_calibration_target_profile" - ), - "target_reform_id": config.get("policyengine_target_reform_id"), - "harness_slice_names": [ - slice_spec.name for slice_spec in resolved_harness_slices - ], - "policyengine_us_runtime_version": _resolve_policyengine_us_runtime_version(), - "harness_suite": ( - "policyengine_us_all_targets" - if config.get("policyengine_targets_db") is not None - and policyengine_harness_slices is None - else None - ), - **dict(policyengine_harness_metadata or {}), - } - return ( - resolved_target_provider, - resolved_baseline_dataset, - resolved_harness_slices, - resolved_harness_metadata, - ) - - -def attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_dir: str | Path, - *, - build_result: Any | None = None, - program: PEUSDataRebuildProgram | None = None, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - compute_harness: bool = True, - compute_native_scores: bool = True, - compute_native_audit: bool = True, - compute_imputation_ablation: bool = False, - require_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - precomputed_imputation_ablation_payload: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, -) -> PEUSDataRebuildCheckpointEvidenceResult: - """Attach PE comparison evidence to an already-saved rebuild artifact.""" - - from microplex_us.pipelines.pe_native_scores import ( - build_us_pe_native_target_diagnostics_payload, - compute_us_pe_native_scores, - ) - from microplex_us.policyengine.harness import evaluate_policyengine_us_harness - from microplex_us.policyengine.us import load_policyengine_us_entity_tables - - artifact_root = Path(artifact_dir) - manifest_path = artifact_root / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - config = dict(manifest.get("config", {})) - artifacts = dict(manifest.get("artifacts", {})) - dataset_name = artifacts.get("policyengine_dataset") - dataset_path = ( - artifact_root / dataset_name if isinstance(dataset_name, str) else None - ) - if dataset_path is None or not dataset_path.exists(): - raise FileNotFoundError( - "Saved rebuild artifact is missing policyengine_dataset output" - ) - - harness_path: Path | None = None - harness_payload = ( - dict(precomputed_policyengine_harness_payload) - if precomputed_policyengine_harness_payload is not None - else None - ) - if harness_payload is None and compute_harness: - ( - 
resolved_target_provider, - resolved_baseline_dataset, - resolved_harness_slices, - resolved_harness_metadata, - ) = _build_checkpoint_harness_context( - manifest=manifest, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_comparison_cache=policyengine_comparison_cache, - ) - if resolved_target_provider is None: - raise ValueError( - "Cannot compute rebuild checkpoint harness without a target provider" - ) - if resolved_baseline_dataset is None: - raise ValueError( - "Cannot compute rebuild checkpoint harness without a baseline dataset" - ) - if not resolved_harness_slices: - raise ValueError( - "Cannot compute rebuild checkpoint harness because no nonempty slices resolved" - ) - candidate_tables = load_policyengine_us_entity_tables( - dataset_path, - period=( - config.get("policyengine_dataset_year") - or config.get("policyengine_target_period") - or 2024 - ), - ) - harness_run = evaluate_policyengine_us_harness( - candidate_tables, - resolved_target_provider, - resolved_harness_slices, - baseline_dataset=str(resolved_baseline_dataset), - dataset_year=config.get("policyengine_dataset_year"), - simulation_cls=None, - candidate_label="microplex", - baseline_label="policyengine_us_data", - metadata=resolved_harness_metadata, - cache=policyengine_comparison_cache, - ) - harness_payload = harness_run.to_dict() - if harness_payload is not None: - harness_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "09_validation_benchmarking", - "policyengine_harness", - ) - _write_json_atomically(harness_path, harness_payload) - artifacts["policyengine_harness"] = harness_path.name - manifest["policyengine_harness"] = dict(harness_payload.get("summary", {})) - - native_scores_path: Path | None = None - native_scores_payload = ( - 
dict(precomputed_policyengine_native_scores) - if precomputed_policyengine_native_scores is not None - else None - ) - if native_scores_payload is None and compute_native_scores: - resolved_baseline_dataset = policyengine_baseline_dataset or config.get( - "policyengine_baseline_dataset" - ) - if resolved_baseline_dataset is None: - raise ValueError( - "Cannot compute PE-native scores without a baseline dataset" - ) - native_scores_payload = compute_us_pe_native_scores( - candidate_dataset_path=dataset_path, - baseline_dataset_path=resolved_baseline_dataset, - period=( - config.get("policyengine_dataset_year") - or config.get("policyengine_target_period") - or 2024 - ), - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - if native_scores_payload is not None: - native_scores_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "09_validation_benchmarking", - "policyengine_native_scores", - ) - _write_json_atomically(native_scores_path, native_scores_payload) - artifacts["policyengine_native_scores"] = native_scores_path.name - manifest["policyengine_native_scores"] = dict( - native_scores_payload.get("summary", {}) - ) - elif require_policyengine_native_score: - raise ValueError( - "require_policyengine_native_score=True but no PE-native scores were computed" - ) - - imputation_ablation_path: Path | None = None - imputation_ablation_payload = ( - dict(precomputed_imputation_ablation_payload) - if precomputed_imputation_ablation_payload is not None - else None - ) - if ( - imputation_ablation_payload is None - and compute_imputation_ablation - and build_result is not None - ): - imputation_ablation_payload = _build_checkpoint_imputation_ablation_payload( - build_result, - artifact_id=artifact_root.name, - manifest=manifest, - ) - if imputation_ablation_payload is not None: - imputation_ablation_path = resolve_us_stage_artifact_contract_path( - artifact_root, - 
"09_validation_benchmarking", - "imputation_ablation", - ) - _write_json_atomically(imputation_ablation_path, imputation_ablation_payload) - artifacts["imputation_ablation"] = imputation_ablation_path.name - manifest["imputation_ablation"] = dict( - imputation_ablation_payload.get("summary", {}) - ) - - manifest["artifacts"] = artifacts - _attach_checkpoint_registry_and_index( - artifact_root, - manifest, - harness_path=harness_path, - harness_payload=harness_payload, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - ) - assert_valid_benchmark_artifact_manifest( - manifest, - artifact_dir=artifact_root, - manifest_path=manifest_path, - summary_section=( - "policyengine_harness" if "policyengine_harness" in manifest else None - ), - required_artifact_keys=( - "seed_data", - "synthetic_data", - "calibrated_data", - "targets", - *( - ("policyengine_harness",) - if artifacts.get("policyengine_harness") is not None - else () - ), - *( - ("policyengine_native_scores",) - if artifacts.get("policyengine_native_scores") is not None - else () - ), - ), - required_summary_keys=( - ( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - ) - if "policyengine_harness" in manifest - else () - ), - ) - resolved_program = program or default_policyengine_us_data_rebuild_program() - parity_path = write_policyengine_us_data_rebuild_parity_artifact( - artifact_root, - program=resolved_program, - ) - parity_payload = build_policyengine_us_data_rebuild_parity_artifact( - artifact_root, - program=resolved_program, - ) - native_audit_path: Path | None = None - native_audit_payload: dict[str, Any] | None = None - native_target_diagnostics_path: Path | None = None - native_target_diagnostics_payload: dict[str, Any] | None = None - if compute_native_audit and artifacts.get("policyengine_native_scores") is not None: - native_audit_payload = 
build_policyengine_us_data_rebuild_native_audit( - artifact_root, - manifest_payload=manifest, - native_scores_payload=native_scores_payload, - imputation_ablation_payload=imputation_ablation_payload, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - native_audit_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "09_validation_benchmarking", - "policyengine_native_audit", - ) - _write_json_atomically(native_audit_path, native_audit_payload) - artifacts["policyengine_native_audit"] = native_audit_path.name - manifest["policyengine_native_audit"] = dict( - native_audit_payload.get("verdictHints", {}) - ) - target_delta_payload = native_audit_payload.get("targetDelta") - if isinstance(target_delta_payload, dict): - native_target_diagnostics_payload = ( - build_us_pe_native_target_diagnostics_payload( - period=( - config.get("policyengine_dataset_year") - or config.get("policyengine_target_period") - or 2024 - ), - from_label="policyengine-us-data", - to_label="microplex-us", - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - policyengine_targets_db_path=config.get("policyengine_targets_db"), - target_delta_payload=target_delta_payload, - artifact_id=str( - native_audit_payload.get("artifactId") or artifact_root.name - ), - run_id=str( - native_audit_payload.get("artifactId") or artifact_root.name - ), - ) - ) - native_target_diagnostics_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "09_validation_benchmarking", - "policyengine_native_target_diagnostics", - ) - _write_json_atomically( - native_target_diagnostics_path, - native_target_diagnostics_payload, - ) - artifacts["policyengine_native_target_diagnostics"] = ( - native_target_diagnostics_path.name - ) - manifest["artifacts"] = artifacts - _refresh_checkpoint_data_flow_snapshot( - artifact_root, - manifest, - extra_outputs=tuple( - path.name - 
for path in ( - native_audit_path, - native_target_diagnostics_path, - ) - if path is not None - ), - ) - _write_json_atomically(manifest_path, manifest) - return PEUSDataRebuildCheckpointEvidenceResult( - artifact_dir=artifact_root, - manifest_path=manifest_path, - harness_path=harness_path, - native_scores_path=native_scores_path, - parity_path=parity_path, - parity_payload=parity_payload, - native_audit_path=native_audit_path, - native_audit_payload=native_audit_payload, - native_target_diagnostics_path=native_target_diagnostics_path, - native_target_diagnostics_payload=native_target_diagnostics_payload, - imputation_ablation_path=imputation_ablation_path, - imputation_ablation_payload=imputation_ablation_payload, - ) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_resume.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_resume.py deleted file mode 100644 index db3092d2..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_resume.py +++ /dev/null @@ -1,827 +0,0 @@ -"""Stage resume orchestration for PE-US-data checkpoint rebuilds.""" - -from __future__ import annotations - -import json -from collections.abc import Mapping -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import pandas as pd -from microplex.core import ObservationFrame, SourceQuery - -from microplex_us.pipelines.artifact_io import _stage_artifact_ref, _stage_diagnostics -from microplex_us.pipelines.artifacts import USMicroplexVersionedBuildArtifacts -from microplex_us.pipelines.pe_us_data_rebuild import PEUSDataRebuildProgram -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_artifacts import ( - _load_checkpoint_manifest, - _load_checkpoint_manifest_if_available, - _load_checkpoint_versioned_artifacts, - _load_resume_dataframe_artifact, - _load_resume_json_artifact, - _load_resume_policyengine_tables, - _load_resume_targets, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_common import ( - 
_emit_checkpoint_progress, - _write_json_atomically, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence import ( - attach_policyengine_us_data_rebuild_checkpoint_evidence, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_types import ( - PEUSDataRebuildCheckpointResult, -) -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - canonicalize_us_pipeline_stage_id, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_policyengine_artifacts import ( - write_us_policyengine_entity_stage_artifact, -) -from microplex_us.pipelines.stage_resume import ( - USStageResumeArtifactRequirement, - preflight_us_stage_resume, -) -from microplex_us.pipelines.stage_run import ( - USArtifactRef, - USCalibrationOutputs, - USDiagnosticOutput, - USPolicyEngineEntityOutputs, - USRunProfileOutputs, - USStageInputOverride, - resolve_us_manifest_or_contract_artifact_path, -) -from microplex_us.pipelines.stage_runtime import USStageRuntimeWriter -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - USMicroplexTargets, -) -from microplex_us.pipelines.versioned_artifacts import ( - _finalize_versioned_build_artifacts, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - save_us_pipeline_checkpoint, -) - -if TYPE_CHECKING: - from microplex.core import SourceProvider - from microplex.targets import TargetProvider - - from microplex_us.pipelines.registry import FrontierMetric - from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, - ) - - -def _resolve_checkpoint_resume_artifact_root( - output_root: str | Path, - *, - version_id: str | None, - resume_from_stage: str | None = None, -) -> Path: - root = Path(output_root).expanduser() - if version_id is not None: - return root / version_id - if (root / "manifest.json").exists(): - return root - if ( - 
resume_from_stage is not None - and canonicalize_us_pipeline_stage_id(resume_from_stage) == "01_run_profile" - ): - return root - raise ValueError( - "resume_from_stage requires --version-id unless --output-root points " - "directly at a saved artifact directory with manifest.json" - ) - - -def _is_artifact_backed_checkpoint_resume_stage(stage_id: str) -> bool: - return US_CANONICAL_STAGE_IDS.index(stage_id) >= US_CANONICAL_STAGE_IDS.index( - "06_policyengine_entities" - ) - - -def _resume_provider_context_from_manifest( - artifact_root: Path, - manifest: dict[str, Any], -) -> tuple[tuple[str, ...], dict[str, SourceQuery]]: - plan = _resume_provider_query_plan_from_manifest(artifact_root, manifest) - provider_names = _string_tuple(plan.get("provider_names")) - if not provider_names: - provider_names = _string_tuple(plan.get("source_names")) - if not provider_names: - provider_names = _string_tuple( - dict(manifest.get("synthesis", {})).get("source_names") - ) - queries = _resume_queries_from_provider_plan(plan) - if not provider_names and queries: - provider_names = tuple(queries) - return provider_names, queries - - -def _resume_provider_query_plan_from_manifest( - artifact_root: Path, - manifest: dict[str, Any], -) -> dict[str, Any]: - stage_manifests = manifest.get("stage_output_manifests") - stage_manifest_path = None - if isinstance(stage_manifests, Mapping): - stage_manifest_path = stage_manifests.get("01_run_profile") - path = ( - artifact_root / str(stage_manifest_path) - if isinstance(stage_manifest_path, str) - else artifact_root / "stage_artifacts" / "manifests" / "01_run_profile.json" - ) - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return {} - if not isinstance(payload, Mapping): - return {} - outputs = payload.get("outputs") - if not isinstance(outputs, Mapping): - return {} - plan = outputs.get("provider_query_plan") - return dict(plan) if isinstance(plan, Mapping) else {} - - -def 
_resume_queries_from_provider_plan( - provider_query_plan: Mapping[str, Any], -) -> dict[str, SourceQuery]: - queries_payload = provider_query_plan.get("queries") - if not isinstance(queries_payload, Mapping): - return {} - queries: dict[str, SourceQuery] = {} - for key, value in queries_payload.items(): - if not isinstance(key, str) or not isinstance(value, Mapping): - continue - provider_filters = value.get("provider_filters") - if provider_filters is None: - provider_filters = value.get("providerFilters") - queries[key] = SourceQuery( - provider_filters=( - dict(provider_filters) - if isinstance(provider_filters, Mapping) - else dict(value) - ) - ) - return queries - - -def _string_tuple(value: Any) -> tuple[str, ...]: - if not isinstance(value, (list, tuple)): - return () - return tuple(str(item) for item in value if item) - - -def _checkpoint_resume_extra_artifact_requirements( - resume_from_stage: str, -) -> tuple[USStageResumeArtifactRequirement, ...]: - stage_id = canonicalize_us_pipeline_stage_id(resume_from_stage) - stage_index = US_CANONICAL_STAGE_IDS.index(stage_id) - requirements: list[USStageResumeArtifactRequirement] = [] - if stage_index >= US_CANONICAL_STAGE_IDS.index("06_policyengine_entities"): - requirements.extend( - [ - USStageResumeArtifactRequirement( - "05_donor_integration_synthesis", - "seed_data", - "runner must hydrate seed rows before replaying downstream stages", - ), - USStageResumeArtifactRequirement( - "05_donor_integration_synthesis", - "synthetic_data", - "runner must hydrate candidate rows before replaying downstream stages", - ), - ] - ) - if stage_index >= US_CANONICAL_STAGE_IDS.index("08_dataset_assembly"): - requirements.extend( - [ - USStageResumeArtifactRequirement( - "07_calibration", - "calibrated_data", - "runner must hydrate calibrated rows before dataset assembly", - ), - USStageResumeArtifactRequirement( - "07_calibration", - "targets", - "runner must hydrate target payload before dataset assembly", - ), - 
USStageResumeArtifactRequirement( - "07_calibration", - "calibration_summary", - "runner must hydrate calibration summary before dataset assembly", - ), - USStageResumeArtifactRequirement( - "07_calibration", - "policyengine_entity_tables", - "runner must hydrate calibrated PE entity tables before dataset assembly", - ), - ] - ) - if stage_index >= US_CANONICAL_STAGE_IDS.index("09_validation_benchmarking"): - requirements.append( - USStageResumeArtifactRequirement( - "08_dataset_assembly", - "policyengine_dataset", - "validation and benchmark evidence require the assembled H5 dataset", - ) - ) - return tuple(requirements) - - -def _resume_policyengine_table_summary( - tables: PolicyEngineUSEntityTableBundle, -) -> dict[str, Any]: - return { - "households": int(len(tables.households)), - "persons": int(len(tables.persons)), - "tax_units": int(len(tables.tax_units)), - "spm_units": int(len(tables.spm_units)), - "families": int(len(tables.families)), - "marital_units": int(len(tables.marital_units)), - } - - -def _resume_target_ledger(targets: USMicroplexTargets) -> dict[str, Any]: - return { - "n_marginal_groups": len(targets.marginal), - "n_continuous": len(targets.continuous), - "marginal_keys": sorted(targets.marginal.keys()), - "continuous_keys": sorted(targets.continuous.keys()), - } - - -def _load_checkpoint_source_frames( - providers: tuple[SourceProvider, ...], - queries: dict[str, SourceQuery], -) -> list[ObservationFrame]: - pipeline = USMicroplexPipeline() - frames: list[ObservationFrame] = [] - for provider in providers: - frame = provider.load_frame( - pipeline._resolve_source_query(provider, queries or {}) - ) - frames.append(frame) - return frames - - -def _complete_resume_run_profile_stage( - *, - stage_runtime_writer: USStageRuntimeWriter, - config: USMicroplexBuildConfig, - version_id: str, - provider_names: tuple[str, ...], - queries: dict[str, SourceQuery], -) -> None: - stage_runtime_writer.start_stage( - "01_run_profile", - 
metadata={"version_id": version_id, "resume": True}, - ) - stage_runtime_writer.complete_stage( - USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config=config.to_dict(), - provider_query_plan={ - "provider_names": list(provider_names), - "queries": { - key: ( - query.to_dict() - if hasattr(query, "to_dict") - else dict(getattr(query, "__dict__", {})) - ) - for key, query in queries.items() - }, - }, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description="Runtime run-profile summary.", - summary={ - "provider_names": list(provider_names), - "version_id": version_id, - "resume": True, - }, - ) - }, - ) - ) - - -def _resume_checkpoint_build_from_source_stage( - *, - artifact_root: Path, - resume_from_stage: str, - config: USMicroplexBuildConfig, - providers: tuple[SourceProvider, ...], - queries: dict[str, SourceQuery], - stage_runtime_writer: USStageRuntimeWriter, - provider_names: tuple[str, ...], -) -> USMicroplexBuildResult: - pipeline = USMicroplexPipeline(config, stage_runtime_writer=stage_runtime_writer) - if resume_from_stage == "01_run_profile": - _complete_resume_run_profile_stage( - stage_runtime_writer=stage_runtime_writer, - config=config, - version_id=artifact_root.name, - provider_names=provider_names, - queries=queries, - ) - return pipeline.build_from_source_providers(list(providers), queries=queries) - if resume_from_stage == "02_source_loading": - return pipeline.build_from_source_providers(list(providers), queries=queries) - - frames = _load_checkpoint_source_frames(providers, queries) - restored_scaffold_seed_data = None - if resume_from_stage == "05_donor_integration_synthesis": - manifest = _load_checkpoint_manifest(artifact_root) - restored_scaffold_seed_data = _load_resume_dataframe_artifact( - artifact_root, - manifest, - "scaffold_seed_data", - stage_id="04_seed_scaffold", - ) - return 
pipeline.build_from_frames( - frames, - resume_from_stage=resume_from_stage, - restored_scaffold_seed_data=restored_scaffold_seed_data, - ) - - -def _load_resume_build_result_base( - *, - artifact_root: Path, - config: USMicroplexBuildConfig, -) -> tuple[dict[str, Any], pd.DataFrame, pd.DataFrame, pd.DataFrame | None, USMicroplexTargets]: - manifest = _load_checkpoint_manifest_if_available(artifact_root) - seed_data = _load_resume_dataframe_artifact( - artifact_root, - manifest, - "seed_data", - stage_id="05_donor_integration_synthesis", - ) - synthetic_data = _load_resume_dataframe_artifact( - artifact_root, - manifest, - "synthetic_data", - stage_id="05_donor_integration_synthesis", - ) - scaffold_seed_data_path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - "scaffold_seed_data", - stage_id="04_seed_scaffold", - ) - scaffold_seed_data = ( - pd.read_parquet(scaffold_seed_data_path) - if scaffold_seed_data_path.exists() - else None - ) - targets = _load_resume_targets( - artifact_root, - manifest, - config=config, - seed_data=seed_data, - ) - return manifest, seed_data, synthetic_data, scaffold_seed_data, targets - - -def _run_checkpoint_policyengine_entity_resume_stage( - *, - pipeline: USMicroplexPipeline, - synthetic_data: pd.DataFrame, -) -> PolicyEngineUSEntityTableBundle: - pipeline._runtime_start_stage("06_policyengine_entities") - try: - synthetic_tables = pipeline.build_policyengine_entity_tables(synthetic_data) - if pipeline.config.pipeline_checkpoint_save_post_imputation_path is not None: - save_us_pipeline_checkpoint( - synthetic_tables, - pipeline.config.pipeline_checkpoint_save_post_imputation_path, - stage="post_imputation", - ) - pipeline._check_policyengine_export_column_contract( - synthetic_tables, - stage="pre_calibration", - ) - if pipeline.stage_runtime_writer is not None: - write_us_policyengine_entity_stage_artifact( - synthetic_tables, - pipeline.stage_runtime_writer.artifact_root, - 
stage_id="06_policyengine_entities", - artifact_key="pre_calibration_policyengine_entity_tables", - checkpoint_stage="post_microsim", - ) - entity_summary = _resume_policyengine_table_summary(synthetic_tables) - pipeline.stage_runtime_writer.complete_stage( - USPolicyEngineEntityOutputs( - pre_calibration_policyengine_entity_tables=_stage_artifact_ref( - pipeline.stage_runtime_writer.artifact_root, - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - ), - materialized_policyengine_inputs=entity_summary, - diagnostics=_stage_diagnostics( - "06_policyengine_entities", - entity_summary, - ), - ) - ) - except Exception as exc: - pipeline._runtime_fail_stage("06_policyengine_entities", exc) - raise - return synthetic_tables - - -def _run_checkpoint_calibration_resume_stage( - *, - pipeline: USMicroplexPipeline, - synthetic_data: pd.DataFrame, - synthetic_tables: PolicyEngineUSEntityTableBundle, - targets: USMicroplexTargets, -) -> tuple[PolicyEngineUSEntityTableBundle, pd.DataFrame, dict[str, Any]]: - pipeline._runtime_start_stage("07_calibration") - try: - if pipeline._has_policyengine_calibration_targets(): - policyengine_tables, calibrated_data, calibration_summary = ( - pipeline.calibrate_policyengine_tables(synthetic_tables) - ) - else: - calibrated_data, calibration_summary = pipeline.calibrate( - synthetic_data, - targets, - ) - policyengine_tables = pipeline.build_policyengine_entity_tables( - calibrated_data - ) - if pipeline.stage_runtime_writer is not None: - artifact_root = pipeline.stage_runtime_writer.artifact_root - write_us_policyengine_entity_stage_artifact( - policyengine_tables, - artifact_root, - stage_id="07_calibration", - artifact_key="policyengine_entity_tables", - checkpoint_stage="post_calibration", - ) - calibrated_data_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "07_calibration", - "calibrated_data", - ) - targets_path = resolve_us_stage_artifact_contract_path( - artifact_root, - 
"07_calibration", - "targets", - ) - calibration_summary_path = resolve_us_stage_artifact_contract_path( - artifact_root, - "07_calibration", - "calibration_summary", - ) - calibrated_data_path.parent.mkdir(parents=True, exist_ok=True) - targets_path.parent.mkdir(parents=True, exist_ok=True) - calibration_summary_path.parent.mkdir(parents=True, exist_ok=True) - calibrated_data.to_parquet(calibrated_data_path, index=False) - _write_json_atomically( - targets_path, - { - "marginal": targets.marginal, - "continuous": targets.continuous, - }, - ) - _write_json_atomically(calibration_summary_path, calibration_summary) - target_ledger = _resume_target_ledger(targets) - pipeline.stage_runtime_writer.complete_stage( - USCalibrationOutputs( - calibrated_data=_stage_artifact_ref( - artifact_root, - "07_calibration", - "calibrated_data", - ), - targets=_stage_artifact_ref( - artifact_root, - "07_calibration", - "targets", - ), - calibration_summary=_stage_artifact_ref( - artifact_root, - "07_calibration", - "calibration_summary", - ), - policyengine_entity_tables=_stage_artifact_ref( - artifact_root, - "07_calibration", - "policyengine_entity_tables", - ), - target_ledger=target_ledger, - diagnostics=_stage_diagnostics( - "07_calibration", - { - "calibrated_rows": int(len(calibrated_data)), - "backend": pipeline.config.calibration_backend, - **target_ledger, - }, - ), - ) - ) - except Exception as exc: - pipeline._runtime_fail_stage("07_calibration", exc) - raise - return policyengine_tables, calibrated_data, calibration_summary - - -def _resume_checkpoint_build_from_saved_stage( - *, - artifact_root: Path, - resume_from_stage: str, - config: USMicroplexBuildConfig, - stage_runtime_writer: USStageRuntimeWriter, -) -> USMicroplexBuildResult: - ( - manifest, - seed_data, - synthetic_data, - scaffold_seed_data, - targets, - ) = _load_resume_build_result_base(artifact_root=artifact_root, config=config) - synthesis_metadata = dict(manifest.get("synthesis", {})) - 
synthesis_metadata["stage_resume"] = { - "source_artifact_dir": str(artifact_root), - "resume_from_stage": resume_from_stage, - } - pipeline = USMicroplexPipeline(config, stage_runtime_writer=stage_runtime_writer) - stage_index = US_CANONICAL_STAGE_IDS.index(resume_from_stage) - - if stage_index <= US_CANONICAL_STAGE_IDS.index("06_policyengine_entities"): - pre_calibration_tables = _run_checkpoint_policyengine_entity_resume_stage( - pipeline=pipeline, - synthetic_data=synthetic_data, - ) - else: - pre_calibration_tables = _load_resume_policyengine_tables( - artifact_root, - manifest, - "pre_calibration_policyengine_entity_tables", - stage_id="06_policyengine_entities", - expected_stage="post_microsim", - ) - - if stage_index <= US_CANONICAL_STAGE_IDS.index("07_calibration"): - policyengine_tables, calibrated_data, calibration_summary = ( - _run_checkpoint_calibration_resume_stage( - pipeline=pipeline, - synthetic_data=synthetic_data, - synthetic_tables=pre_calibration_tables, - targets=targets, - ) - ) - else: - calibrated_data = _load_resume_dataframe_artifact( - artifact_root, - manifest, - "calibrated_data", - stage_id="07_calibration", - ) - calibration_summary = _load_resume_json_artifact( - artifact_root, - manifest, - "calibration_summary", - stage_id="07_calibration", - ) - policyengine_tables = _load_resume_policyengine_tables( - artifact_root, - manifest, - "policyengine_entity_tables", - stage_id="07_calibration", - expected_stage="post_calibration", - ) - - return USMicroplexBuildResult( - config=config, - seed_data=seed_data, - synthetic_data=synthetic_data, - calibrated_data=calibrated_data, - targets=targets, - calibration_summary=calibration_summary, - synthesis_metadata=synthesis_metadata, - policyengine_tables=policyengine_tables, - pre_calibration_policyengine_tables=pre_calibration_tables, - scaffold_seed_data=scaffold_seed_data, - ) - - -def _run_policyengine_us_data_rebuild_checkpoint_resume( - *, - output_root: str | Path, - version_id: str | 
None, - resume_from_stage: str, - resolved_config: USMicroplexBuildConfig, - program: PEUSDataRebuildProgram, - resolved_providers: tuple[SourceProvider, ...], - provider_names: tuple[str, ...], - resolved_queries: dict[str, SourceQuery], - frontier_metric: FrontierMetric, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None, - policyengine_target_provider: TargetProvider | None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ), - resolved_harness_metadata: dict[str, Any], - policyengine_us_data_repo: str | Path | None, - policyengine_us_data_python: str | Path | None, - defer_policyengine_harness: bool, - require_policyengine_native_score: bool, - defer_policyengine_native_score: bool, - defer_native_audit: bool, - defer_imputation_ablation: bool, - precomputed_policyengine_harness_payload: dict[str, Any] | None, - precomputed_policyengine_native_scores: dict[str, Any] | None, - precomputed_imputation_ablation_payload: dict[str, Any] | None, - run_registry_path: str | Path | None, - run_index_path: str | Path | None, - resolved_registry_metadata: dict[str, Any], - allow_stage_input_overrides: bool, - stage_input_overrides: tuple[USStageInputOverride, ...], -) -> PEUSDataRebuildCheckpointResult: - resume_stage_id = canonicalize_us_pipeline_stage_id(resume_from_stage) - artifact_root = _resolve_checkpoint_resume_artifact_root( - output_root, - version_id=version_id, - resume_from_stage=resume_stage_id, - ) - preflight = preflight_us_stage_resume( - artifact_root, - resume_stage_id, - extra_required_artifacts=_checkpoint_resume_extra_artifact_requirements( - resume_stage_id - ), - ) - preflight.raise_for_missing() - manifest = _load_checkpoint_manifest_if_available(artifact_root) - stage_runtime_writer = USStageRuntimeWriter( - artifact_root, - manifest_payload=manifest, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - 
_emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: resuming build", - artifact_dir=artifact_root, - resume_from_stage=resume_stage_id, - providers=",".join(provider_names), - ) - - build_result: USMicroplexBuildResult | None = None - if resume_stage_id in { - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - }: - build_result = _resume_checkpoint_build_from_source_stage( - artifact_root=artifact_root, - resume_from_stage=resume_stage_id, - config=resolved_config, - providers=resolved_providers, - queries=resolved_queries, - stage_runtime_writer=stage_runtime_writer, - provider_names=provider_names, - ) - elif resume_stage_id in { - "06_policyengine_entities", - "07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", - }: - build_result = _resume_checkpoint_build_from_saved_stage( - artifact_root=artifact_root, - resume_from_stage=resume_stage_id, - config=resolved_config, - stage_runtime_writer=stage_runtime_writer, - ) - - artifacts: USMicroplexVersionedBuildArtifacts | None = None - if build_result is not None and resume_stage_id != "09_validation_benchmarking": - artifacts = _finalize_versioned_build_artifacts( - build_result, - output_root=artifact_root.parent, - version_id=artifact_root.name, - preallocated_output_dir=artifact_root, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=resolved_config.policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=True, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=True, - precomputed_policyengine_harness_payload=None, - 
precomputed_policyengine_native_scores=None, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=resolved_registry_metadata, - enable_child_tax_unit_agi_drift=True, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - stage_runtime_writer=stage_runtime_writer, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: resumed build complete", - artifact_dir=artifact_root, - resume_from_stage=resume_stage_id, - frontier_metric=frontier_metric, - ) - - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: attaching PE evidence", - artifact_dir=artifact_root, - compute_harness=not defer_policyengine_harness, - compute_native_scores=not defer_policyengine_native_score, - compute_native_audit=not defer_native_audit, - compute_imputation_ablation=not defer_imputation_ablation, - ) - evidence = attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_root, - build_result=build_result, - program=program, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=resolved_config.policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - compute_harness=not defer_policyengine_harness, - compute_native_scores=not defer_policyengine_native_score, - compute_native_audit=not defer_native_audit, - compute_imputation_ablation=not defer_imputation_ablation, - require_policyengine_native_score=require_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - 
precomputed_imputation_ablation_payload=precomputed_imputation_ablation_payload, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=resolved_registry_metadata, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: evidence complete", - parity_path=evidence.parity_path, - native_audit_path=evidence.native_audit_path, - native_target_diagnostics_path=getattr( - evidence, - "native_target_diagnostics_path", - None, - ), - imputation_ablation_path=evidence.imputation_ablation_path, - ) - refreshed_artifacts = _load_checkpoint_versioned_artifacts( - build_result=( - artifacts.build_result - if artifacts is not None - else build_result - ), - artifact_root=artifact_root, - frontier_metric=frontier_metric, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: checkpoint ready", - artifact_dir=refreshed_artifacts.artifact_paths.output_dir, - ) - return PEUSDataRebuildCheckpointResult( - build_config=resolved_config, - provider_names=provider_names, - queries=resolved_queries, - artifacts=refreshed_artifacts, - parity_path=evidence.parity_path, - parity_payload=evidence.parity_payload, - native_audit_path=evidence.native_audit_path, - native_audit_payload=evidence.native_audit_payload, - native_target_diagnostics_path=getattr( - evidence, - "native_target_diagnostics_path", - None, - ), - native_target_diagnostics_payload=getattr( - evidence, - "native_target_diagnostics_payload", - None, - ), - imputation_ablation_path=evidence.imputation_ablation_path, - imputation_ablation_payload=evidence.imputation_ablation_payload, - ) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_runner.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_runner.py deleted file mode 100644 index 7252ec41..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_runner.py +++ /dev/null @@ -1,465 +0,0 @@ -"""Top-level PE-US-data checkpoint rebuild runner.""" - -from __future__ import 
annotations - -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal - -from microplex.core import SourceQuery - -from microplex_us.pipelines.artifacts import ( - build_and_save_versioned_us_microplex_from_source_providers, -) -from microplex_us.pipelines.pe_us_data_rebuild import ( - default_policyengine_us_data_rebuild_program, - default_policyengine_us_data_rebuild_source_providers, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_artifacts import ( - _load_checkpoint_manifest, - _load_checkpoint_versioned_artifacts, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_common import ( - _emit_checkpoint_progress, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config import ( - _validate_checkpoint_config_context, - _validate_query_keys, - default_policyengine_us_data_rebuild_checkpoint_config, - default_policyengine_us_data_rebuild_queries, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence import ( - attach_policyengine_us_data_rebuild_checkpoint_evidence, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_resume import ( - _checkpoint_resume_extra_artifact_requirements, - _is_artifact_backed_checkpoint_resume_stage, - _resolve_checkpoint_resume_artifact_root, - _resume_provider_context_from_manifest, - _run_policyengine_us_data_rebuild_checkpoint_resume, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_types import ( - PEUSDataRebuildCheckpointResult, -) -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - canonicalize_us_pipeline_stage_id, -) -from microplex_us.pipelines.stage_resume import preflight_us_stage_resume -from microplex_us.pipelines.stage_run import USStageInputOverride -from microplex_us.pipelines.us import USMicroplexBuildConfig - -if TYPE_CHECKING: - from microplex.core import SourceProvider - from microplex.targets import TargetProvider - - from microplex_us.pipelines.registry import FrontierMetric - from 
microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, - ) - - -def run_policyengine_us_data_rebuild_checkpoint( - output_root: str | Path, - *, - policyengine_baseline_dataset: str | Path, - policyengine_targets_db: str | Path, - arch_targets_db: str | Path | tuple[str | Path, ...] | None = None, - calibration_target_source: Literal["policyengine", "arch"] = "policyengine", - target_period: int = 2024, - target_profile: str = "pe_native_broad", - calibration_target_profile: str | None = None, - target_variables: tuple[str, ...] = (), - target_domains: tuple[str, ...] = (), - target_geo_levels: tuple[str, ...] = (), - calibration_target_variables: tuple[str, ...] = (), - calibration_target_domains: tuple[str, ...] = (), - calibration_target_geo_levels: tuple[str, ...] = (), - config: USMicroplexBuildConfig | None = None, - config_overrides: dict[str, Any] | None = None, - providers: tuple[SourceProvider, ...] | list[SourceProvider] | None = None, - queries: dict[str, SourceQuery] | None = None, - cps_source_year: int = 2023, - cps_cache_dir: str | Path | None = None, - cps_download: bool = True, - puf_target_year: int | None = None, - puf_cps_reference_year: int | None = None, - puf_cache_dir: str | Path | None = None, - puf_path: str | Path | None = None, - puf_demographics_path: str | Path | None = None, - puf_expand_persons: bool = True, - include_donor_surveys: bool = True, - include_sipp: bool | None = None, - include_scf: bool | None = None, - acs_year: int = 2024, - sipp_year: int = 2023, - scf_year: int = 2022, - donor_cache_dir: str | Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - cps_sample_n: int | None = None, - puf_sample_n: int | None = None, - donor_sample_n: int | None = None, - query_random_seed: int = 0, - version_id: str | None = None, - frontier_metric: FrontierMetric = 
"full_oracle_capped_mean_abs_relative_error", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - defer_native_audit: bool = False, - defer_imputation_ablation: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - precomputed_imputation_ablation_payload: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), - resume_from_stage: str | None = None, -) -> PEUSDataRebuildCheckpointResult: - """Run one saved rebuild checkpoint and write its PE comparison sidecars.""" - - if config is not None and config_overrides: - raise ValueError( - "config_overrides cannot be used when an explicit config is supplied" - ) - resolved_config = config or default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_targets_db=policyengine_targets_db, - arch_targets_db=arch_targets_db, - calibration_target_source=calibration_target_source, - target_period=target_period, - target_profile=target_profile, - calibration_target_profile=calibration_target_profile, - target_variables=target_variables, - target_domains=target_domains, - target_geo_levels=target_geo_levels, - calibration_target_variables=calibration_target_variables, - calibration_target_domains=calibration_target_domains, - calibration_target_geo_levels=calibration_target_geo_levels, - **dict(config_overrides or {}), - ) - if config is not None: - _validate_checkpoint_config_context( - resolved_config, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_targets_db=policyengine_targets_db, - arch_targets_db=arch_targets_db, - calibration_target_source=calibration_target_source, - target_period=target_period, - target_profile=target_profile, - calibration_target_profile=calibration_target_profile, - target_variables=target_variables, - target_domains=target_domains, - target_geo_levels=target_geo_levels, - calibration_target_variables=calibration_target_variables, - calibration_target_domains=calibration_target_domains, - calibration_target_geo_levels=calibration_target_geo_levels, - ) - resume_stage_id = ( - canonicalize_us_pipeline_stage_id(resume_from_stage) - if resume_from_stage is not None - else None - ) - if resume_stage_id is not None and resume_stage_id not in US_CANONICAL_STAGE_IDS: - raise ValueError(f"Unknown US 
pipeline stage: {resume_from_stage}") - if ( - resume_stage_id is not None - and _is_artifact_backed_checkpoint_resume_stage(resume_stage_id) - ): - artifact_root = _resolve_checkpoint_resume_artifact_root( - output_root, - version_id=version_id, - resume_from_stage=resume_stage_id, - ) - preflight = preflight_us_stage_resume( - artifact_root, - resume_stage_id, - extra_required_artifacts=_checkpoint_resume_extra_artifact_requirements( - resume_stage_id - ), - ) - preflight.raise_for_missing() - manifest = _load_checkpoint_manifest(artifact_root) - provider_names, resolved_queries = _resume_provider_context_from_manifest( - artifact_root, - manifest, - ) - program = default_policyengine_us_data_rebuild_program() - if ( - policyengine_us_data_python is not None - and not defer_policyengine_native_score - and precomputed_policyengine_native_scores is None - ): - raise ValueError( - "policyengine_us_data_python requires defer_policyengine_native_score=True " - "or precomputed_policyengine_native_scores because the automatic native-score " - "save path cannot yet honor a custom PE-US-data interpreter" - ) - resolved_harness_metadata = { - "rebuild_checkpoint": True, - "rebuild_program_id": program.program_id, - "rebuild_provider_names": list(provider_names), - **dict(policyengine_harness_metadata or {}), - } - resolved_registry_metadata = { - "rebuild_checkpoint": True, - "rebuild_program_id": program.program_id, - "rebuild_provider_names": list(provider_names), - "rebuild_profile_expected": True, - **dict(run_registry_metadata or {}), - } - return _run_policyengine_us_data_rebuild_checkpoint_resume( - output_root=output_root, - version_id=version_id, - resume_from_stage=resume_stage_id, - resolved_config=resolved_config, - program=program, - resolved_providers=(), - provider_names=provider_names, - resolved_queries=resolved_queries, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - 
policyengine_target_provider=policyengine_target_provider, - policyengine_harness_slices=policyengine_harness_slices, - resolved_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - defer_native_audit=defer_native_audit, - defer_imputation_ablation=defer_imputation_ablation, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - precomputed_imputation_ablation_payload=precomputed_imputation_ablation_payload, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - resolved_registry_metadata=resolved_registry_metadata, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - if providers is None: - resolved_providers = tuple( - default_policyengine_us_data_rebuild_source_providers( - cps_source_year=cps_source_year, - cps_cache_dir=cps_cache_dir, - cps_download=cps_download, - puf_target_year=( - int(puf_target_year) - if puf_target_year is not None - else int(target_period) - ), - puf_cps_reference_year=puf_cps_reference_year, - puf_cache_dir=puf_cache_dir, - puf_path=puf_path, - puf_demographics_path=puf_demographics_path, - puf_expand_persons=puf_expand_persons, - include_donor_surveys=include_donor_surveys, - include_sipp=include_sipp, - include_scf=include_scf, - acs_year=acs_year, - sipp_year=sipp_year, - scf_year=scf_year, - donor_cache_dir=donor_cache_dir, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - ) - ) - else: - resolved_providers = tuple(providers) - if not resolved_providers: - raise ValueError( - 
"providers must be None or a non-empty provider sequence for a rebuild checkpoint" - ) - resolved_queries = ( - dict(queries) - if queries is not None - else default_policyengine_us_data_rebuild_queries( - resolved_providers, - cps_sample_n=cps_sample_n, - puf_sample_n=puf_sample_n, - donor_sample_n=donor_sample_n, - random_seed=query_random_seed, - ) - ) - program = default_policyengine_us_data_rebuild_program() - provider_names = tuple(provider.descriptor.name for provider in resolved_providers) - _validate_query_keys(provider_names, resolved_queries) - if ( - policyengine_us_data_python is not None - and not defer_policyengine_native_score - and precomputed_policyengine_native_scores is None - ): - raise ValueError( - "policyengine_us_data_python requires defer_policyengine_native_score=True " - "or precomputed_policyengine_native_scores because the automatic native-score " - "save path cannot yet honor a custom PE-US-data interpreter" - ) - resolved_harness_metadata = { - "rebuild_checkpoint": True, - "rebuild_program_id": program.program_id, - "rebuild_provider_names": list(provider_names), - **dict(policyengine_harness_metadata or {}), - } - resolved_registry_metadata = { - "rebuild_checkpoint": True, - "rebuild_program_id": program.program_id, - "rebuild_provider_names": list(provider_names), - "rebuild_profile_expected": True, - **dict(run_registry_metadata or {}), - } - if resume_from_stage is not None: - return _run_policyengine_us_data_rebuild_checkpoint_resume( - output_root=output_root, - version_id=version_id, - resume_from_stage=resume_from_stage, - resolved_config=resolved_config, - program=program, - resolved_providers=resolved_providers, - provider_names=provider_names, - resolved_queries=resolved_queries, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_harness_slices=policyengine_harness_slices, - 
resolved_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - defer_native_audit=defer_native_audit, - defer_imputation_ablation=defer_imputation_ablation, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - precomputed_imputation_ablation_payload=precomputed_imputation_ablation_payload, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - resolved_registry_metadata=resolved_registry_metadata, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: starting build", - output_root=Path(output_root).expanduser(), - version_id=version_id or "auto", - target_profile=resolved_config.policyengine_target_profile, - calibration_target_profile=( - resolved_config.policyengine_calibration_target_profile - ), - calibration_target_source=resolved_config.calibration_target_source, - donor_condition_selection=resolved_config.donor_imputer_condition_selection, - providers=",".join(provider_names), - ) - - artifacts = build_and_save_versioned_us_microplex_from_source_providers( - providers=list(resolved_providers), - output_root=output_root, - config=resolved_config, - queries=resolved_queries or None, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=resolved_config.policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - 
policyengine_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=True, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=True, - precomputed_policyengine_harness_payload=None, - precomputed_policyengine_native_scores=None, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=resolved_registry_metadata, - enable_child_tax_unit_agi_drift=True, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: build complete", - artifact_dir=artifacts.artifact_paths.output_dir, - frontier_metric=frontier_metric, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: attaching PE evidence", - artifact_dir=artifacts.artifact_paths.output_dir, - compute_harness=not defer_policyengine_harness, - compute_native_scores=not defer_policyengine_native_score, - compute_native_audit=not defer_native_audit, - compute_imputation_ablation=not defer_imputation_ablation, - ) - evidence = attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifacts.artifact_paths.output_dir, - build_result=artifacts.build_result, - program=program, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=resolved_config.policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=resolved_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - compute_harness=not defer_policyengine_harness, - compute_native_scores=not defer_policyengine_native_score, - compute_native_audit=not defer_native_audit, - compute_imputation_ablation=not defer_imputation_ablation, - 
require_policyengine_native_score=require_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - precomputed_imputation_ablation_payload=precomputed_imputation_ablation_payload, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=resolved_registry_metadata, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: evidence complete", - parity_path=evidence.parity_path, - native_audit_path=evidence.native_audit_path, - native_target_diagnostics_path=getattr( - evidence, - "native_target_diagnostics_path", - None, - ), - imputation_ablation_path=evidence.imputation_ablation_path, - ) - refreshed_artifacts = _load_checkpoint_versioned_artifacts( - build_result=artifacts.build_result, - artifact_root=artifacts.artifact_paths.output_dir, - frontier_metric=frontier_metric, - ) - _emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: checkpoint ready", - artifact_dir=refreshed_artifacts.artifact_paths.output_dir, - ) - return PEUSDataRebuildCheckpointResult( - build_config=resolved_config, - provider_names=provider_names, - queries=resolved_queries, - artifacts=refreshed_artifacts, - parity_path=evidence.parity_path, - parity_payload=evidence.parity_payload, - native_audit_path=evidence.native_audit_path, - native_audit_payload=evidence.native_audit_payload, - native_target_diagnostics_path=getattr( - evidence, - "native_target_diagnostics_path", - None, - ), - native_target_diagnostics_payload=getattr( - evidence, - "native_target_diagnostics_payload", - None, - ), - imputation_ablation_path=evidence.imputation_ablation_path, - imputation_ablation_payload=evidence.imputation_ablation_payload, - ) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_types.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_types.py deleted file mode 100644 index 
fd686018..00000000 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint_types.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Result types for the PE-US-data checkpoint runner.""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from microplex.core import SourceQuery - - from microplex_us.pipelines.artifacts import USMicroplexVersionedBuildArtifacts - from microplex_us.pipelines.us import USMicroplexBuildConfig - - -@dataclass(frozen=True) -class PEUSDataRebuildCheckpointResult: - """Saved artifact bundle plus attached PE comparison sidecars.""" - - build_config: USMicroplexBuildConfig - provider_names: tuple[str, ...] - queries: dict[str, SourceQuery] - artifacts: USMicroplexVersionedBuildArtifacts - parity_path: Path - parity_payload: dict[str, Any] - native_audit_path: Path | None = None - native_audit_payload: dict[str, Any] | None = None - native_target_diagnostics_path: Path | None = None - native_target_diagnostics_payload: dict[str, Any] | None = None - imputation_ablation_path: Path | None = None - imputation_ablation_payload: dict[str, Any] | None = None - - -@dataclass(frozen=True) -class PEUSDataRebuildCheckpointEvidenceResult: - """Comparison evidence attached to one saved rebuild artifact.""" - - artifact_dir: Path - manifest_path: Path - harness_path: Path | None - native_scores_path: Path | None - parity_path: Path - parity_payload: dict[str, Any] - native_audit_path: Path | None = None - native_audit_payload: dict[str, Any] | None = None - native_target_diagnostics_path: Path | None = None - native_target_diagnostics_payload: dict[str, Any] | None = None - imputation_ablation_path: Path | None = None - imputation_ablation_payload: dict[str, Any] | None = None diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_parity.py b/src/microplex_us/pipelines/pe_us_data_rebuild_parity.py deleted file mode 100644 index 5d8aea18..00000000 --- 
a/src/microplex_us/pipelines/pe_us_data_rebuild_parity.py +++ /dev/null @@ -1,382 +0,0 @@ -"""Artifact-backed parity summaries for the PE-US-data rebuild track.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.pe_us_data_rebuild import ( - PEUSDataRebuildProgram, - default_policyengine_us_data_rebuild_config, - default_policyengine_us_data_rebuild_program, -) -from microplex_us.pipelines.stage_run import ( - resolve_us_manifest_or_contract_artifact_path, -) - -_HARNESS_SUMMARY_KEYS = ( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - "candidate_composite_parity_loss", - "baseline_composite_parity_loss", - "composite_parity_loss_delta", - "slice_win_rate", - "target_win_rate", - "supported_target_rate", - "baseline_supported_target_rate", - "tag_summaries", -) - -_NATIVE_SUMMARY_KEYS = ( - "candidate_enhanced_cps_native_loss", - "baseline_enhanced_cps_native_loss", - "enhanced_cps_native_loss_delta", - "candidate_beats_baseline", - "candidate_unweighted_msre", - "baseline_unweighted_msre", - "unweighted_msre_delta", - "n_targets_total", - "n_targets_kept", - "n_targets_zero_dropped", - "n_targets_bad_dropped", - "n_national_targets", - "n_state_targets", -) - -_IMPUTATION_SUMMARY_KEYS = ( - "source_count", - "skipped_source_count", - "target_count", - "production_variant", - "production_mean_weighted_mae", - "production_mean_support_f1", - "best_mean_weighted_mae_variant", - "best_mean_support_f1_variant", - "variant_scorecard", -) - -_PROFILE_CONTEXT_KEYS = { - "cps_asec_cache_dir", - "policyengine_baseline_dataset", - "policyengine_dataset", - "policyengine_targets_db", -} -_POLICYENGINE_HARNESS_BASELINE_LABEL = "policyengine_us_data" -_POLICYENGINE_NATIVE_METRIC = "enhanced_cps_native_loss" - - -def build_policyengine_us_data_rebuild_parity_artifact( - artifact_dir: str | Path, - *, - program: 
PEUSDataRebuildProgram | None = None, - manifest_payload: dict[str, Any] | None = None, - harness_payload: dict[str, Any] | None = None, - native_scores_payload: dict[str, Any] | None = None, - imputation_ablation_payload: dict[str, Any] | None = None, -) -> dict[str, Any]: - """Build a compact rebuild-parity sidecar from one saved artifact bundle.""" - - artifact_root = Path(artifact_dir) - manifest_source = _resolve_payload_source( - artifact_root / "manifest.json", - override_supplied=manifest_payload is not None, - ) - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - harness_path = _resolve_stage_artifact_path( - artifact_root, - manifest, - "policyengine_harness", - stage_id="09_validation_benchmarking", - ) - harness_source = _resolve_payload_source( - harness_path, - override_supplied=harness_payload is not None, - ) - harness = ( - dict(harness_payload) - if harness_payload is not None - else _load_optional_json(harness_path) - ) - native_scores_path = _resolve_stage_artifact_path( - artifact_root, - manifest, - "policyengine_native_scores", - stage_id="09_validation_benchmarking", - ) - native_scores_source = _resolve_payload_source( - native_scores_path, - override_supplied=native_scores_payload is not None, - ) - native_scores = ( - dict(native_scores_payload) - if native_scores_payload is not None - else _load_optional_json(native_scores_path) - ) - imputation_ablation_path = _resolve_stage_artifact_path( - artifact_root, - manifest, - "imputation_ablation", - stage_id="09_validation_benchmarking", - ) - imputation_ablation_source = _resolve_payload_source( - imputation_ablation_path, - override_supplied=imputation_ablation_payload is not None, - ) - imputation_ablation = ( - dict(imputation_ablation_payload) - if imputation_ablation_payload is not None - else _load_optional_json(imputation_ablation_path) - ) - - resolved_program = program or 
default_policyengine_us_data_rebuild_program() - config = _normalize_observed_config(dict(manifest.get("config", {}))) - default_config = default_policyengine_us_data_rebuild_config().to_dict() - harness_summary = dict(harness.get("summary", {})) if harness is not None else {} - native_summary = ( - dict(native_scores.get("summary", {})) if native_scores is not None else {} - ) - imputation_summary = ( - dict(imputation_ablation.get("summary", {})) - if imputation_ablation is not None - else {} - ) - baseline_dataset_path = config.get("policyengine_baseline_dataset") - harness_is_pe_comparison = bool( - baseline_dataset_path - and harness is not None - and harness.get("baseline_label") == _POLICYENGINE_HARNESS_BASELINE_LABEL - ) - native_is_pe_comparison = bool( - baseline_dataset_path - and native_scores is not None - and native_scores.get("metric") == _POLICYENGINE_NATIVE_METRIC - ) - - return { - "schemaVersion": 1, - "artifactId": artifact_root.name, - "artifactDir": str(artifact_root.resolve()), - "evidence": { - "manifest": manifest_source, - "policyengineHarness": harness_source, - "policyengineNativeScores": native_scores_source, - "imputationAblation": imputation_ablation_source, - }, - "program": { - "programId": resolved_program.program_id, - "title": resolved_program.title, - "stageStatuses": { - stage.stage_id: stage.current_status.value for stage in resolved_program.stages - }, - }, - "profileConformance": _build_profile_conformance( - observed_config=config, - expected_config=default_config, - ), - "baselineSlice": { - "baselineDatasetPath": config.get("policyengine_baseline_dataset"), - "targetsDbPath": config.get("policyengine_targets_db"), - "datasetYear": config.get("policyengine_dataset_year"), - "targetPeriod": config.get("policyengine_target_period"), - "targetProfile": config.get("policyengine_target_profile"), - "calibrationTargetProfile": config.get("policyengine_calibration_target_profile"), - "candidateLabel": 
harness.get("candidate_label") if harness is not None else None, - "baselineLabel": harness.get("baseline_label") if harness is not None else None, - "comparisonMetadata": dict(harness.get("metadata", {})) if harness is not None else {}, - }, - "comparison": { - "policyengineHarness": ( - { - "available": True, - "isPolicyEngineComparison": harness_is_pe_comparison, - "period": harness.get("period"), - **{ - key: harness_summary.get(key) - for key in _HARNESS_SUMMARY_KEYS - }, - } - if harness is not None - else {"available": False} - ), - "policyengineNativeScores": ( - { - "available": True, - "isPolicyEngineComparison": native_is_pe_comparison, - "metric": native_scores.get("metric"), - "period": native_scores.get("period"), - **{ - key: native_summary.get(key) - for key in _NATIVE_SUMMARY_KEYS - }, - } - if native_scores is not None - else {"available": False} - ), - "imputationAblation": ( - { - "available": True, - **{ - key: imputation_summary.get(key) - for key in _IMPUTATION_SUMMARY_KEYS - }, - } - if imputation_ablation is not None - else {"available": False} - ), - }, - "verdict": { - "candidateBeatsHarnessMeanAbsRelativeError": ( - _delta_is_better(harness_summary.get("mean_abs_relative_error_delta")) - if harness_is_pe_comparison - else None - ), - "candidateBeatsHarnessCompositeParityLoss": ( - _delta_is_better(harness_summary.get("composite_parity_loss_delta")) - if harness_is_pe_comparison - else None - ), - "candidateBeatsNativeBroadLoss": ( - native_summary.get("candidate_beats_baseline") - if native_is_pe_comparison - else None - ), - "productionImputationVariantIsMaeWinner": ( - imputation_summary.get("production_variant") - == imputation_summary.get("best_mean_weighted_mae_variant") - if imputation_ablation is not None - else None - ), - "productionImputationVariantIsSupportWinner": ( - imputation_summary.get("production_variant") - == imputation_summary.get("best_mean_support_f1_variant") - if imputation_ablation is not None - else None - ), - 
"hasRealPolicyEngineComparison": harness_is_pe_comparison - or native_is_pe_comparison, - "hasImputationAblation": imputation_ablation is not None, - }, - } - - -def write_policyengine_us_data_rebuild_parity_artifact( - artifact_dir: str | Path, - output_path: str | Path | None = None, - *, - program: PEUSDataRebuildProgram | None = None, - manifest_payload: dict[str, Any] | None = None, - harness_payload: dict[str, Any] | None = None, - native_scores_payload: dict[str, Any] | None = None, - imputation_ablation_payload: dict[str, Any] | None = None, -) -> Path: - """Write the PE rebuild parity sidecar for one saved artifact bundle.""" - - artifact_root = Path(artifact_dir) - destination = ( - Path(output_path) - if output_path is not None - else artifact_root / "pe_us_data_rebuild_parity.json" - ) - payload = build_policyengine_us_data_rebuild_parity_artifact( - artifact_root, - program=program, - manifest_payload=manifest_payload, - harness_payload=harness_payload, - native_scores_payload=native_scores_payload, - imputation_ablation_payload=imputation_ablation_payload, - ) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def _load_optional_json(path: Path) -> dict[str, Any] | None: - if not path.exists(): - return None - return json.loads(path.read_text()) - - -def _resolve_stage_artifact_path( - artifact_root: Path, - manifest: dict[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> Path: - return resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - artifact_key, - stage_id=stage_id, - ) - - -def _build_profile_conformance( - *, - observed_config: dict[str, Any], - expected_config: dict[str, Any], -) -> dict[str, Any]: - differing_keys = [] - matching_keys = [] - observed_only_keys = [] - for key in sorted(set(expected_config) | set(observed_config)): - if key in _PROFILE_CONTEXT_KEYS: - continue - if key not in expected_config: - observed_only_keys.append( - { - "key": key, 
- "expected": None, - "observed": observed_config.get(key), - } - ) - continue - observed = observed_config.get(key) - expected = expected_config.get(key) - if observed == expected: - matching_keys.append(key) - else: - differing_keys.append( - { - "key": key, - "expected": expected, - "observed": observed, - } - ) - differing_keys.extend(observed_only_keys) - return { - "exactMatch": not differing_keys, - "matchingKeyCount": len(matching_keys), - "differingKeyCount": len(differing_keys), - "differingKeys": differing_keys, - } - - -def _delta_is_better(value: Any) -> bool | None: - if value is None: - return None - try: - return float(value) < 0.0 - except (TypeError, ValueError): - return None - - -def _normalize_observed_config(observed_config: dict[str, Any]) -> dict[str, Any]: - from microplex_us.pipelines.us import USMicroplexBuildConfig - - normalized = USMicroplexBuildConfig().to_dict() - normalized.update(observed_config) - return normalized - - -def _resolve_payload_source(path: Path, *, override_supplied: bool) -> dict[str, Any]: - return { - "source": "in_memory_override" if override_supplied else "artifact_bundle", - "file": path.name, - "exists": path.exists(), - } diff --git a/src/microplex_us/pipelines/pe_us_dataset_readiness.py b/src/microplex_us/pipelines/pe_us_dataset_readiness.py deleted file mode 100644 index 68118466..00000000 --- a/src/microplex_us/pipelines/pe_us_dataset_readiness.py +++ /dev/null @@ -1,512 +0,0 @@ -"""Lightweight readiness audit for exported PolicyEngine-US datasets.""" - -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any - -import h5py -import numpy as np - -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) - -DEFAULT_PERIOD = 2024 -DEFAULT_REQUIRED_VARIABLES: dict[str, str] = { - "household_id": "household", - "household_weight": "household", - "person_id": "person", - "person_household_id": "person", - 
"tax_unit_id": "tax_unit", - "person_tax_unit_id": "person", - "spm_unit_id": "spm_unit", - "person_spm_unit_id": "person", - "state_fips": "household", - "county_fips": "household", - "congressional_district_geoid": "household", - "spm_unit_spm_threshold": "spm_unit", - "spm_unit_tenure_type": "spm_unit", -} -DEFAULT_EXPECTED_MATERIALIZED_VARIABLES = ( - "income_tax", - "income_tax_positive", - "eitc", - "ctc", - "refundable_ctc", - "non_refundable_ctc", - "snap", - "ssi", - "tanf", - "medicaid", - "aca_ptc", -) -DEFAULT_EXPECTED_SPINES = ("cps_asec", "acs_pums") - - -def build_policyengine_us_dataset_readiness_audit( - path: str | Path, - *, - period: int | str = DEFAULT_PERIOD, - expected_materialized_variables: tuple[str, ...] = DEFAULT_EXPECTED_MATERIALIZED_VARIABLES, - required_variables: dict[str, str] | None = None, - expected_spines: tuple[str, ...] = DEFAULT_EXPECTED_SPINES, - minimum_nonmissing_share: float = 0.999, -) -> dict[str, Any]: - """Inspect a saved artifact bundle or ``policyengine_us.h5`` export. - - This audit intentionally avoids running PolicyEngine. It only checks that the - exported H5 has the structural, geography, SPM-threshold, and materialized - policy-output arrays we expect before expensive native scoring starts. 
- """ - - input_path = Path(path).expanduser() - artifact_dir = input_path if input_path.is_dir() else None - dataset_path = _resolve_dataset_path(input_path) - manifest = _load_optional_json(artifact_dir / "manifest.json") if artifact_dir else None - source_spine_composition = ( - _load_optional_json(artifact_dir / "source_spine_composition.json") - if artifact_dir - else None - ) - period_key = str(period) - required = dict(required_variables or DEFAULT_REQUIRED_VARIABLES) - expected_variables = tuple(dict.fromkeys(expected_materialized_variables)) - issues: list[dict[str, Any]] = [] - - with h5py.File(dataset_path, "r") as handle: - entity_counts = _entity_counts(handle, period_key) - variable_summaries = { - variable: _variable_summary( - handle, - variable, - period_key=period_key, - entity_counts=entity_counts, - preferred_entity=required.get(variable), - ) - for variable in sorted(set(required) | set(expected_variables)) - } - - for variable, expected_entity in required.items(): - summary = variable_summaries[variable] - _append_variable_presence_issues( - issues, - variable=variable, - summary=summary, - expected_entity=expected_entity, - minimum_nonmissing_share=minimum_nonmissing_share, - required=True, - ) - for variable in expected_variables: - summary = variable_summaries[variable] - _append_variable_presence_issues( - issues, - variable=variable, - summary=summary, - expected_entity=None, - minimum_nonmissing_share=0.0, - required=True, - ) - - _append_source_spine_issues( - issues, - source_spine_composition=source_spine_composition, - expected_spines=expected_spines, - ) - valid = not any(issue["severity"] == "error" for issue in issues) - return { - "schemaVersion": 1, - "valid": valid, - "inputPath": str(input_path), - "artifactDir": str(artifact_dir) if artifact_dir is not None else None, - "datasetPath": str(dataset_path), - "period": int(period) if str(period).isdigit() else str(period), - "entityCounts": entity_counts, - "requiredVariables": 
required, - "expectedMaterializedVariables": list(expected_variables), - "variableSummaries": variable_summaries, - "sourceSpineComposition": _source_spine_summary(source_spine_composition), - "manifestSummary": _manifest_summary(manifest), - "issues": issues, - } - - -def write_policyengine_us_dataset_readiness_audit( - path: str | Path, - output_path: str | Path | None = None, - **kwargs: Any, -) -> Path: - """Write a readiness audit JSON sidecar.""" - - input_path = Path(path).expanduser() - destination = ( - Path(output_path) - if output_path is not None - else ( - input_path / "policyengine_dataset_readiness.json" - if input_path.is_dir() - else input_path.with_name(f"{input_path.stem}_readiness.json") - ) - ) - payload = build_policyengine_us_dataset_readiness_audit(input_path, **kwargs) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def _resolve_dataset_path(path: Path) -> Path: - if path.is_file(): - return path.resolve() - if not path.is_dir(): - raise FileNotFoundError(f"Dataset or artifact directory not found: {path}") - manifest_path = path / "manifest.json" - if manifest_path.exists(): - manifest = json.loads(manifest_path.read_text()) - dataset_name = dict(manifest.get("artifacts", {})).get("policyengine_dataset") - if isinstance(dataset_name, str) and dataset_name: - dataset_path = Path(dataset_name) - if not dataset_path.is_absolute(): - dataset_path = path / dataset_path - if dataset_path.exists(): - return dataset_path.resolve() - dataset_path = resolve_us_stage_artifact_contract_path( - path, - "08_dataset_assembly", - "policyengine_dataset", - ) - if dataset_path.exists(): - return dataset_path.resolve() - raise FileNotFoundError(f"No policyengine_us.h5 export found under {path}") - - -def _load_optional_json(path: Path) -> dict[str, Any] | None: - if not path.exists(): - return None - return json.loads(path.read_text()) - - -def _entity_counts(handle: h5py.File, period_key: str) -> 
dict[str, int | None]: - variables = { - "household": "household_id", - "person": "person_id", - "tax_unit": "tax_unit_id", - "spm_unit": "spm_unit_id", - } - return { - entity: _dataset_length(handle, variable, period_key) - for entity, variable in variables.items() - } - - -def _dataset_length( - handle: h5py.File, - variable: str, - period_key: str, -) -> int | None: - if variable not in handle or period_key not in handle[variable]: - return None - return int(len(handle[variable][period_key])) - - -def _variable_summary( - handle: h5py.File, - variable: str, - *, - period_key: str, - entity_counts: dict[str, int | None], - preferred_entity: str | None = None, -) -> dict[str, Any]: - if variable not in handle: - return {"exists": False, "hasPeriod": False} - group = handle[variable] - if period_key not in group: - return { - "exists": True, - "hasPeriod": False, - "availablePeriods": sorted(str(key) for key in group.keys()), - } - values = np.asarray(group[period_key]) - length = int(values.shape[0]) if values.shape else 1 - profile = _array_profile(values) - return { - "exists": True, - "hasPeriod": True, - "length": length, - "dtype": str(values.dtype), - "entity": _infer_entity( - length, - entity_counts, - preferred_entity=preferred_entity, - ), - **profile, - } - - -def _array_profile(values: np.ndarray) -> dict[str, Any]: - flat = np.ravel(values) - if flat.dtype.kind in {"b", "i", "u", "f"}: - numeric = flat.astype(float, copy=False) - finite = np.isfinite(numeric) - positive = finite & (numeric > 0.0) - nonzero = finite & (numeric != 0.0) - return { - "finiteCount": int(finite.sum()), - "finiteShare": _share(finite.sum(), len(flat)), - "nonmissingCount": int(finite.sum()), - "nonmissingShare": _share(finite.sum(), len(flat)), - "positiveCount": int(positive.sum()), - "positiveShare": _share(positive.sum(), len(flat)), - "nonzeroCount": int(nonzero.sum()), - "nonzeroShare": _share(nonzero.sum(), len(flat)), - } - decoded = _decode_string_array(flat) - 
nonmissing = np.array( - [bool(value.strip()) and value.strip().lower() != "nan" for value in decoded], - dtype=bool, - ) - return { - "nonmissingCount": int(nonmissing.sum()), - "nonmissingShare": _share(nonmissing.sum(), len(flat)), - } - - -def _decode_string_array(values: np.ndarray) -> list[str]: - result: list[str] = [] - for value in values.tolist(): - if isinstance(value, bytes): - result.append(value.decode("utf-8", errors="replace")) - else: - result.append(str(value)) - return result - - -def _share(numerator: int | np.integer, denominator: int) -> float | None: - if denominator == 0: - return None - return float(numerator) / float(denominator) - - -def _infer_entity( - length: int, - entity_counts: dict[str, int | None], - *, - preferred_entity: str | None = None, -) -> str | None: - matches = [ - entity - for entity, count in entity_counts.items() - if count is not None and int(count) == length - ] - if preferred_entity is not None and preferred_entity in matches: - return preferred_entity - if len(matches) == 1: - return matches[0] - if len(matches) > 1: - return "|".join(matches) - return None - - -def _append_variable_presence_issues( - issues: list[dict[str, Any]], - *, - variable: str, - summary: dict[str, Any], - expected_entity: str | None, - minimum_nonmissing_share: float, - required: bool, -) -> None: - severity = "error" if required else "warning" - if not summary.get("exists"): - issues.append( - { - "severity": severity, - "code": "missing_variable", - "variable": variable, - "message": f"Dataset is missing {variable!r}", - } - ) - return - if not summary.get("hasPeriod"): - issues.append( - { - "severity": severity, - "code": "missing_period", - "variable": variable, - "message": f"Dataset variable {variable!r} is missing the requested period", - } - ) - return - entity = summary.get("entity") - if expected_entity is not None and entity != expected_entity: - issues.append( - { - "severity": "error", - "code": "entity_length_mismatch", - 
"variable": variable, - "expectedEntity": expected_entity, - "observedEntity": entity, - "message": ( - f"Variable {variable!r} has length matching {entity!r}, " - f"expected {expected_entity!r}" - ), - } - ) - nonmissing_share = summary.get("nonmissingShare") - if ( - isinstance(nonmissing_share, int | float) - and nonmissing_share < minimum_nonmissing_share - ): - issues.append( - { - "severity": "error" if required else "warning", - "code": "low_nonmissing_share", - "variable": variable, - "nonmissingShare": float(nonmissing_share), - "minimumNonmissingShare": minimum_nonmissing_share, - "message": ( - f"Variable {variable!r} nonmissing share " - f"{nonmissing_share:.4f} is below {minimum_nonmissing_share:.4f}" - ), - } - ) - if variable == "spm_unit_spm_threshold": - positive_share = summary.get("positiveShare") - if isinstance(positive_share, int | float) and positive_share < 0.999: - issues.append( - { - "severity": "error", - "code": "low_positive_spm_threshold_share", - "variable": variable, - "positiveShare": float(positive_share), - "message": "SPM thresholds should be positive for nearly all SPM units", - } - ) - - -def _append_source_spine_issues( - issues: list[dict[str, Any]], - *, - source_spine_composition: dict[str, Any] | None, - expected_spines: tuple[str, ...], -) -> None: - if not expected_spines: - return - if source_spine_composition is None: - issues.append( - { - "severity": "warning", - "code": "missing_source_spine_composition", - "message": "Artifact has no source_spine_composition.json sidecar", - } - ) - return - observed = { - str(group.get("spine")) - for group in source_spine_composition.get("groups", ()) - if group.get("spine") is not None - } - missing = sorted(set(expected_spines) - observed) - if missing: - issues.append( - { - "severity": "error", - "code": "missing_expected_spines", - "missingSpines": missing, - "observedSpines": sorted(observed), - "message": "Source-spine composition is missing expected spines", - } - ) - - 
-def _source_spine_summary(payload: dict[str, Any] | None) -> dict[str, Any] | None: - if payload is None: - return None - return { - "householdCount": payload.get("household_count"), - "nonzeroHouseholdCount": payload.get("nonzero_household_count"), - "totalActiveWeight": payload.get("total_active_weight"), - "effectiveSampleSize": payload.get("effective_sample_size"), - "groups": [ - { - "spine": group.get("spine"), - "householdCount": group.get("household_count"), - "nonzeroHouseholdCount": group.get("nonzero_household_count"), - "totalActiveWeight": group.get("total_active_weight"), - "totalSourceWeight": group.get("total_source_weight"), - } - for group in payload.get("groups", ()) - ], - } - - -def _manifest_summary(payload: dict[str, Any] | None) -> dict[str, Any] | None: - if payload is None: - return None - artifacts = dict(payload.get("artifacts", {})) - return { - "rows": payload.get("rows"), - "weights": payload.get("weights"), - "policyengineDataset": artifacts.get("policyengine_dataset"), - "policyengineNativeScores": artifacts.get("policyengine_native_scores"), - "sourceSpineComposition": artifacts.get("source_spine_composition"), - } - - -def main(argv: list[str] | None = None) -> int: - """CLI entry point for dataset readiness audits.""" - - parser = argparse.ArgumentParser( - description="Audit a saved MicroPlex PE-US H5 export before native scoring.", - ) - parser.add_argument("path", help="Artifact directory or policyengine_us.h5 path") - parser.add_argument("--period", default=DEFAULT_PERIOD) - parser.add_argument("--output-path") - parser.add_argument( - "--expected-materialized-variable", - action="append", - default=None, - help="Calculated PolicyEngine variable expected in the H5. Repeatable.", - ) - parser.add_argument( - "--expected-spine", - action="append", - default=None, - help="Source spine expected in source_spine_composition.json. 
Repeatable.", - ) - args = parser.parse_args(argv) - - output = write_policyengine_us_dataset_readiness_audit( - args.path, - output_path=args.output_path, - period=args.period, - expected_materialized_variables=tuple( - args.expected_materialized_variable - if args.expected_materialized_variable is not None - else DEFAULT_EXPECTED_MATERIALIZED_VARIABLES - ), - expected_spines=tuple( - args.expected_spine - if args.expected_spine is not None - else DEFAULT_EXPECTED_SPINES - ), - ) - payload = json.loads(output.read_text()) - print( - json.dumps( - { - "output": str(output), - "valid": payload["valid"], - "datasetPath": payload["datasetPath"], - "entityCounts": payload["entityCounts"], - "issueCount": len(payload["issues"]), - }, - indent=2, - sort_keys=True, - ) - ) - return 0 if payload["valid"] else 1 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/pe_us_recalibrate_from_checkpoint.py b/src/microplex_us/pipelines/pe_us_recalibrate_from_checkpoint.py deleted file mode 100644 index 9b3f0847..00000000 --- a/src/microplex_us/pipelines/pe_us_recalibrate_from_checkpoint.py +++ /dev/null @@ -1,226 +0,0 @@ -"""Recalibrate a saved US microplex checkpoint with a new calibration config. - -Load a ``post_imputation`` or ``post_microsim`` pipeline checkpoint -previously saved via -``pe_us_data_rebuild_checkpoint --pipeline-checkpoint-save-post-imputation-path`` -(or ``--pipeline-checkpoint-save-post-microsim-path``) and rerun the -calibration stage without repeating the ~11 hours of synthesis + donor -imputation. A ``post_microsim`` checkpoint additionally skips the -microsim materialization step because the materialized vars are -already on the bundle as columns. - -Intended for rapid iteration on calibration backends / target sets / -sparsity schedules: change one flag, run for ~30 min -(``post_imputation``) or ~1–2 min + calibration fit -(``post_microsim``) instead of half a day. 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import sys -from collections.abc import Sequence -from pathlib import Path - -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexPipeline, - recalibrate_policyengine_us_from_checkpoint, -) - - -def _prepare_output_root(output_root: Path) -> Path: - if not output_root.exists(): - raise FileNotFoundError(f"--output-root does not exist: {output_root}") - if not output_root.is_dir(): - raise NotADirectoryError(f"--output-root is not a directory: {output_root}") - if not os.access(output_root, os.W_OK | os.X_OK): - raise PermissionError(f"--output-root is not writable: {output_root}") - return output_root - - -def main(argv: Sequence[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description=( - "Rerun US microplex calibration from a saved checkpoint. Works " - "with both post_imputation (skips ~11 h synthesis) and " - "post_microsim (additionally skips ~30 min microsim) stages." - ), - ) - parser.add_argument( - "--checkpoint-path", - type=Path, - required=True, - help=( - "Path to a directory written by the main pipeline with " - "--pipeline-checkpoint-save-post-imputation-path or " - "--pipeline-checkpoint-save-post-microsim-path." - ), - ) - parser.add_argument( - "--output-root", - type=Path, - required=True, - help="Existing output directory for the recalibrated bundle and summary.", - ) - parser.add_argument( - "--targets-db", - type=Path, - required=True, - help="Path to the PolicyEngine US targets SQLite database.", - ) - parser.add_argument( - "--arch-targets-db", - type=Path, - action="append", - default=[], - help=( - "Path to an Arch target artifact. May be supplied multiple times. " - "Required when --calibration-target-source=arch." 
- ), - ) - parser.add_argument( - "--target-period", - type=int, - default=None, - help="Calendar year for calibration targets (default: config default).", - ) - parser.add_argument( - "--target-profile", - type=str, - default=None, - help="PolicyEngine target profile used for scoring/ledger context.", - ) - parser.add_argument( - "--calibration-target-source", - choices=["policyengine", "arch"], - default=None, - help=( - "Source for calibration targets. Defaults to the build config default " - "unless supplied." - ), - ) - parser.add_argument( - "--calibration-target-profile", - type=str, - default=None, - help="Target profile used to select calibration constraints.", - ) - parser.add_argument( - "--calibration-backend", - type=str, - default="pe_l0", - help="Calibration backend (pe_l0, microcalibrate, hardconcrete, etc.).", - ) - parser.add_argument( - "--calibration-max-iter", - type=int, - default=None, - help="Max iterations / epochs for the calibration solver.", - ) - parser.add_argument( - "--policyengine-materialize-batch-size", - type=int, - default=100_000, - help=( - "Batch size for PE variable materialization (default 100_000; " - "keeps a single Microsimulation under a few GB at 1.5M-household scale)." - ), - ) - parser.add_argument( - "--pipeline-checkpoint-save-post-microsim-path", - type=Path, - default=None, - help=( - "If set, also save a post-microsim checkpoint during this " - "recalibration so the next iteration can skip microsim too." - ), - ) - parser.add_argument( - "--policyengine-dataset-output", - type=Path, - default=None, - help=( - "Optional PolicyEngine-readable H5 output path. When set, the " - "recalibrated entity tables are exported after calibration." 
- ), - ) - args = parser.parse_args(argv) - output_root = _prepare_output_root(args.output_root) - - config_kwargs: dict[str, object] = { - "calibration_backend": args.calibration_backend, - "policyengine_targets_db": args.targets_db, - "policyengine_materialize_batch_size": int( - args.policyengine_materialize_batch_size - ), - } - if args.arch_targets_db: - config_kwargs["arch_targets_db"] = tuple( - str(path) for path in args.arch_targets_db - ) - if args.target_period is not None: - config_kwargs["policyengine_target_period"] = int(args.target_period) - if args.target_profile is not None: - config_kwargs["policyengine_target_profile"] = args.target_profile - if args.calibration_target_source is not None: - config_kwargs["calibration_target_source"] = args.calibration_target_source - if args.calibration_target_profile is not None: - config_kwargs["policyengine_calibration_target_profile"] = ( - args.calibration_target_profile - ) - if args.calibration_max_iter is not None: - config_kwargs["calibration_max_iter"] = int(args.calibration_max_iter) - if args.pipeline_checkpoint_save_post_microsim_path is not None: - config_kwargs["pipeline_checkpoint_save_post_microsim_path"] = ( - args.pipeline_checkpoint_save_post_microsim_path - ) - - config = USMicroplexBuildConfig(**config_kwargs) - result = recalibrate_policyengine_us_from_checkpoint(config, args.checkpoint_path) - - result.calibrated_data.to_parquet(output_root / "calibrated_data.parquet") - result.policyengine_tables.households.to_parquet( - output_root / "households.parquet" - ) - if result.policyengine_tables.persons is not None: - result.policyengine_tables.persons.to_parquet( - output_root / "persons.parquet" - ) - if result.policyengine_tables.tax_units is not None: - result.policyengine_tables.tax_units.to_parquet( - output_root / "tax_units.parquet" - ) - if result.policyengine_tables.spm_units is not None: - result.policyengine_tables.spm_units.to_parquet( - output_root / "spm_units.parquet" - ) - if 
result.policyengine_tables.families is not None: - result.policyengine_tables.families.to_parquet( - output_root / "families.parquet" - ) - if result.policyengine_tables.marital_units is not None: - result.policyengine_tables.marital_units.to_parquet( - output_root / "marital_units.parquet" - ) - (output_root / "calibration_summary.json").write_text( - json.dumps(result.calibration_summary, indent=2, default=str) - ) - if args.policyengine_dataset_output is not None: - USMicroplexPipeline(config).export_policyengine_dataset( - result, - args.policyengine_dataset_output, - period=args.target_period, - ) - print( - f"Recalibrated from {args.checkpoint_path} → {output_root} " - f"(stage={result.loaded_stage}, " - f"rows={len(result.calibrated_data)})" - ) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/microplex_us/pipelines/performance.py b/src/microplex_us/pipelines/performance.py deleted file mode 100644 index 9dc21753..00000000 --- a/src/microplex_us/pipelines/performance.py +++ /dev/null @@ -1,1719 +0,0 @@ -"""Performance harness for iterative US microplex optimization.""" - -from __future__ import annotations - -import json -import shutil -import subprocess -from collections.abc import Iterable -from dataclasses import asdict, dataclass, field, replace -from pathlib import Path -from tempfile import TemporaryDirectory -from time import perf_counter - -import h5py -import numpy as np -import pandas as pd -from microplex.core import EntityType, ObservationFrame, SourceProvider, SourceQuery -from microplex.fusion import FusionPlan -from microplex.targets import TargetSet - -from microplex_us.pipelines.pe_native_optimization import ( - optimize_policyengine_us_native_loss_dataset, - rewrite_policyengine_us_dataset_weights, -) -from microplex_us.pipelines.pe_native_scores import ( - _ENHANCED_CPS_BAD_TARGETS, - build_policyengine_us_data_subprocess_env, - compare_us_pe_native_target_deltas, - compute_batch_us_pe_native_scores, - 
compute_us_pe_native_scores, - compute_us_pe_native_support_audit, - resolve_policyengine_us_data_repo_root, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - USMicroplexTargets, -) -from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessRun, - default_policyengine_us_db_harness_slices, - evaluate_policyengine_us_harness, - filter_nonempty_policyengine_us_harness_slices, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - _infer_policyengine_array_entity, - _load_policyengine_us_period_arrays, - _resolve_policyengine_us_tax_benefit_system, - write_policyengine_us_time_period_dataset, -) - -CacheValue = object -BuildConfigCacheKey = tuple[tuple[str, CacheValue], ...] -SourceQueryCacheKey = tuple[ - str, - tuple[tuple[str, CacheValue], ...], - int | str | None, - tuple[tuple[str, CacheValue], ...], -] -PreCalibrationCacheKey = tuple[tuple[SourceQueryCacheKey, ...], BuildConfigCacheKey] -CalibrationCacheKey = tuple[PreCalibrationCacheKey, BuildConfigCacheKey] - -PRECALIBRATION_STAGE_NAMES = ( - "prepare_source_inputs", - "prepare_seed_data", - "integrate_donor_sources", - "build_targets", - "resolve_synthesis_variables", - "synthesize", - "ensure_target_support", - "build_policyengine_tables", -) - -_MATCHED_BASELINE_REWEIGHT_SCRIPT = """ -import json -import sys -from pathlib import Path - -import numpy as np -from policyengine_core.data import Dataset - -REPO_ROOT = sys.argv[1] -if REPO_ROOT not in sys.path: - sys.path.insert(0, REPO_ROOT) - -from policyengine_us import Microsimulation -from policyengine_us_data.datasets.cps.enhanced_cps import reweight -from policyengine_us_data.utils.loss import build_loss_matrix - -BAD_TARGETS = tuple(json.loads(sys.argv[2])) -PERIOD = int(sys.argv[3]) -DATASET_PATH = sys.argv[4] -OUTPUT_WEIGHTS = Path(sys.argv[5]) -EPOCHS = int(sys.argv[6]) 
-L0_LAMBDA = float(sys.argv[7]) -SEED = int(sys.argv[8]) - - -def dataset_from_path(dataset_path: str, dataset_name: str): - class LocalDataset(Dataset): - name = dataset_name - label = dataset_name - file_path = dataset_path - data_format = Dataset.TIME_PERIOD_ARRAYS - time_period = PERIOD - - return LocalDataset - - -dataset_cls = dataset_from_path( - DATASET_PATH, - Path(DATASET_PATH).stem.replace("-", "_"), -) -sim = Microsimulation(dataset=dataset_cls) -original_weights = sim.calculate( - "household_weight", - map_to="household", - period=PERIOD, -).values.astype(np.float64) -rng = np.random.default_rng(SEED) -original_weights = original_weights + rng.normal(1.0, 0.1, len(original_weights)) -loss_matrix, targets_array = build_loss_matrix(dataset_cls, PERIOD) -zero_mask = np.isclose(targets_array, 0.0, atol=0.1) -bad_mask = loss_matrix.columns.isin(BAD_TARGETS) -keep_mask = ~(zero_mask | bad_mask) -loss_matrix_clean = loss_matrix.loc[:, keep_mask].astype(np.float32) -targets_array_clean = targets_array[keep_mask] -optimized_weights = reweight( - original_weights, - loss_matrix_clean, - targets_array_clean, - log_path=None, - epochs=EPOCHS, - l0_lambda=L0_LAMBDA, - seed=SEED, -) -np.save(OUTPUT_WEIGHTS, optimized_weights.astype(np.float32)) -""".strip() - - -def default_fast_calibration_target_variables( - variables: tuple[str, ...], -) -> tuple[str, ...]: - """Drop redundant targets when a downstream PE tax aggregate already subsumes them.""" - filtered = list(variables) - if "income_tax" in filtered and "adjusted_gross_income" in filtered: - filtered = [ - variable for variable in filtered if variable != "adjusted_gross_income" - ] - return tuple(filtered) or variables - - -@dataclass(frozen=True) -class USMicroplexPerformanceHarnessConfig: - """Configuration for a repeatable local optimization harness.""" - - sample_n: int | None = 100 - n_synthetic: int = 100 - random_seed: int = 42 - targets_db: str | Path | None = None - baseline_dataset: str | Path | None 
= None - target_period: int = 2024 - target_variables: tuple[str, ...] | None = None - target_domains: tuple[str, ...] | None = None - target_geo_levels: tuple[str, ...] | None = None - target_profile: str | None = None - calibration_target_variables: tuple[str, ...] | None = None - calibration_target_domains: tuple[str, ...] | None = None - calibration_target_geo_levels: tuple[str, ...] | None = None - calibration_target_profile: str | None = None - build_config: USMicroplexBuildConfig | None = None - evaluate_parity: bool = True - evaluate_pe_native_loss: bool = False - evaluate_matched_pe_native_loss: bool = False - reweight_matched_pe_native_loss: bool = False - optimize_pe_native_loss: bool = False - pe_native_household_budget: int | None = None - pe_native_optimizer_max_iter: int = 200 - pe_native_optimizer_l2_penalty: float = 0.0 - pe_native_optimizer_tol: float = 1e-8 - pe_native_score_consistency_tol: float = 1e-6 - pe_native_target_delta_top_k: int = 25 - matched_baseline_household_count: int | None = None - matched_baseline_random_seed: int = 42 - matched_baseline_reweight_epochs: int = 250 - matched_baseline_reweight_l0_lambda: float = 2.6445e-07 - matched_baseline_reweight_seed: int = 1456 - policyengine_us_data_repo: str | Path | None = None - strict_materialization: bool = True - fast_inner_loop_calibration: bool = False - output_json_path: str | Path | None = None - output_policyengine_dataset_path: str | Path | None = None - output_pe_native_target_delta_path: str | Path | None = None - output_pe_native_support_audit_path: str | Path | None = None - output_matched_baseline_dataset_path: str | Path | None = None - - -@dataclass(frozen=True) -class USMicroplexPerformanceHarnessResult: - """Stage timings plus parity metrics for one performance-harness run.""" - - config: USMicroplexPerformanceHarnessConfig - build_config: USMicroplexBuildConfig - build_result: USMicroplexBuildResult - source_names: tuple[str, ...] 
- stage_timings: dict[str, float] - total_seconds: float - parity_run: PolicyEngineUSHarnessRun | None = None - pe_native_scores: dict[str, object] | None = None - matched_pe_native_scores: dict[str, object] | None = None - pe_native_target_deltas: dict[str, object] | None = None - pe_native_support_audit: dict[str, object] | None = None - policyengine_dataset_path: str | None = None - matched_baseline_dataset_path: str | None = None - - def _parity_run_attr(self, name: str) -> float | None: - if self.parity_run is None: - return None - return getattr(self.parity_run, name) - - def _pe_native_score_attr(self, name: str) -> float | bool | int | None: - if self.pe_native_scores is None: - return None - summary = self.pe_native_scores.get("summary") - if not isinstance(summary, dict): - return None - return summary.get(name) - - @property - def candidate_composite_parity_loss(self) -> float | None: - return self._parity_run_attr("candidate_composite_parity_loss") - - @property - def baseline_composite_parity_loss(self) -> float | None: - return self._parity_run_attr("baseline_composite_parity_loss") - - @property - def target_win_rate(self) -> float | None: - return self._parity_run_attr("target_win_rate") - - @property - def slice_win_rate(self) -> float | None: - return self._parity_run_attr("slice_win_rate") - - @property - def candidate_enhanced_cps_native_loss(self) -> float | None: - value = self._pe_native_score_attr("candidate_enhanced_cps_native_loss") - return float(value) if value is not None else None - - @property - def baseline_enhanced_cps_native_loss(self) -> float | None: - value = self._pe_native_score_attr("baseline_enhanced_cps_native_loss") - return float(value) if value is not None else None - - @property - def enhanced_cps_native_loss_delta(self) -> float | None: - value = self._pe_native_score_attr("enhanced_cps_native_loss_delta") - return float(value) if value is not None else None - - def to_dict(self) -> dict[str, object]: - payload: 
dict[str, object] = { - "config": _json_compatible_value(asdict(self.config)), - "build_config": self.build_config.to_dict(), - "source_names": list(self.source_names), - "stage_timings": dict(self.stage_timings), - "total_seconds": float(self.total_seconds), - "calibration_summary": _json_compatible_value( - self.build_result.calibration_summary - ), - "policyengine_dataset_path": self.policyengine_dataset_path, - } - if self.parity_run is not None: - payload["parity_run"] = self.parity_run.to_dict() - if self.pe_native_scores is not None: - payload["pe_native_scores"] = _json_compatible_value( - self.pe_native_scores - ) - if self.matched_pe_native_scores is not None: - payload["matched_pe_native_scores"] = _json_compatible_value( - self.matched_pe_native_scores - ) - if self.pe_native_target_deltas is not None: - payload["pe_native_target_deltas"] = _json_compatible_value( - self.pe_native_target_deltas - ) - if self.pe_native_support_audit is not None: - payload["pe_native_support_audit"] = _json_compatible_value( - self.pe_native_support_audit - ) - if self.matched_baseline_dataset_path is not None: - payload["matched_baseline_dataset_path"] = self.matched_baseline_dataset_path - return payload - - def save(self, path: str | Path) -> Path: - destination = Path(path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True)) - return destination - - -@dataclass(frozen=True) -class USMicroplexPerformanceHarnessRequest: - """One performance-harness request for shared-session batch execution.""" - - providers: tuple[SourceProvider, ...] 
- config: USMicroplexPerformanceHarnessConfig - queries: dict[str, SourceQuery] | None = None - - -@dataclass -class USMicroplexPreCalibrationCacheEntry: - """Reusable upstream build state prior to PE-US calibration.""" - - seed_data: pd.DataFrame - synthetic_data: pd.DataFrame - synthetic_tables: PolicyEngineUSEntityTableBundle - targets: USMicroplexTargets - synthesis_metadata: dict[str, object] - synthesizer: object | None - source_frame: ObservationFrame - source_frames: tuple[ObservationFrame, ...] - fusion_plan: FusionPlan - - -@dataclass -class USMicroplexCalibrationCacheEntry: - """Reusable PE-US calibration result for a fixed synthetic table bundle.""" - - policyengine_tables: PolicyEngineUSEntityTableBundle - calibrated_data: pd.DataFrame - calibration_summary: dict[str, object] - - -def _stage(stage_timings: dict[str, float], name: str) -> float: - start = perf_counter() - stage_timings.setdefault(name, 0.0) - return start - - -def _finish_stage(stage_timings: dict[str, float], name: str, start: float) -> None: - stage_timings[name] += perf_counter() - start - - -def _copy_output_file(source: str | Path, destination: str | Path) -> str: - source_path = Path(source).expanduser().resolve() - destination_path = Path(destination).expanduser().resolve() - destination_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(source_path, destination_path) - return str(destination_path) - - -def _normalize_source_query_value(value: object) -> object: - if isinstance(value, Path): - return str(value) - if isinstance(value, (str, int, float, bool)) or value is None: - return value - if isinstance(value, dict): - return tuple( - sorted( - (str(key), _normalize_source_query_value(item)) - for key, item in value.items() - ) - ) - if isinstance(value, (list, tuple)): - return tuple(_normalize_source_query_value(item) for item in value) - if callable(value): - return f"{value.__module__}.{value.__qualname__}" - return str(value) - - -def 
_json_compatible_value(value: object) -> object: - if isinstance(value, Path): - return str(value) - if isinstance(value, dict): - return { - str(key): _json_compatible_value(item) - for key, item in value.items() - } - if isinstance(value, (list, tuple)): - return [_json_compatible_value(item) for item in value] - if hasattr(value, "item") and callable(getattr(value, "item")): - try: - return _json_compatible_value(value.item()) - except (TypeError, ValueError): - pass - if isinstance(value, (str, int, float, bool)) or value is None: - return value - return str(value) - - -def _sorted_normalized_items( - items: Iterable[tuple[str, object]], -) -> tuple[tuple[str, object], ...]: - return tuple( - sorted( - (str(key), _normalize_source_query_value(value)) - for key, value in items - ) - ) - - -_POLICYENGINE_SELECTION_CACHE_FIELDS = frozenset( - { - "policyengine_selection_backend", - "policyengine_selection_household_budget", - "policyengine_selection_max_iter", - "policyengine_selection_tol", - "policyengine_selection_l2_penalty", - } -) - -PRECALIBRATION_EXCLUDED_BUILD_CONFIG_FIELDS = frozenset( - { - "calibration_backend", - "calibration_tol", - "calibration_max_iter", - "target_sparsity", - "device", - "policyengine_baseline_dataset", - "policyengine_targets_db", - "policyengine_target_period", - "policyengine_target_variables", - "policyengine_target_domains", - "policyengine_target_geo_levels", - "policyengine_calibration_target_variables", - "policyengine_calibration_target_domains", - "policyengine_calibration_target_geo_levels", - "policyengine_target_reform_id", - "policyengine_simulation_cls", - } -) | _POLICYENGINE_SELECTION_CACHE_FIELDS - -CALIBRATION_INCLUDED_BUILD_CONFIG_FIELDS = frozenset( - { - "calibration_backend", - "calibration_tol", - "calibration_max_iter", - "target_sparsity", - "device", - "policyengine_targets_db", - "policyengine_target_period", - "policyengine_target_variables", - "policyengine_target_domains", - 
"policyengine_target_geo_levels", - "policyengine_calibration_target_variables", - "policyengine_calibration_target_domains", - "policyengine_calibration_target_geo_levels", - "policyengine_target_reform_id", - "policyengine_simulation_cls", - "policyengine_dataset_year", - } -) | _POLICYENGINE_SELECTION_CACHE_FIELDS - - -def _provider_cache_identity(provider: SourceProvider) -> tuple[str, tuple[tuple[str, object], ...]]: - provider_type = f"{provider.__class__.__module__}.{provider.__class__.__qualname__}" - public_items = _sorted_normalized_items( - (key, value) - for key, value in vars(provider).items() - if not key.startswith("_") and not callable(value) - ) - return provider_type, public_items - - -def _build_config_key( - build_config: USMicroplexBuildConfig, - *, - included_fields: frozenset[str] | None = None, - excluded_fields: frozenset[str] = frozenset(), -) -> tuple[tuple[str, object], ...]: - return _sorted_normalized_items( - (key, value) - for key, value in build_config.to_dict().items() - if (included_fields is None or key in included_fields) - and key not in excluded_fields - ) - - -def _precalibration_build_config_key( - build_config: USMicroplexBuildConfig, -) -> BuildConfigCacheKey: - return _build_config_key( - build_config, - excluded_fields=PRECALIBRATION_EXCLUDED_BUILD_CONFIG_FIELDS, - ) - - -def _calibration_build_config_key( - build_config: USMicroplexBuildConfig, -) -> BuildConfigCacheKey: - return _build_config_key( - build_config, - included_fields=CALIBRATION_INCLUDED_BUILD_CONFIG_FIELDS, - ) - - -def _source_query_cache_key( - provider: SourceProvider, - query: SourceQuery | None, -) -> SourceQueryCacheKey: - query = query or SourceQuery() - provider_type, provider_items = _provider_cache_identity(provider) - return ( - provider_type, - provider_items, - query.period, - _sorted_normalized_items(query.provider_filters.items()), - ) - - -def _copy_optional_table(table: pd.DataFrame | None) -> pd.DataFrame | None: - if table is None: - 
return None - return table.copy() - - -def _clone_policyengine_us_tables( - tables: PolicyEngineUSEntityTableBundle, -) -> PolicyEngineUSEntityTableBundle: - if not isinstance(tables, PolicyEngineUSEntityTableBundle): - return tables - return PolicyEngineUSEntityTableBundle( - households=tables.households.copy(), - persons=_copy_optional_table(tables.persons), - tax_units=_copy_optional_table(tables.tax_units), - spm_units=_copy_optional_table(tables.spm_units), - families=_copy_optional_table(tables.families), - marital_units=_copy_optional_table(tables.marital_units), - ) - - -def _clone_calibration_cache_entry( - entry: USMicroplexCalibrationCacheEntry, -) -> USMicroplexCalibrationCacheEntry: - return USMicroplexCalibrationCacheEntry( - policyengine_tables=_clone_policyengine_us_tables(entry.policyengine_tables), - calibrated_data=entry.calibrated_data.copy(deep=True), - calibration_summary=dict(entry.calibration_summary), - ) - - -def _filter_policyengine_tables_to_households( - tables: PolicyEngineUSEntityTableBundle, - household_ids: pd.Index, -) -> PolicyEngineUSEntityTableBundle: - household_id_set = set(household_ids.tolist()) - - def _filter_table(table: pd.DataFrame | None) -> pd.DataFrame | None: - if table is None: - return None - if "household_id" not in table.columns: - return table.copy() - return table.loc[table["household_id"].isin(household_id_set)].copy() - - households = tables.households.loc[ - tables.households["household_id"].isin(household_id_set) - ].copy() - return PolicyEngineUSEntityTableBundle( - households=households, - persons=_filter_table(tables.persons), - tax_units=_filter_table(tables.tax_units), - spm_units=_filter_table(tables.spm_units), - families=_filter_table(tables.families), - marital_units=_filter_table(tables.marital_units), - ) - - -def _write_matched_policyengine_us_baseline_dataset( - baseline_dataset_path: str | Path, - output_dataset_path: str | Path, - *, - period: int, - household_count: int, - random_seed: int, 
- sample_method: str = "uniform", -) -> str: - period_key = str(period) - arrays = _load_policyengine_us_period_arrays( - baseline_dataset_path, - period_key=period_key, - variables=None, - ) - required_structural = { - "household_id", - "household_weight", - "person_id", - "person_household_id", - } - missing = sorted(required_structural - set(arrays)) - if missing: - raise ValueError( - "matched baseline dataset is missing required structural arrays: " - + ", ".join(missing) - ) - - household_ids = np.asarray(arrays["household_id"]) - household_weights = np.asarray(arrays["household_weight"], dtype=np.float64) - if household_count <= 0: - raise ValueError("matched baseline household_count must be positive") - if household_count > len(household_ids): - raise ValueError( - "matched baseline household_count cannot exceed baseline household rows" - ) - - resolved_baseline_path = Path(baseline_dataset_path).expanduser().resolve() - resolved_output_path = Path(output_dataset_path).expanduser().resolve() - if household_count == len(household_ids): - if resolved_baseline_path != resolved_output_path: - resolved_output_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(resolved_baseline_path, resolved_output_path) - return str(resolved_output_path) - - sampled_household_ids = _sample_matched_household_ids( - household_ids, - household_weights, - household_count=household_count, - random_seed=random_seed, - sample_method=sample_method, - ) - household_mask = np.isin(household_ids, sampled_household_ids) - person_mask = np.isin( - np.asarray(arrays["person_household_id"]), - sampled_household_ids, - ) - - entity_masks: dict[object, np.ndarray] = { - EntityType.HOUSEHOLD: household_mask, - EntityType.PERSON: person_mask, - } - entity_lengths: dict[EntityType, int] = { - EntityType.HOUSEHOLD: len(household_ids), - EntityType.PERSON: len(np.asarray(arrays["person_id"])), - } - for entity_type, id_name, person_membership_name in ( - (EntityType.TAX_UNIT, 
"tax_unit_id", "person_tax_unit_id"), - (EntityType.SPM_UNIT, "spm_unit_id", "person_spm_unit_id"), - (EntityType.FAMILY, "family_id", "person_family_id"), - ("marital_unit", "marital_unit_id", "person_marital_unit_id"), - ): - entity_ids = arrays.get(id_name) - person_entity_ids = arrays.get(person_membership_name) - if entity_ids is None or person_entity_ids is None: - continue - entity_ids = np.asarray(entity_ids) - person_entity_ids = np.asarray(person_entity_ids) - selected_entity_ids = np.unique(person_entity_ids[person_mask]) - entity_masks[entity_type] = np.isin(entity_ids, selected_entity_ids) - entity_lengths[entity_type] = len(entity_ids) - - original_weight_sum = float(household_weights.sum()) - sampled_weight_sum = float(household_weights[household_mask].sum()) - if sampled_weight_sum <= 0.0: - raise ValueError("matched baseline sample produced nonpositive household weight sum") - weight_scale = original_weight_sum / sampled_weight_sum - - structural_entities: dict[str, object] = { - "household_id": EntityType.HOUSEHOLD, - "household_weight": EntityType.HOUSEHOLD, - "person_id": EntityType.PERSON, - "person_household_id": EntityType.PERSON, - "person_weight": EntityType.PERSON, - "tax_unit_id": EntityType.TAX_UNIT, - "person_tax_unit_id": EntityType.PERSON, - "tax_unit_weight": EntityType.TAX_UNIT, - "spm_unit_id": EntityType.SPM_UNIT, - "person_spm_unit_id": EntityType.PERSON, - "spm_unit_weight": EntityType.SPM_UNIT, - "family_id": EntityType.FAMILY, - "person_family_id": EntityType.PERSON, - "family_weight": EntityType.FAMILY, - "marital_unit_id": "marital_unit", - "person_marital_unit_id": EntityType.PERSON, - "marital_unit_weight": "marital_unit", - } - scaled_weight_variables = { - "household_weight", - "person_weight", - "tax_unit_weight", - "spm_unit_weight", - "family_weight", - "marital_unit_weight", - } - try: - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system(None) - except (ImportError, ValueError): - tax_benefit_system = 
None - - sampled_arrays: dict[str, dict[str, np.ndarray]] = {} - with h5py.File(resolved_baseline_path, "r") as handle: - for variable_name, group in handle.items(): - if not isinstance(group, h5py.Group): - continue - period_values = { - stored_period: np.asarray(dataset) - for stored_period, dataset in group.items() - } - if not period_values: - continue - - representative_values = next(iter(period_values.values())) - entity = structural_entities.get(variable_name) - if entity is None: - entity = _infer_policyengine_array_entity( - variable_name=variable_name, - values=representative_values, - entity_lengths=entity_lengths, - tax_benefit_system=tax_benefit_system, - ) - mask = entity_masks.get(entity) - if mask is None: - continue - - sampled_periods: dict[str, np.ndarray] = {} - for stored_period, values in period_values.items(): - sampled_values = values[mask] - if variable_name in scaled_weight_variables: - sampled_values = sampled_values.astype(np.float64) * weight_scale - sampled_periods[stored_period] = sampled_values - sampled_arrays[variable_name] = sampled_periods - - return str( - write_policyengine_us_time_period_dataset( - sampled_arrays, - output_dataset_path, - ).resolve() - ) - - -def _sample_matched_household_ids( - household_ids: np.ndarray, - household_weights: np.ndarray, - *, - household_count: int, - random_seed: int, - sample_method: str, -) -> np.ndarray: - """Choose household IDs for a matched-size PE dataset copy.""" - - method = sample_method.lower().replace("-", "_") - if method == "uniform": - return ( - pd.Series(household_ids) - .sample( - n=household_count, - replace=False, - random_state=random_seed, - ) - .to_numpy() - ) - if method in {"weight_proportional", "pps"}: - positive_weights = np.clip(household_weights.astype(np.float64), 0.0, None) - weight_sum = float(positive_weights.sum()) - if weight_sum <= 0.0: - raise ValueError( - "weight-proportional matched sample requires positive household weights" - ) - rng = 
np.random.default_rng(random_seed) - return rng.choice( - household_ids, - size=household_count, - replace=False, - p=positive_weights / weight_sum, - ) - if method == "largest_weight": - frame = pd.DataFrame( - {"household_id": household_ids, "household_weight": household_weights} - ) - return ( - frame.sort_values( - ["household_weight", "household_id"], - ascending=[False, True], - kind="mergesort", - )["household_id"] - .head(household_count) - .to_numpy() - ) - raise ValueError( - "matched sample_method must be one of: uniform, weight_proportional, " - "largest_weight" - ) - - -def _reweight_matched_policyengine_us_baseline_dataset( - input_dataset_path: str | Path, - output_dataset_path: str | Path, - *, - period: int, - epochs: int, - l0_lambda: float, - seed: int, - policyengine_us_data_repo: str | Path | None = None, -) -> str: - resolved_repo = resolve_policyengine_us_data_repo_root(policyengine_us_data_repo) - env = build_policyengine_us_data_subprocess_env(resolved_repo) - with TemporaryDirectory(prefix="microplex-us-matched-baseline-reweight-") as temp_dir: - weights_path = Path(temp_dir) / "matched_baseline_weights.npy" - completed = subprocess.run( - [ - "uv", - "run", - "--project", - str(resolved_repo), - "python", - "-c", - _MATCHED_BASELINE_REWEIGHT_SCRIPT, - str(resolved_repo), - json.dumps(_ENHANCED_CPS_BAD_TARGETS), - str(int(period)), - str(Path(input_dataset_path).expanduser().resolve()), - str(weights_path), - str(int(epochs)), - str(float(l0_lambda)), - str(int(seed)), - ], - cwd=resolved_repo, - env=env, - capture_output=True, - text=True, - check=False, - ) - if completed.returncode != 0: - stderr = completed.stderr.strip() - stdout = completed.stdout.strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise RuntimeError(f"Matched baseline reweighting failed: {detail}") - optimized_weights = np.load(weights_path) - rewritten = rewrite_policyengine_us_dataset_weights( - input_dataset_path=input_dataset_path, - 
output_dataset_path=output_dataset_path, - household_weights=optimized_weights, - period=period, - ) - return str(rewritten.resolve()) - - -def _default_provider_query( - config: USMicroplexPerformanceHarnessConfig, -) -> SourceQuery: - return SourceQuery( - provider_filters={ - "sample_n": config.sample_n, - "random_seed": config.random_seed, - } - ) - - -def _load_frames( - providers: list[SourceProvider], - provider_queries: dict[str, SourceQuery], - *, - frame_cache: dict[SourceQueryCacheKey, ObservationFrame] | None, -) -> tuple[list[ObservationFrame], list[SourceQueryCacheKey]]: - frames: list[ObservationFrame] = [] - frame_keys: list[SourceQueryCacheKey] = [] - for provider in providers: - provider_query = provider_queries.get(provider.descriptor.name) - cache_key = _source_query_cache_key(provider, provider_query) - frame_keys.append(cache_key) - if frame_cache is not None and cache_key in frame_cache: - frames.append(frame_cache[cache_key]) - continue - frame = provider.load_frame(provider_query) - if frame_cache is not None: - frame_cache[cache_key] = frame - frames.append(frame) - return frames, frame_keys - - -def _mark_skipped_stages(stage_timings: dict[str, float], stage_names: tuple[str, ...]) -> None: - for stage_name in stage_names: - stage_timings.setdefault(stage_name, 0.0) - - -def _resolve_build_config(config: USMicroplexPerformanceHarnessConfig) -> USMicroplexBuildConfig: - default_base_kwargs: dict[str, object] = { - "synthesis_backend": "bootstrap", - "calibration_backend": "entropy", - } - if config.target_profile is None and config.calibration_target_profile is None: - default_base_kwargs.update( - { - "policyengine_target_variables": ( - "adjusted_gross_income", - "income_tax", - "dividend_income", - "taxable_interest_income", - "self_employment_income", - ), - "policyengine_target_geo_levels": ("national",), - } - ) - base = config.build_config or USMicroplexBuildConfig(**default_base_kwargs) - target_variables = ( - 
base.policyengine_target_variables - if config.target_variables is None - else config.target_variables - ) - target_domains = ( - base.policyengine_target_domains - if config.target_domains is None - else config.target_domains - ) - target_geo_levels = ( - base.policyengine_target_geo_levels - if config.target_geo_levels is None - else config.target_geo_levels - ) - calibration_target_variables = ( - config.calibration_target_variables - if config.calibration_target_variables is not None - else base.policyengine_calibration_target_variables - or ( - default_fast_calibration_target_variables(target_variables) - if config.fast_inner_loop_calibration and target_variables - else target_variables - ) - ) - calibration_target_domains = ( - config.calibration_target_domains - if config.calibration_target_domains is not None - else base.policyengine_calibration_target_domains or target_domains - ) - calibration_target_geo_levels = ( - config.calibration_target_geo_levels - if config.calibration_target_geo_levels is not None - else base.policyengine_calibration_target_geo_levels or target_geo_levels - ) - return replace( - base, - n_synthetic=config.n_synthetic, - random_seed=config.random_seed, - policyengine_targets_db=( - str(config.targets_db) - if config.targets_db is not None - else base.policyengine_targets_db - ), - policyengine_baseline_dataset=( - str(config.baseline_dataset) - if config.baseline_dataset is not None - else base.policyengine_baseline_dataset - ), - policyengine_target_period=config.target_period, - policyengine_target_variables=target_variables, - policyengine_target_domains=target_domains, - policyengine_target_geo_levels=target_geo_levels, - policyengine_target_profile=config.target_profile or base.policyengine_target_profile, - policyengine_calibration_target_variables=calibration_target_variables, - policyengine_calibration_target_domains=calibration_target_domains, - policyengine_calibration_target_geo_levels=calibration_target_geo_levels, - 
policyengine_calibration_target_profile=( - config.calibration_target_profile - or base.policyengine_calibration_target_profile - ), - policyengine_dataset_year=base.policyengine_dataset_year or config.target_period, - ) - - -@dataclass -class USMicroplexPerformanceSession: - """Reusable local optimization session with a persistent PE-US comparison cache.""" - - comparison_cache: PolicyEngineUSComparisonCache = field( - default_factory=PolicyEngineUSComparisonCache - ) - frame_cache: dict[SourceQueryCacheKey, ObservationFrame] = field(default_factory=dict) - precalibration_cache: dict[PreCalibrationCacheKey, USMicroplexPreCalibrationCacheEntry] = field( - default_factory=dict - ) - calibration_cache: dict[CalibrationCacheKey, USMicroplexCalibrationCacheEntry] = field( - default_factory=dict - ) - - def warm_parity_cache( - self, - *, - config: USMicroplexPerformanceHarnessConfig, - ) -> PolicyEngineUSComparisonCache: - return warm_us_microplex_parity_cache( - config=config, - comparison_cache=self.comparison_cache, - ) - - def run( - self, - providers: list[SourceProvider], - *, - config: USMicroplexPerformanceHarnessConfig, - queries: dict[str, SourceQuery] | None = None, - ) -> USMicroplexPerformanceHarnessResult: - return run_us_microplex_performance_harness( - providers, - config=config, - queries=queries, - comparison_cache=self.comparison_cache, - frame_cache=self.frame_cache, - precalibration_cache=self.precalibration_cache, - calibration_cache=self.calibration_cache, - ) - - def run_batch( - self, - requests: list[USMicroplexPerformanceHarnessRequest] - | tuple[USMicroplexPerformanceHarnessRequest, ...], - ) -> tuple[USMicroplexPerformanceHarnessResult, ...]: - """Run multiple requests with shared caches and grouped PE-native batch scoring.""" - - if not requests: - return () - - indexed_results: list[USMicroplexPerformanceHarnessResult] = [] - batch_groups: dict[ - tuple[str, int, str | None], - list[tuple[int, str, USMicroplexPerformanceHarnessConfig]], - 
] = {} - pending_target_deltas: list[tuple[int, str, USMicroplexPerformanceHarnessConfig]] = [] - - with TemporaryDirectory(prefix="microplex-us-harness-batch-") as temp_dir: - temp_root = Path(temp_dir) - for index, request in enumerate(requests): - original_config = request.config - should_batch_native_loss = ( - original_config.evaluate_pe_native_loss - and not original_config.optimize_pe_native_loss - and original_config.baseline_dataset is not None - ) - dataset_output_path = original_config.output_policyengine_dataset_path - if should_batch_native_loss and dataset_output_path is None: - dataset_output_path = temp_root / f"candidate_{index}.h5" - - run_config = original_config - if should_batch_native_loss: - run_config = replace( - original_config, - evaluate_pe_native_loss=False, - output_json_path=None, - output_pe_native_target_delta_path=None, - output_policyengine_dataset_path=dataset_output_path, - ) - - result = self.run( - list(request.providers), - config=run_config, - queries=request.queries, - ) - if should_batch_native_loss: - if result.policyengine_dataset_path is None: - raise ValueError( - "Batched PE-native scoring requires an exported policyengine dataset path" - ) - result = replace(result, config=original_config) - group_key = ( - str(original_config.baseline_dataset), - ( - result.build_config.policyengine_dataset_year - or original_config.target_period - ), - ( - str(original_config.policyengine_us_data_repo) - if original_config.policyengine_us_data_repo is not None - else None - ), - ) - batch_groups.setdefault(group_key, []).append( - (index, result.policyengine_dataset_path, original_config) - ) - if original_config.output_pe_native_target_delta_path is not None: - pending_target_deltas.append( - (index, result.policyengine_dataset_path, original_config) - ) - indexed_results.append(result) - - for group_key, group_items in batch_groups.items(): - baseline_dataset, period, policyengine_us_data_repo = group_key - payloads = 
compute_batch_us_pe_native_scores( - candidate_dataset_paths=[ - candidate_path for _, candidate_path, _ in group_items - ], - baseline_dataset_path=baseline_dataset, - period=period, - policyengine_us_data_repo=policyengine_us_data_repo, - ) - if len(payloads) != len(group_items): - raise ValueError( - "PE-native batch scorer returned a different number of payloads than requests" - ) - for (result_index, _candidate_path, original_config), payload in zip( - group_items, - payloads, - strict=True, - ): - stage_timings = dict(indexed_results[result_index].stage_timings) - timing = payload.get("timing") - if isinstance(timing, dict): - batch_elapsed = timing.get("batch_elapsed_seconds") - if batch_elapsed is not None: - stage_timings["evaluate_pe_native_loss"] = float(batch_elapsed) - updated_result = replace( - indexed_results[result_index], - config=original_config, - stage_timings=stage_timings, - pe_native_scores=payload, - ) - indexed_results[result_index] = updated_result - - for result_index, candidate_path, original_config in pending_target_deltas: - target_deltas = compare_us_pe_native_target_deltas( - from_dataset_path=str(original_config.baseline_dataset), - to_dataset_path=candidate_path, - period=( - indexed_results[result_index].build_config.policyengine_dataset_year - or original_config.target_period - ), - top_k=original_config.pe_native_target_delta_top_k, - policyengine_us_data_repo=original_config.policyengine_us_data_repo, - ) - destination = Path(original_config.output_pe_native_target_delta_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text( - json.dumps( - _json_compatible_value(target_deltas), - indent=2, - sort_keys=True, - ) - ) - stage_timings = dict(indexed_results[result_index].stage_timings) - stage_timings.setdefault("evaluate_pe_native_target_deltas", 0.0) - stage_timings.setdefault("write_pe_native_target_delta_json", 0.0) - indexed_results[result_index] = replace( - indexed_results[result_index], - 
stage_timings=stage_timings, - pe_native_target_deltas=target_deltas, - ) - - final_results: list[USMicroplexPerformanceHarnessResult] = [] - for result in indexed_results: - if result.config.output_json_path is not None: - result.save(result.config.output_json_path) - final_results.append(result) - - return tuple(final_results) - - -def _union_target_set(target_sets: dict[str, TargetSet]) -> TargetSet: - union = TargetSet() - seen_names: set[str] = set() - for target_set in target_sets.values(): - for target in target_set.targets: - if target.name in seen_names: - continue - seen_names.add(target.name) - union.add(target) - return union - - -def warm_us_microplex_parity_cache( - *, - config: USMicroplexPerformanceHarnessConfig, - comparison_cache: PolicyEngineUSComparisonCache | None = None, -) -> PolicyEngineUSComparisonCache: - """Preload target slices and the baseline PE-US report into a reusable cache.""" - if config.targets_db is None or config.baseline_dataset is None: - raise ValueError( - "warm_us_microplex_parity_cache requires both targets_db and baseline_dataset" - ) - - cache = comparison_cache or PolicyEngineUSComparisonCache() - build_config = _resolve_build_config(config) - target_provider = PolicyEngineUSDBTargetProvider(str(config.targets_db)) - slices = default_policyengine_us_db_harness_slices( - period=config.target_period, - variables=build_config.policyengine_target_variables, - domain_variables=build_config.policyengine_target_domains, - geo_levels=build_config.policyengine_target_geo_levels, - reform_id=build_config.policyengine_target_reform_id, - ) - slices = filter_nonempty_policyengine_us_harness_slices( - target_provider, - slices, - cache=cache, - ) - slice_target_sets = { - slice_spec.name: cache.load_target_set(target_provider, slice_spec.query) - for slice_spec in slices - } - union_target_set = _union_target_set(slice_target_sets) - cache.load_baseline_report( - target_set=union_target_set, - 
baseline_dataset=str(config.baseline_dataset), - period=config.target_period, - dataset_year=build_config.policyengine_dataset_year, - simulation_cls=build_config.policyengine_simulation_cls, - baseline_label="policyengine_baseline", - strict_materialization=config.strict_materialization, - ) - return cache - - -def run_us_microplex_performance_harness( - providers: list[SourceProvider], - *, - config: USMicroplexPerformanceHarnessConfig, - queries: dict[str, SourceQuery] | None = None, - comparison_cache: PolicyEngineUSComparisonCache | None = None, - frame_cache: dict[SourceQueryCacheKey, ObservationFrame] | None = None, - precalibration_cache: dict[PreCalibrationCacheKey, USMicroplexPreCalibrationCacheEntry] - | None = None, - calibration_cache: dict[CalibrationCacheKey, USMicroplexCalibrationCacheEntry] - | None = None, -) -> USMicroplexPerformanceHarnessResult: - """Run a repeatable build+parity loop with stage-level timings.""" - if not providers: - raise ValueError("USMicroplex performance harness requires at least one provider") - if config.evaluate_parity and ( - config.targets_db is None or config.baseline_dataset is None - ): - raise ValueError( - "USMicroplex performance harness requires both targets_db and baseline_dataset" - ) - if config.evaluate_pe_native_loss and config.baseline_dataset is None: - raise ValueError( - "USMicroplex performance harness requires baseline_dataset for PE-native loss scoring" - ) - if config.evaluate_matched_pe_native_loss and config.baseline_dataset is None: - raise ValueError( - "USMicroplex performance harness requires baseline_dataset for matched PE-native loss scoring" - ) - if config.reweight_matched_pe_native_loss and not config.evaluate_matched_pe_native_loss: - raise ValueError( - "reweight_matched_pe_native_loss requires evaluate_matched_pe_native_loss" - ) - if config.optimize_pe_native_loss and not config.evaluate_pe_native_loss: - raise ValueError( - "USMicroplex performance harness requires 
evaluate_pe_native_loss when optimize_pe_native_loss is enabled" - ) - if ( - config.pe_native_household_budget is not None - and config.pe_native_household_budget <= 0 - ): - raise ValueError("pe_native_household_budget must be positive when provided") - if config.pe_native_score_consistency_tol <= 0.0: - raise ValueError("pe_native_score_consistency_tol must be positive") - if config.pe_native_target_delta_top_k <= 0: - raise ValueError("pe_native_target_delta_top_k must be positive") - if ( - config.matched_baseline_household_count is not None - and config.matched_baseline_household_count <= 0 - ): - raise ValueError("matched_baseline_household_count must be positive") - if config.matched_baseline_reweight_epochs <= 0: - raise ValueError("matched_baseline_reweight_epochs must be positive") - if config.matched_baseline_reweight_l0_lambda < 0.0: - raise ValueError("matched_baseline_reweight_l0_lambda must be nonnegative") - if ( - config.output_pe_native_target_delta_path is not None - and config.baseline_dataset is None - ): - raise ValueError( - "USMicroplex performance harness requires baseline_dataset for PE-native target deltas" - ) - if ( - config.output_pe_native_support_audit_path is not None - and config.baseline_dataset is None - ): - raise ValueError( - "USMicroplex performance harness requires baseline_dataset for PE-native support audit" - ) - - build_config = _resolve_build_config(config) - pipeline = USMicroplexPipeline(build_config) - provider_queries = dict(queries or {}) - for provider in providers: - provider_queries.setdefault( - provider.descriptor.name, - _default_provider_query(config), - ) - - stage_timings: dict[str, float] = {} - total_start = perf_counter() - - start = _stage(stage_timings, "load_frames") - frames, frame_keys = _load_frames( - providers, - provider_queries, - frame_cache=frame_cache, - ) - _finish_stage(stage_timings, "load_frames", start) - precalibration_key = ( - tuple(frame_keys), - 
_precalibration_build_config_key(build_config), - ) - precalibration = ( - precalibration_cache.get(precalibration_key) - if precalibration_cache is not None - else None - ) - - if precalibration is None: - start = _stage(stage_timings, "prepare_source_inputs") - source_inputs = [pipeline.prepare_source_input(frame) for frame in frames] - fusion_plan = FusionPlan.from_sources([frame.source for frame in frames]) - scaffold_input = pipeline._select_scaffold_source(source_inputs) - _finish_stage(stage_timings, "prepare_source_inputs", start) - - start = _stage(stage_timings, "prepare_seed_data") - seed_data = pipeline.prepare_seed_data_from_source(scaffold_input) - _finish_stage(stage_timings, "prepare_seed_data", start) - - start = _stage(stage_timings, "integrate_donor_sources") - donor_integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=scaffold_input, - donor_inputs=[ - source for source in source_inputs if source is not scaffold_input - ], - ) - seed_data = donor_integration["seed_data"] - _finish_stage(stage_timings, "integrate_donor_sources", start) - - start = _stage(stage_timings, "build_targets") - targets = pipeline.build_targets(seed_data) - _finish_stage(stage_timings, "build_targets", start) - - start = _stage(stage_timings, "resolve_synthesis_variables") - synthesis_variables = pipeline._resolve_synthesis_variables( - scaffold_input, - fusion_plan=fusion_plan, - include_all_observed_targets=len(source_inputs) > 1, - available_columns=set(seed_data.columns), - ) - _finish_stage(stage_timings, "resolve_synthesis_variables", start) - - start = _stage(stage_timings, "synthesize") - synthetic_data, synthesizer, synthesis_metadata = pipeline.synthesize( - seed_data, - synthesis_variables=synthesis_variables, - ) - _finish_stage(stage_timings, "synthesize", start) - - start = _stage(stage_timings, "ensure_target_support") - synthetic_data = pipeline.ensure_target_support( - synthetic_data, - seed_data, - targets, - ) - 
_finish_stage(stage_timings, "ensure_target_support", start) - - start = _stage(stage_timings, "build_policyengine_tables") - synthetic_tables = pipeline.build_policyengine_entity_tables(synthetic_data) - _finish_stage(stage_timings, "build_policyengine_tables", start) - - synthesis_metadata = { - **synthesis_metadata, - "source_names": fusion_plan.source_names, - "condition_vars": list(synthesis_variables.condition_vars), - "target_vars": list(synthesis_variables.target_vars), - "scaffold_source": scaffold_input.frame.source.name, - "donor_integrated_variables": donor_integration["integrated_variables"], - } - precalibration = USMicroplexPreCalibrationCacheEntry( - seed_data=seed_data, - synthetic_data=synthetic_data, - synthetic_tables=synthetic_tables, - targets=targets, - synthesis_metadata=synthesis_metadata, - synthesizer=synthesizer, - source_frame=scaffold_input.frame, - source_frames=tuple(frames), - fusion_plan=fusion_plan, - ) - if precalibration_cache is not None: - precalibration_cache[precalibration_key] = precalibration - else: - _mark_skipped_stages(stage_timings, PRECALIBRATION_STAGE_NAMES) - - calibration_key = ( - precalibration_key, - _calibration_build_config_key(build_config), - ) - calibration = ( - calibration_cache.get(calibration_key) - if calibration_cache is not None - else None - ) - if calibration is None: - start = _stage(stage_timings, "calibrate_policyengine_tables") - policyengine_tables, calibrated_data, calibration_summary = ( - pipeline.calibrate_policyengine_tables( - _clone_policyengine_us_tables(precalibration.synthetic_tables) - ) - ) - _finish_stage(stage_timings, "calibrate_policyengine_tables", start) - calibration = USMicroplexCalibrationCacheEntry( - policyengine_tables=_clone_policyengine_us_tables(policyengine_tables), - calibrated_data=calibrated_data.copy(deep=True), - calibration_summary=dict(calibration_summary), - ) - if calibration_cache is not None: - calibration_cache[calibration_key] = calibration - else: - 
stage_timings.setdefault("calibrate_policyengine_tables", 0.0) - - calibration = _clone_calibration_cache_entry(calibration) - - build_result = USMicroplexBuildResult( - config=build_config, - seed_data=precalibration.seed_data, - synthetic_data=precalibration.synthetic_data, - calibrated_data=calibration.calibrated_data, - targets=precalibration.targets, - calibration_summary=calibration.calibration_summary, - synthesis_metadata=dict(precalibration.synthesis_metadata), - synthesizer=precalibration.synthesizer, - policyengine_tables=calibration.policyengine_tables, - source_frame=precalibration.source_frame, - source_frames=precalibration.source_frames, - fusion_plan=precalibration.fusion_plan, - ) - - parity_run = None - if config.evaluate_parity: - start = _stage(stage_timings, "evaluate_parity_harness") - target_provider = PolicyEngineUSDBTargetProvider(str(config.targets_db)) - slices = default_policyengine_us_db_harness_slices( - period=config.target_period, - variables=build_config.policyengine_target_variables, - domain_variables=build_config.policyengine_target_domains, - geo_levels=build_config.policyengine_target_geo_levels, - reform_id=build_config.policyengine_target_reform_id, - ) - slices = filter_nonempty_policyengine_us_harness_slices( - target_provider, - slices, - cache=comparison_cache, - ) - parity_run = evaluate_policyengine_us_harness( - build_result.policyengine_tables, - target_provider, - slices, - baseline_dataset=str(config.baseline_dataset), - dataset_year=build_config.policyengine_dataset_year, - simulation_cls=build_config.policyengine_simulation_cls, - metadata={ - "sample_n": config.sample_n, - "n_synthetic": config.n_synthetic, - "source_names": precalibration.fusion_plan.source_names, - "target_variables": list(build_config.policyengine_target_variables), - "calibration_target_variables": list( - build_config.policyengine_calibration_target_variables - ), - }, - strict_materialization=config.strict_materialization, - 
cache=comparison_cache, - ) - _finish_stage(stage_timings, "evaluate_parity_harness", start) - - policyengine_dataset_path = None - pe_native_scores = None - matched_pe_native_scores = None - pe_native_target_deltas = None - pe_native_support_audit = None - needs_pe_dataset = ( - config.evaluate_pe_native_loss - or config.evaluate_matched_pe_native_loss - or config.output_pe_native_target_delta_path is not None - or config.output_pe_native_support_audit_path is not None - ) - if needs_pe_dataset: - with TemporaryDirectory(prefix="microplex-us-native-score-") as temp_dir: - pe_stage_started = False - candidate_dataset_path = pipeline.export_policyengine_dataset( - build_result, - Path(temp_dir) / "candidate_policyengine_us.h5", - direct_override_variables=build_config.policyengine_direct_override_variables, - ) - dataset_to_score = candidate_dataset_path - pe_native_optimization = None - if config.optimize_pe_native_loss: - if not pe_stage_started: - start = _stage(stage_timings, "evaluate_pe_native_loss") - pe_stage_started = True - optimize_start = _stage( - stage_timings, - "optimize_pe_native_loss_weights", - ) - optimized_dataset_path = ( - Path(temp_dir) / "candidate_policyengine_us_optimized.h5" - ) - optimization_result = optimize_policyengine_us_native_loss_dataset( - input_dataset_path=candidate_dataset_path, - output_dataset_path=optimized_dataset_path, - period=build_config.policyengine_dataset_year or config.target_period, - budget=config.pe_native_household_budget, - max_iter=config.pe_native_optimizer_max_iter, - l2_penalty=config.pe_native_optimizer_l2_penalty, - tol=config.pe_native_optimizer_tol, - policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - dataset_to_score = optimized_dataset_path - pe_native_optimization = optimization_result.to_dict() - _finish_stage( - stage_timings, - "optimize_pe_native_loss_weights", - optimize_start, - ) - if config.evaluate_pe_native_loss: - if not pe_stage_started: - start = _stage(stage_timings, 
"evaluate_pe_native_loss") - pe_stage_started = True - pe_native_scores = compute_us_pe_native_scores( - candidate_dataset_path=dataset_to_score, - baseline_dataset_path=str(config.baseline_dataset), - period=build_config.policyengine_dataset_year or config.target_period, - policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - if pe_native_optimization is not None: - summary = pe_native_scores.get("summary") - if not isinstance(summary, dict): - raise ValueError( - "PE-native optimization requires score summary metadata for consistency validation" - ) - rescored_loss = summary.get("candidate_enhanced_cps_native_loss") - if rescored_loss is None: - raise ValueError( - "PE-native optimization consistency validation requires candidate_enhanced_cps_native_loss" - ) - abs_error = abs( - float(rescored_loss) - float(pe_native_optimization["optimized_loss"]) - ) - if abs_error > config.pe_native_score_consistency_tol: - raise ValueError( - "PE-native optimized loss does not match rescored loss within tolerance: " - f"{abs_error:.6g} > {config.pe_native_score_consistency_tol:.6g}" - ) - pe_native_scores = dict(pe_native_scores) - pe_native_optimization = dict(pe_native_optimization) - pe_native_optimization["rescored_loss_abs_error"] = abs_error - pe_native_scores["optimization"] = pe_native_optimization - matched_baseline_dataset_path = None - if config.evaluate_matched_pe_native_loss: - matched_start = _stage( - stage_timings, - "build_matched_baseline_dataset", - ) - candidate_household_count = len(build_result.policyengine_tables.households) - matched_baseline_dataset_path = _write_matched_policyengine_us_baseline_dataset( - config.baseline_dataset, - config.output_matched_baseline_dataset_path - or (Path(temp_dir) / "matched_baseline_policyengine_us.h5"), - period=build_config.policyengine_dataset_year or config.target_period, - household_count=( - config.matched_baseline_household_count - or candidate_household_count - ), - 
random_seed=config.matched_baseline_random_seed, - ) - _finish_stage( - stage_timings, - "build_matched_baseline_dataset", - matched_start, - ) - if config.reweight_matched_pe_native_loss: - reweight_start = _stage( - stage_timings, - "reweight_matched_baseline_dataset", - ) - matched_baseline_dataset_path = _reweight_matched_policyengine_us_baseline_dataset( - matched_baseline_dataset_path, - config.output_matched_baseline_dataset_path - or (Path(temp_dir) / "matched_baseline_policyengine_us_reweighted.h5"), - period=build_config.policyengine_dataset_year or config.target_period, - epochs=config.matched_baseline_reweight_epochs, - l0_lambda=config.matched_baseline_reweight_l0_lambda, - seed=config.matched_baseline_reweight_seed, - policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - _finish_stage( - stage_timings, - "reweight_matched_baseline_dataset", - reweight_start, - ) - matched_score_start = _stage( - stage_timings, - "evaluate_matched_pe_native_loss", - ) - matched_pe_native_scores = compute_us_pe_native_scores( - candidate_dataset_path=dataset_to_score, - baseline_dataset_path=matched_baseline_dataset_path, - period=build_config.policyengine_dataset_year or config.target_period, - policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - _finish_stage( - stage_timings, - "evaluate_matched_pe_native_loss", - matched_score_start, - ) - if config.output_policyengine_dataset_path is not None: - policyengine_dataset_path = _copy_output_file( - dataset_to_score, - config.output_policyengine_dataset_path, - ) - if config.output_pe_native_target_delta_path is not None: - delta_start = _stage(stage_timings, "evaluate_pe_native_target_deltas") - pe_native_target_deltas = compare_us_pe_native_target_deltas( - from_dataset_path=str(config.baseline_dataset), - to_dataset_path=dataset_to_score, - period=build_config.policyengine_dataset_year or config.target_period, - top_k=config.pe_native_target_delta_top_k, - 
policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - _finish_stage( - stage_timings, - "evaluate_pe_native_target_deltas", - delta_start, - ) - write_delta_start = _stage( - stage_timings, - "write_pe_native_target_delta_json", - ) - destination = Path(config.output_pe_native_target_delta_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text( - json.dumps( - _json_compatible_value(pe_native_target_deltas), - indent=2, - sort_keys=True, - ) - ) - _finish_stage( - stage_timings, - "write_pe_native_target_delta_json", - write_delta_start, - ) - if config.output_pe_native_support_audit_path is not None: - support_start = _stage( - stage_timings, - "evaluate_pe_native_support_audit", - ) - pe_native_support_audit = compute_us_pe_native_support_audit( - candidate_dataset_path=dataset_to_score, - baseline_dataset_path=str(config.baseline_dataset), - period=build_config.policyengine_dataset_year or config.target_period, - policyengine_us_data_repo=config.policyengine_us_data_repo, - ) - _finish_stage( - stage_timings, - "evaluate_pe_native_support_audit", - support_start, - ) - write_support_start = _stage( - stage_timings, - "write_pe_native_support_audit_json", - ) - destination = Path(config.output_pe_native_support_audit_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text( - json.dumps( - _json_compatible_value(pe_native_support_audit), - indent=2, - sort_keys=True, - ) - ) - _finish_stage( - stage_timings, - "write_pe_native_support_audit_json", - write_support_start, - ) - if pe_stage_started: - _finish_stage(stage_timings, "evaluate_pe_native_loss", start) - elif config.output_policyengine_dataset_path is not None: - start = _stage(stage_timings, "write_policyengine_dataset") - policyengine_dataset_path = str( - pipeline.export_policyengine_dataset( - build_result, - config.output_policyengine_dataset_path, - direct_override_variables=build_config.policyengine_direct_override_variables, 
- ) - ) - _finish_stage(stage_timings, "write_policyengine_dataset", start) - - total_seconds = perf_counter() - total_start - result = USMicroplexPerformanceHarnessResult( - config=config, - build_config=build_config, - build_result=build_result, - source_names=precalibration.fusion_plan.source_names, - stage_timings=stage_timings, - total_seconds=total_seconds, - parity_run=parity_run, - pe_native_scores=pe_native_scores, - matched_pe_native_scores=matched_pe_native_scores, - pe_native_target_deltas=pe_native_target_deltas, - pe_native_support_audit=pe_native_support_audit, - policyengine_dataset_path=policyengine_dataset_path, - matched_baseline_dataset_path=( - matched_baseline_dataset_path if needs_pe_dataset else None - ), - ) - if config.output_json_path is not None: - start = _stage(stage_timings, "write_output_json") - result.save(config.output_json_path) - _finish_stage(stage_timings, "write_output_json", start) - return result - - -__all__ = [ - "USMicroplexPerformanceHarnessConfig", - "USMicroplexPerformanceHarnessRequest", - "USMicroplexPerformanceHarnessResult", - "USMicroplexCalibrationCacheEntry", - "USMicroplexPreCalibrationCacheEntry", - "USMicroplexPerformanceSession", - "default_fast_calibration_target_variables", - "run_us_microplex_performance_harness", - "warm_us_microplex_parity_cache", -] diff --git a/src/microplex_us/pipelines/pre_sim_parity.py b/src/microplex_us/pipelines/pre_sim_parity.py deleted file mode 100644 index 7dc35a70..00000000 --- a/src/microplex_us/pipelines/pre_sim_parity.py +++ /dev/null @@ -1,623 +0,0 @@ -"""Audit candidate PE-US datasets against PE's pre-sim input surface.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd - -from microplex_us.pipelines.source_stage_parity import ( - _compare_series, - _normalize_categorical_series, - _resolve_bundle_variable, - _summarize_series, -) -from 
microplex_us.policyengine.us import ( - _decode_policyengine_array, - _load_policyengine_us_period_arrays, - load_policyengine_us_entity_tables, -) - - -@dataclass(frozen=True) -class PreSimParityVariableSpec: - """One pre-sim input comparison between candidate and reference datasets.""" - - label: str - candidate_variable: str - reference_variable: str | None = None - value_kind: str = "auto" - - @property - def resolved_reference_variable(self) -> str: - return self.reference_variable or self.candidate_variable - - -DEFAULT_PRE_SIM_FOCUS_VARIABLES: tuple[PreSimParityVariableSpec, ...] = ( - PreSimParityVariableSpec("age", "age", value_kind="numeric"), - PreSimParityVariableSpec("state_fips", "state_fips", value_kind="categorical"), - PreSimParityVariableSpec("county_fips", "county_fips", value_kind="categorical"), - PreSimParityVariableSpec( - "employment_income_before_lsr", - "employment_income_before_lsr", - value_kind="numeric", - ), - PreSimParityVariableSpec( - "self_employment_income_before_lsr", - "self_employment_income_before_lsr", - value_kind="numeric", - ), - PreSimParityVariableSpec("dividend_income", "dividend_income", value_kind="numeric"), - PreSimParityVariableSpec("interest_income", "interest_income", value_kind="numeric"), - PreSimParityVariableSpec( - "long_term_capital_gains_before_response", - "long_term_capital_gains_before_response", - value_kind="numeric", - ), - PreSimParityVariableSpec( - "partnership_s_corp_income", - "partnership_s_corp_income", - value_kind="numeric", - ), - PreSimParityVariableSpec("farm_income", "farm_income", value_kind="numeric"), - PreSimParityVariableSpec("rent", "rent", value_kind="numeric"), - PreSimParityVariableSpec( - "real_estate_taxes", - "real_estate_taxes", - value_kind="numeric", - ), - PreSimParityVariableSpec("net_worth", "net_worth", value_kind="numeric"), - PreSimParityVariableSpec( - "health_savings_account_ald", - "health_savings_account_ald", - value_kind="numeric", - ), - 
PreSimParityVariableSpec( - "self_employed_health_insurance_ald", - "self_employed_health_insurance_ald", - value_kind="numeric", - ), - PreSimParityVariableSpec( - "self_employed_pension_contribution_ald", - "self_employed_pension_contribution_ald", - value_kind="numeric", - ), - PreSimParityVariableSpec( - "is_household_head", - "is_household_head", - value_kind="categorical", - ), - PreSimParityVariableSpec("is_hispanic", "is_hispanic", value_kind="categorical"), - PreSimParityVariableSpec("is_disabled", "is_disabled", value_kind="categorical"), - PreSimParityVariableSpec("has_esi", "has_esi", value_kind="categorical"), - PreSimParityVariableSpec( - "has_marketplace_health_coverage", - "has_marketplace_health_coverage", - value_kind="categorical", - ), - PreSimParityVariableSpec("has_medicare", "has_medicare", value_kind="categorical"), - PreSimParityVariableSpec("has_medicaid", "has_medicaid", value_kind="categorical"), -) - -DEFAULT_CRITICAL_REFERENCE_VARIABLES: tuple[str, ...] = ( - "county_fips", - "cps_race", - "is_household_head", - "is_hispanic", - "is_disabled", - "rent", - "real_estate_taxes", - "net_worth", - "has_esi", - "has_marketplace_health_coverage", -) - -_AGE_BIN_LABELS: tuple[str, ...] = tuple( - [f"{start}-{start + 4}" for start in range(0, 85, 5)] + ["85+"] -) -_AGE_BIN_EDGES = np.array(list(range(0, 90, 5)) + [200], dtype=float) - - -def build_us_pre_sim_parity_audit( - candidate_dataset: str | Path, - reference_dataset: str | Path, - *, - period: int = 2024, - focus_variables: tuple[PreSimParityVariableSpec | str, ...] - | list[PreSimParityVariableSpec | str] = DEFAULT_PRE_SIM_FOCUS_VARIABLES, - critical_reference_variables: tuple[str, ...] 
- | list[str] = DEFAULT_CRITICAL_REFERENCE_VARIABLES, -) -> dict[str, Any]: - """Compare one candidate PE-US dataset to PE's own pre-sim input dataset.""" - - candidate_path = Path(candidate_dataset).resolve() - reference_path = Path(reference_dataset).resolve() - period_key = str(period) - candidate_arrays = _load_policyengine_us_period_arrays( - candidate_path, - period_key=period_key, - variables=None, - ) - reference_arrays = _load_policyengine_us_period_arrays( - reference_path, - period_key=period_key, - variables=None, - ) - candidate_variables = set(candidate_arrays) - reference_variables = set(reference_arrays) - common_variables = sorted(candidate_variables & reference_variables) - missing_in_candidate = sorted(reference_variables - candidate_variables) - extra_in_candidate = sorted(candidate_variables - reference_variables) - - candidate_bundle = load_policyengine_us_entity_tables(candidate_path, period=period) - reference_bundle = load_policyengine_us_entity_tables(reference_path, period=period) - - focus = _normalize_focus_variable_specs(focus_variables) - critical = tuple( - dict.fromkeys(str(variable) for variable in critical_reference_variables) - ) - - return { - "period": period, - "candidate_dataset": str(candidate_path), - "reference_dataset": str(reference_path), - "schema": { - "candidate_variable_count": len(candidate_variables), - "reference_variable_count": len(reference_variables), - "common_variable_count": len(common_variables), - "missing_in_candidate_count": len(missing_in_candidate), - "extra_in_candidate_count": len(extra_in_candidate), - "schema_recall": _safe_ratio(len(common_variables), len(reference_variables)), - "schema_precision": _safe_ratio(len(common_variables), len(candidate_variables)), - "missing_in_candidate": missing_in_candidate, - "extra_in_candidate": extra_in_candidate, - "missing_critical_reference_variables": [ - variable - for variable in critical - if variable in reference_variables and variable not in 
candidate_variables - ], - }, - "entity_structure": { - "candidate": _entity_structure_summary(candidate_bundle), - "reference": _entity_structure_summary(reference_bundle), - }, - "focus_variables": { - spec.label: _variable_comparison( - spec=spec, - candidate_bundle=candidate_bundle, - reference_bundle=reference_bundle, - candidate_arrays=candidate_arrays, - reference_arrays=reference_arrays, - ) - for spec in focus - }, - "state_age_support": _state_age_support_comparison( - candidate_bundle=candidate_bundle, - reference_bundle=reference_bundle, - ), - } - - -def write_us_pre_sim_parity_audit( - candidate_dataset: str | Path, - reference_dataset: str | Path, - output_path: str | Path, - *, - period: int = 2024, - focus_variables: tuple[PreSimParityVariableSpec | str, ...] - | list[PreSimParityVariableSpec | str] = DEFAULT_PRE_SIM_FOCUS_VARIABLES, - critical_reference_variables: tuple[str, ...] - | list[str] = DEFAULT_CRITICAL_REFERENCE_VARIABLES, -) -> Path: - """Build and persist one PE pre-sim parity audit as JSON.""" - - output = Path(output_path).resolve() - payload = build_us_pre_sim_parity_audit( - candidate_dataset, - reference_dataset, - period=period, - focus_variables=focus_variables, - critical_reference_variables=critical_reference_variables, - ) - output.parent.mkdir(parents=True, exist_ok=True) - output.write_text(json.dumps(payload, indent=2, sort_keys=True)) - return output - - -def _entity_structure_summary(bundle) -> dict[str, Any]: - households = bundle.households - persons = bundle.persons - tax_units = bundle.tax_units - families = bundle.families - spm_units = bundle.spm_units - marital_units = bundle.marital_units - - summary: dict[str, Any] = { - "household_rows": int(len(households)), - "person_rows": int(len(persons)) if persons is not None else 0, - "tax_unit_rows": int(len(tax_units)) if tax_units is not None else 0, - "family_rows": int(len(families)) if families is not None else 0, - "spm_unit_rows": int(len(spm_units)) if 
spm_units is not None else 0, - "marital_unit_rows": int(len(marital_units)) if marital_units is not None else 0, - } - if persons is None: - return summary - - household_sizes = persons.groupby("household_id", observed=True).size() - summary["mean_household_size"] = float(household_sizes.mean()) - summary["share_multi_person_households"] = float((household_sizes >= 2).mean()) - - if "tax_unit_id" in persons.columns: - grouped = persons.dropna(subset=["tax_unit_id"]) - if not grouped.empty: - tax_unit_sizes = grouped.groupby("tax_unit_id", observed=True).size() - summary["mean_tax_unit_size"] = float(tax_unit_sizes.mean()) - summary["share_multi_person_tax_units"] = float((tax_unit_sizes >= 2).mean()) - - if "is_household_head" in persons.columns: - head_flags = _boolean_series(persons["is_household_head"]) - head_counts = ( - persons.assign(_is_head=head_flags) - .groupby("household_id", observed=True)["_is_head"] - .sum(min_count=1) - ) - summary["households_with_exactly_one_head_share"] = float( - (head_counts == 1).mean() - ) - summary["households_with_no_head_count"] = int((head_counts == 0).sum()) - - return summary - - -def _state_age_support_comparison(*, candidate_bundle, reference_bundle) -> dict[str, Any]: - candidate = _state_age_support(candidate_bundle) - reference = _state_age_support(reference_bundle) - candidate_cells = set(candidate["cell_counts"]) - reference_cells = set(reference["cell_counts"]) - missing_cells = reference_cells - candidate_cells - missing_by_state: dict[str, int] = {} - for state, _age_bin in missing_cells: - missing_by_state[state] = missing_by_state.get(state, 0) + 1 - top_missing_states = sorted( - ( - {"state_fips": state, "missing_cell_count": count} - for state, count in missing_by_state.items() - ), - key=lambda row: (-row["missing_cell_count"], row["state_fips"]), - )[:10] - return { - "candidate": { - "nonempty_cell_count": len(candidate_cells), - "state_count": candidate["state_count"], - "support_rate": 
candidate["support_rate"], - }, - "reference": { - "nonempty_cell_count": len(reference_cells), - "state_count": reference["state_count"], - "support_rate": reference["support_rate"], - }, - "support_recall": _safe_ratio(len(candidate_cells & reference_cells), len(reference_cells)), - "missing_cell_count": len(missing_cells), - "top_missing_states": top_missing_states, - } - - -def _state_age_support(bundle) -> dict[str, Any]: - persons = bundle.persons - households = bundle.households - if persons is None or "age" not in persons.columns or "state_fips" not in households.columns: - return {"cell_counts": {}, "state_count": 0, "support_rate": 0.0} - - merged = persons[["household_id", "age"]].merge( - households[["household_id", "state_fips"]], - on="household_id", - how="left", - ) - merged = merged.dropna(subset=["age", "state_fips"]).copy() - if merged.empty: - return {"cell_counts": {}, "state_count": 0, "support_rate": 0.0} - - merged["state_fips"] = merged["state_fips"].astype(int).astype(str).str.zfill(2) - merged["age_bin"] = pd.cut( - merged["age"].astype(float), - bins=_AGE_BIN_EDGES, - labels=_AGE_BIN_LABELS, - right=False, - include_lowest=True, - ) - merged = merged.dropna(subset=["age_bin"]) - cell_counts = ( - merged.groupby(["state_fips", "age_bin"], observed=True) - .size() - .astype(int) - .to_dict() - ) - state_count = int(merged["state_fips"].nunique()) - total_possible = state_count * len(_AGE_BIN_LABELS) - return { - "cell_counts": cell_counts, - "state_count": state_count, - "support_rate": _safe_ratio(len(cell_counts), total_possible), - } - - -def _variable_comparison( - *, - spec: PreSimParityVariableSpec, - candidate_bundle, - reference_bundle, - candidate_arrays: dict[str, np.ndarray], - reference_arrays: dict[str, np.ndarray], -) -> dict[str, Any]: - candidate_present = spec.candidate_variable in candidate_arrays - reference_present = spec.resolved_reference_variable in reference_arrays - result: dict[str, Any] = { - 
"candidate_variable": spec.candidate_variable, - "reference_variable": spec.resolved_reference_variable, - "candidate_present": candidate_present, - "reference_present": reference_present, - } - if not reference_present and not candidate_present: - return result - - reference_entry = _resolve_bundle_variable( - reference_bundle, - spec.resolved_reference_variable, - ) - candidate_entry = _resolve_bundle_variable( - candidate_bundle, - spec.candidate_variable, - preferred_entity=reference_entry["entity"] if reference_entry is not None else None, - ) - - if reference_entry is not None: - result["reference_entity"] = reference_entry["entity"].value - result["reference"] = _summarize_series( - reference_entry["series"], - weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - elif reference_present: - reference_values = _decode_policyengine_array( - reference_arrays[spec.resolved_reference_variable] - ) - result["reference"] = _summarize_values( - reference_values, - value_kind=spec.value_kind, - ) - - if candidate_entry is not None: - result["candidate_entity"] = candidate_entry["entity"].value - result["candidate"] = _summarize_series( - candidate_entry["series"], - weights=candidate_entry["weights"], - value_kind=spec.value_kind, - ) - elif candidate_present: - candidate_values = _decode_policyengine_array(candidate_arrays[spec.candidate_variable]) - result["candidate"] = _summarize_values( - candidate_values, - value_kind=spec.value_kind, - ) - - if candidate_entry is not None and reference_entry is not None: - result["comparison"] = _compare_series( - candidate_entry["series"], - reference_entry["series"], - candidate_weights=candidate_entry["weights"], - reference_weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - elif candidate_present and reference_present: - candidate_values = _decode_policyengine_array(candidate_arrays[spec.candidate_variable]) - reference_values = _decode_policyengine_array( - 
reference_arrays[spec.resolved_reference_variable] - ) - result["comparison"] = _compare_values( - candidate_values, - reference_values, - value_kind=spec.value_kind, - ) - return result - - -def _summarize_values(values: np.ndarray, *, value_kind: str = "auto") -> dict[str, Any]: - array = np.asarray(values) - if value_kind == "categorical" or array.dtype.kind in {"U", "S", "O"}: - series = pd.Series(array.astype(str)) - normalized = _normalize_categorical_series(series) - value_counts = normalized.dropna().value_counts() - return { - "kind": "categorical", - "n": int(len(series)), - "nonnull_share": _safe_ratio(int(normalized.notna().sum()), len(series)), - "weighted_nonnull_share": _safe_ratio(int(normalized.notna().sum()), len(series)), - "unique_count": int(normalized.nunique(dropna=True)), - "top_values": [ - { - "value": str(value), - "count": int(count), - "weighted_sum": float(count), - "weighted_share": _safe_ratio(int(count), len(normalized)), - } - for value, count in value_counts.head(10).items() - ], - } - - numeric = pd.Series(array).replace([np.inf, -np.inf], np.nan).dropna() - if numeric.empty: - return { - "kind": "numeric", - "n": int(len(array)), - "nonnull_share": 0.0, - "weighted_nonnull_share": 0.0, - } - - unique_count = int(numeric.nunique()) - is_categorical = value_kind == "categorical" or ( - value_kind == "auto" - and unique_count <= 64 - and numeric.dtype.kind in {"i", "u", "b"} - ) - if is_categorical: - value_counts = numeric.astype(int).astype(str).value_counts() - return { - "kind": "categorical", - "n": int(len(array)), - "nonnull_share": _safe_ratio(int(len(numeric)), len(array)), - "weighted_nonnull_share": _safe_ratio(int(len(numeric)), len(array)), - "unique_count": unique_count, - "top_values": [ - { - "value": str(value), - "count": int(count), - "weighted_sum": float(count), - "weighted_share": _safe_ratio(int(count), len(numeric)), - } - for value, count in value_counts.head(10).items() - ], - } - - numeric_values = 
numeric.astype(float) - return { - "kind": "numeric", - "n": int(len(array)), - "nonnull_share": _safe_ratio(int(len(numeric_values)), len(array)), - "weighted_nonnull_share": _safe_ratio(int(len(numeric_values)), len(array)), - "zero_share": float((numeric_values == 0).mean()), - "weighted_zero_share": float((numeric_values == 0).mean()), - "positive_share": float((numeric_values > 0).mean()), - "weighted_positive_share": float((numeric_values > 0).mean()), - "negative_share": float((numeric_values < 0).mean()), - "weighted_negative_share": float((numeric_values < 0).mean()), - "mean": float(numeric_values.mean()), - "weighted_mean": float(numeric_values.mean()), - "p50": float(np.quantile(numeric_values, 0.5)), - "p90": float(np.quantile(numeric_values, 0.9)), - "p99": float(np.quantile(numeric_values, 0.99)), - "sum": float(numeric_values.sum()), - "weighted_sum": float(numeric_values.sum()), - } - - -def _compare_values( - candidate: np.ndarray, - reference: np.ndarray, - *, - value_kind: str = "auto", -) -> dict[str, Any]: - candidate_array = np.asarray(candidate) - reference_array = np.asarray(reference) - if value_kind == "categorical" or ( - candidate_array.dtype.kind in {"U", "S", "O"} - or reference_array.dtype.kind in {"U", "S", "O"} - ): - return _compare_categorical(candidate_array.astype(str), reference_array.astype(str)) - - candidate_numeric = pd.Series(candidate_array).replace([np.inf, -np.inf], np.nan).dropna() - reference_numeric = pd.Series(reference_array).replace([np.inf, -np.inf], np.nan).dropna() - candidate_unique = int(candidate_numeric.nunique()) if not candidate_numeric.empty else 0 - reference_unique = int(reference_numeric.nunique()) if not reference_numeric.empty else 0 - if value_kind != "numeric" and max(candidate_unique, reference_unique) <= 64 and ( - candidate_numeric.dtype.kind in {"i", "u", "b"} - or reference_numeric.dtype.kind in {"i", "u", "b"} - ): - return _compare_categorical( - 
candidate_numeric.astype(int).astype(str).to_numpy(), - reference_numeric.astype(int).astype(str).to_numpy(), - ) - return _compare_numeric(candidate_numeric.to_numpy(dtype=float), reference_numeric.to_numpy(dtype=float)) - - -def _compare_categorical(candidate: np.ndarray, reference: np.ndarray) -> dict[str, Any]: - candidate_series = _normalize_categorical_series(pd.Series(candidate)).dropna() - reference_series = _normalize_categorical_series(pd.Series(reference)).dropna() - candidate_support = set(candidate_series.astype(str)) - reference_support = set(reference_series.astype(str)) - missing = sorted(reference_support - candidate_support) - return { - "type": "categorical", - "support_recall": _safe_ratio(len(candidate_support & reference_support), len(reference_support)), - "support_precision": _safe_ratio(len(candidate_support & reference_support), len(candidate_support)), - "missing_reference_values": missing[:20], - } - - -def _compare_numeric(candidate: np.ndarray, reference: np.ndarray) -> dict[str, Any]: - if len(reference) == 0: - return {"type": "numeric", "positive_share_ratio": 0.0, "row_count_ratio": 0.0} - candidate_positive = candidate > 0 - reference_positive = reference > 0 - return { - "type": "numeric", - "row_count_ratio": _safe_ratio(len(candidate), len(reference)), - "candidate_zero_share": float((candidate == 0).mean()) if len(candidate) else 0.0, - "reference_zero_share": float((reference == 0).mean()), - "candidate_positive_share": float(candidate_positive.mean()) if len(candidate) else 0.0, - "reference_positive_share": float(reference_positive.mean()), - "positive_share_ratio": _safe_ratio( - float(candidate_positive.mean()) if len(candidate) else 0.0, - float(reference_positive.mean()), - ), - } - - -def _boolean_series(values: pd.Series) -> pd.Series: - if values.dtype.kind == "b": - return values.fillna(False) - if values.dtype.kind in {"i", "u", "f"}: - return values.fillna(0).astype(float) > 0 - normalized = 
values.astype(str).str.lower() - return normalized.isin({"1", "true", "t", "yes"}) - - -def _safe_ratio(numerator: int | float, denominator: int | float) -> float: - if not denominator: - return 0.0 - return float(numerator) / float(denominator) - - -def _normalize_focus_variable_specs( - focus_variables: tuple[PreSimParityVariableSpec | str, ...] - | list[PreSimParityVariableSpec | str], -) -> tuple[PreSimParityVariableSpec, ...]: - specs: list[PreSimParityVariableSpec] = [] - seen_labels: set[str] = set() - for variable in focus_variables: - spec = ( - variable - if isinstance(variable, PreSimParityVariableSpec) - else PreSimParityVariableSpec(str(variable), str(variable)) - ) - if spec.label in seen_labels: - continue - seen_labels.add(spec.label) - specs.append(spec) - return tuple(specs) - - -def main() -> None: - import argparse - - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("candidate_dataset") - parser.add_argument("reference_dataset") - parser.add_argument("output_path") - parser.add_argument("--period", type=int, default=2024) - args = parser.parse_args() - - output = write_us_pre_sim_parity_audit( - args.candidate_dataset, - args.reference_dataset, - args.output_path, - period=args.period, - ) - print(output) - - -if __name__ == "__main__": - main() diff --git a/src/microplex_us/pipelines/r2_artifacts.py b/src/microplex_us/pipelines/r2_artifacts.py deleted file mode 100644 index ebd83bb8..00000000 --- a/src/microplex_us/pipelines/r2_artifacts.py +++ /dev/null @@ -1,420 +0,0 @@ -"""Archive Microplex artifact directories to Cloudflare R2.""" - -from __future__ import annotations - -import argparse -import hashlib -import json -import os -import sys -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -R2_ARCHIVE_MANIFEST_FILENAME = "r2_archive_manifest.json" -R2_ARCHIVE_INDEX_FILENAME = "r2_archive_index.jsonl" -DEFAULT_R2_PREFIX = 
"microplex-us/artifacts" -DEFAULT_REGION = "auto" -SUMMARY_FILENAMES = frozenset( - { - "manifest.json", - "summary.md", - "scores.json", - "fit_summary.json", - "target_deltas_top50.json", - "matrix_residual_drilldown_top100.json", - "calibration_summary.json", - "source_spine_composition.json", - "support_audit.json", - "run_manifest.json", - } -) - - -@dataclass(frozen=True) -class R2ArchiveConfig: - """R2 destination and credentials for one archive operation.""" - - bucket: str - endpoint_url: str - prefix: str = DEFAULT_R2_PREFIX - region: str = DEFAULT_REGION - access_key_id: str | None = None - secret_access_key: str | None = None - session_token: str | None = None - - @classmethod - def from_env(cls) -> R2ArchiveConfig: - """Build config from Microplex-specific env vars with R2/AWS fallbacks.""" - bucket = _first_env("MICROPLEX_R2_BUCKET", "R2_BUCKET", "AWS_BUCKET") - if not bucket: - raise ValueError( - "Missing R2 bucket. Set MICROPLEX_R2_BUCKET or pass --bucket." - ) - endpoint_url = _first_env("MICROPLEX_R2_ENDPOINT_URL", "R2_ENDPOINT_URL") - account_id = _first_env("MICROPLEX_R2_ACCOUNT_ID", "CLOUDFLARE_ACCOUNT_ID") - if not endpoint_url: - if not account_id: - raise ValueError( - "Missing R2 endpoint. Set MICROPLEX_R2_ENDPOINT_URL, " - "R2_ENDPOINT_URL, or CLOUDFLARE_ACCOUNT_ID." 
- ) - endpoint_url = f"https://{account_id}.r2.cloudflarestorage.com" - return cls( - bucket=bucket, - endpoint_url=endpoint_url, - prefix=( - _first_env("MICROPLEX_R2_PREFIX", "R2_PREFIX") - or DEFAULT_R2_PREFIX - ), - region=_first_env("MICROPLEX_R2_REGION", "AWS_DEFAULT_REGION") - or DEFAULT_REGION, - access_key_id=_first_env( - "MICROPLEX_R2_ACCESS_KEY_ID", - "R2_ACCESS_KEY_ID", - "AWS_ACCESS_KEY_ID", - ), - secret_access_key=_first_env( - "MICROPLEX_R2_SECRET_ACCESS_KEY", - "R2_SECRET_ACCESS_KEY", - "AWS_SECRET_ACCESS_KEY", - ), - session_token=_first_env( - "MICROPLEX_R2_SESSION_TOKEN", - "R2_SESSION_TOKEN", - "AWS_SESSION_TOKEN", - ), - ) - - -def _first_env(*names: str) -> str | None: - for name in names: - value = os.environ.get(name) - if value: - return value - return None - - -def normalize_r2_prefix(value: str) -> str: - """Normalize an R2 key prefix without changing internal separators.""" - return value.strip("/") - - -def build_r2_object_key(prefix: str, artifact_id: str, relative_path: str) -> str: - """Return a stable R2 object key for one artifact file.""" - parts = [ - normalize_r2_prefix(prefix), - artifact_id.strip("/"), - relative_path.replace(os.sep, "/").strip("/"), - ] - return "/".join(part for part in parts if part) - - -def iter_artifact_files(artifact_dir: str | Path) -> list[Path]: - """List regular artifact files, excluding the local R2 archive sidecar.""" - root = Path(artifact_dir) - return sorted( - path - for path in root.rglob("*") - if path.is_file() and path.name != R2_ARCHIVE_MANIFEST_FILENAME - ) - - -def file_sha256(path: str | Path, *, chunk_size: int = 1024 * 1024) -> str: - """Compute a SHA-256 digest for a local file.""" - digest = hashlib.sha256() - with Path(path).open("rb") as file: - for chunk in iter(lambda: file.read(chunk_size), b""): - digest.update(chunk) - return digest.hexdigest() - - -def build_archive_manifest( - artifact_dir: str | Path, - config: R2ArchiveConfig, - *, - artifact_id: str | None = None, - 
hash_files: bool = True, - status: str = "planned", -) -> dict[str, Any]: - """Build the local manifest describing files and destination object keys.""" - root = Path(artifact_dir).resolve() - if not root.is_dir(): - raise NotADirectoryError(f"Artifact directory not found: {root}") - resolved_artifact_id = artifact_id or root.name - files: list[dict[str, Any]] = [] - total_bytes = 0 - for path in iter_artifact_files(root): - relative_path = path.relative_to(root).as_posix() - size_bytes = path.stat().st_size - total_bytes += size_bytes - entry: dict[str, Any] = { - "path": relative_path, - "size_bytes": size_bytes, - "object_key": build_r2_object_key( - config.prefix, - resolved_artifact_id, - relative_path, - ), - "status": status, - "summary": path.name in SUMMARY_FILENAMES, - } - if hash_files: - entry["sha256"] = file_sha256(path) - files.append(entry) - return { - "schema_version": 1, - "created_at": datetime.now(UTC).isoformat(), - "artifact_id": resolved_artifact_id, - "artifact_dir": str(root), - "r2": { - "bucket": config.bucket, - "endpoint_url": config.endpoint_url, - "prefix": normalize_r2_prefix(config.prefix), - "region": config.region, - "manifest_object_key": build_r2_object_key( - config.prefix, - resolved_artifact_id, - R2_ARCHIVE_MANIFEST_FILENAME, - ), - }, - "summary_files": [ - entry["path"] for entry in files if bool(entry.get("summary")) - ], - "file_count": len(files), - "total_bytes": total_bytes, - "files": files, - } - - -def create_r2_s3_client(config: R2ArchiveConfig) -> Any: - """Create a boto3 S3 client configured for Cloudflare R2.""" - try: - import boto3 - except ImportError as error: # pragma: no cover - exercised by CLI environment. - raise RuntimeError( - "boto3 is required for R2 uploads. Install the optional extra with " - "`uv sync --extra r2` or run through `uv run --extra r2 ...`." 
- ) from error - client_kwargs: dict[str, Any] = { - "service_name": "s3", - "endpoint_url": config.endpoint_url, - "region_name": config.region, - } - if config.access_key_id is not None: - client_kwargs["aws_access_key_id"] = config.access_key_id - if config.secret_access_key is not None: - client_kwargs["aws_secret_access_key"] = config.secret_access_key - if config.session_token is not None: - client_kwargs["aws_session_token"] = config.session_token - return boto3.client(**client_kwargs) - - -def upload_artifact_manifest_to_r2( - artifact_dir: str | Path, - config: R2ArchiveConfig, - *, - artifact_id: str | None = None, - client: Any | None = None, - dry_run: bool = False, - force: bool = False, - hash_files: bool = True, -) -> dict[str, Any]: - """Upload an artifact directory to R2 and write a local upload manifest.""" - root = Path(artifact_dir).resolve() - manifest = build_archive_manifest( - root, - config, - artifact_id=artifact_id, - hash_files=hash_files, - status="dry_run" if dry_run else "pending", - ) - local_manifest_path = root / R2_ARCHIVE_MANIFEST_FILENAME - if dry_run: - _write_json(local_manifest_path, manifest) - return manifest - s3 = client or create_r2_s3_client(config) - for entry in manifest["files"]: - path = root / entry["path"] - object_key = entry["object_key"] - if not force and _object_exists(s3, config.bucket, object_key): - entry["status"] = "already_exists" - continue - s3.upload_file(str(path), config.bucket, object_key) - entry["status"] = "uploaded" - entry["uploaded_at"] = datetime.now(UTC).isoformat() - manifest["completed_at"] = datetime.now(UTC).isoformat() - manifest["status"] = "uploaded" - _write_json(local_manifest_path, manifest) - manifest_key = manifest["r2"]["manifest_object_key"] - s3.upload_file(str(local_manifest_path), config.bucket, manifest_key) - return manifest - - -def append_archive_index_entry( - index_path: str | Path, - manifest: dict[str, Any], - *, - pruned_local: bool = False, -) -> Path: - 
"""Append a compact archive record to a local JSONL index.""" - path = Path(index_path) - path.parent.mkdir(parents=True, exist_ok=True) - entry = { - "recorded_at": datetime.now(UTC).isoformat(), - "artifact_id": manifest["artifact_id"], - "artifact_dir": manifest["artifact_dir"], - "bucket": manifest["r2"]["bucket"], - "prefix": manifest["r2"]["prefix"], - "manifest_object_key": manifest["r2"]["manifest_object_key"], - "file_count": manifest["file_count"], - "total_bytes": manifest["total_bytes"], - "status": manifest.get("status"), - "pruned_local": pruned_local, - } - with path.open("a") as file: - file.write(json.dumps(entry, sort_keys=True) + "\n") - return path - - -def _object_exists(client: Any, bucket: str, key: str) -> bool: - try: - client.head_object(Bucket=bucket, Key=key) - except Exception as error: # noqa: BLE001 - boto3 exposes provider-specific errors. - response = getattr(error, "response", None) - code = None - if isinstance(response, dict): - code = str(response.get("Error", {}).get("Code", "")) - if code in {"404", "NoSuchKey", "NotFound"}: - return False - # Some fakes and S3-compatible clients use a generic missing-object error. 
- if error.__class__.__name__ in {"NoSuchKey", "NotFound"}: - return False - raise - return True - - -def _write_json(path: str | Path, payload: dict[str, Any]) -> None: - resolved = Path(path) - temp_path = resolved.with_suffix(resolved.suffix + ".tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - temp_path.replace(resolved) - - -def _build_config_from_args(args: argparse.Namespace) -> R2ArchiveConfig: - env_config: R2ArchiveConfig | None = None - if args.bucket is None or args.endpoint_url is None: - try: - env_config = R2ArchiveConfig.from_env() - except ValueError: - if args.bucket is None or args.endpoint_url is None: - raise - bucket = args.bucket or (env_config.bucket if env_config is not None else None) - endpoint_url = args.endpoint_url or ( - env_config.endpoint_url if env_config is not None else None - ) - if bucket is None or endpoint_url is None: - raise ValueError("Both bucket and endpoint URL are required.") - return R2ArchiveConfig( - bucket=bucket, - endpoint_url=endpoint_url, - prefix=args.prefix - or (env_config.prefix if env_config is not None else DEFAULT_R2_PREFIX), - region=args.region - or (env_config.region if env_config is not None else DEFAULT_REGION), - access_key_id=( - args.access_key_id - or (env_config.access_key_id if env_config is not None else None) - ), - secret_access_key=( - args.secret_access_key - or (env_config.secret_access_key if env_config is not None else None) - ), - session_token=( - args.session_token - or (env_config.session_token if env_config is not None else None) - ), - ) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Archive a Microplex artifact directory to Cloudflare R2." 
- ) - parser.add_argument("artifact_dir", type=Path) - parser.add_argument("--artifact-id", default=None) - parser.add_argument("--bucket", default=None) - parser.add_argument("--endpoint-url", default=None) - parser.add_argument("--prefix", default=None) - parser.add_argument("--region", default=None) - parser.add_argument("--access-key-id", default=None) - parser.add_argument("--secret-access-key", default=None) - parser.add_argument("--session-token", default=None) - parser.add_argument( - "--dry-run", - action="store_true", - help="Write the local archive manifest without uploading to R2.", - ) - parser.add_argument( - "--force", - action="store_true", - help="Upload files even when an object with the same key already exists.", - ) - parser.add_argument( - "--no-hash", - action="store_true", - help="Skip SHA-256 file hashing when building the archive manifest.", - ) - parser.add_argument( - "--index-path", - type=Path, - default=None, - help=( - "Optional local JSONL archive index. Defaults to " - "/r2_archive_index.jsonl when uploading." - ), - ) - parser.add_argument( - "--mark-pruned-local", - action="store_true", - help="Mark the local archive-index row as pruned after external cleanup.", - ) - args = parser.parse_args(argv) - try: - config = _build_config_from_args(args) - manifest = upload_artifact_manifest_to_r2( - args.artifact_dir, - config, - artifact_id=args.artifact_id, - dry_run=args.dry_run, - force=args.force, - hash_files=not args.no_hash, - ) - except Exception as error: # noqa: BLE001 - CLI should report a concise failure. 
- print(f"R2 archive failed: {error}", file=sys.stderr) - return 1 - if not args.dry_run: - index_path = args.index_path or args.artifact_dir.parent / R2_ARCHIVE_INDEX_FILENAME - append_archive_index_entry( - index_path, - manifest, - pruned_local=args.mark_pruned_local, - ) - uploaded = sum( - 1 - for entry in manifest["files"] - if entry["status"] in {"uploaded", "already_exists"} - ) - mode = "planned" if args.dry_run else "archived" - print( - f"R2 artifact {mode}: {manifest['artifact_id']} " - f"({uploaded}/{manifest['file_count']} files, " - f"{manifest['total_bytes']} bytes)" - ) - print(args.artifact_dir / R2_ARCHIVE_MANIFEST_FILENAME) - return 0 - - -if __name__ == "__main__": # pragma: no cover - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/reduced_benchmark.py b/src/microplex_us/pipelines/reduced_benchmark.py deleted file mode 100644 index 34e01234..00000000 --- a/src/microplex_us/pipelines/reduced_benchmark.py +++ /dev/null @@ -1,1393 +0,0 @@ -"""Reduced benchmark harness for staged Microplex-US debugging.""" - -from __future__ import annotations - -import json -from dataclasses import asdict, dataclass, field, replace -from pathlib import Path -from tempfile import TemporaryDirectory -from time import perf_counter -from typing import Any, Literal - -import numpy as np -import pandas as pd -from microplex.core import EntityType, SourceProvider, SourceQuery -from microplex.targets import ( - FilterOperator, - TargetAggregation, - TargetFilter, - TargetSpec, -) - -from microplex_us.pipelines.local_reweighting import reweight_us_household_targets -from microplex_us.pipelines.performance import ( - USMicroplexPerformanceHarnessConfig, - USMicroplexPerformanceHarnessResult, - run_us_microplex_performance_harness, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - build_policyengine_us_time_period_arrays, - load_policyengine_us_entity_tables, - write_policyengine_us_time_period_dataset, -) - 
-USReducedBenchmarkEntity = Literal[ - "household", "person", "tax_unit", "spm_unit", "family", "marital_unit" -] -USReducedBenchmarkAggregation = Literal[ - "weighted_count", "weighted_sum", "weighted_mean" -] - -DEFAULT_ATOMIC_AGE_BINS: tuple[float, ...] = (0.0, 18.0, 30.0, 45.0, 65.0, 200.0) -DEFAULT_ATOMIC_AGE_LABELS: tuple[str, ...] = ( - "0_to_17", - "18_to_29", - "30_to_44", - "45_to_64", - "65_plus", -) -DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS: tuple[float, ...] = ( - -1_000_000_000.0, - 0.01, - 10_000.0, - 50_000.0, - 1_000_000_000.0, -) -DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS: tuple[str, ...] = ( - "zero_or_less", - "1_to_10k", - "10k_to_50k", - "50k_plus", -) - - -@dataclass(frozen=True) -class USMicroplexReducedDimensionSpec: - """One grouped dimension for a reduced benchmark rung.""" - - variable: str - label: str | None = None - bins: tuple[float, ...] | None = None - bin_labels: tuple[str, ...] | None = None - right: bool = False - include_lowest: bool = True - missing_label: str | None = "__missing__" - zero_pad: int | None = None - - @property - def output_name(self) -> str: - return self.label or self.variable - - -@dataclass(frozen=True) -class USMicroplexReducedMeasureSpec: - """One weighted measure to compare on a reduced rung.""" - - name: str - aggregation: USReducedBenchmarkAggregation = "weighted_count" - variable: str | None = None - - -@dataclass(frozen=True) -class USMicroplexReducedBenchmarkSpec: - """A reduced benchmark rung defined on one entity table and a small target surface.""" - - name: str - entity: USReducedBenchmarkEntity - dimensions: tuple[USMicroplexReducedDimensionSpec, ...] - measures: tuple[USMicroplexReducedMeasureSpec, ...] 
= field( - default_factory=lambda: ( - USMicroplexReducedMeasureSpec(name="weighted_count"), - ) - ) - top_k: int = 10 - - -@dataclass(frozen=True) -class USMicroplexReducedBenchmarkReport: - """Candidate-vs-baseline comparison for one reduced benchmark rung.""" - - spec: USMicroplexReducedBenchmarkSpec - candidate_dataset: str - baseline_dataset: str - period: int - summary: dict[str, Any] - measure_summaries: dict[str, dict[str, Any]] - top_cell_gaps: dict[str, list[dict[str, Any]]] - - def to_dict(self) -> dict[str, Any]: - return { - "spec": _json_compatible_value(asdict(self.spec)), - "candidate_dataset": self.candidate_dataset, - "baseline_dataset": self.baseline_dataset, - "period": self.period, - "summary": _json_compatible_value(self.summary), - "measure_summaries": _json_compatible_value(self.measure_summaries), - "top_cell_gaps": _json_compatible_value(self.top_cell_gaps), - } - - -@dataclass(frozen=True) -class USMicroplexReducedBenchmarkHarnessConfig: - """Config for building one candidate and evaluating reduced benchmark rungs.""" - - performance_config: USMicroplexPerformanceHarnessConfig = field( - default_factory=USMicroplexPerformanceHarnessConfig - ) - benchmark_specs: tuple[USMicroplexReducedBenchmarkSpec, ...] 
= field( - default_factory=lambda: default_us_atomic_rung0_benchmarks() - ) - baseline_dataset: str | Path | None = None - period: int | None = None - output_json_path: str | Path | None = None - output_policyengine_dataset_path: str | Path | None = None - - -@dataclass(frozen=True) -class USMicroplexReducedBenchmarkHarnessResult: - """One reduced benchmark harness run plus the inner build/export result.""" - - config: USMicroplexReducedBenchmarkHarnessConfig - performance_result: USMicroplexPerformanceHarnessResult - benchmark_reports: dict[str, USMicroplexReducedBenchmarkReport] - candidate_dataset_path: str - baseline_dataset_path: str - period: int - stage_timings: dict[str, float] - total_seconds: float - - def to_dict(self) -> dict[str, Any]: - return { - "config": _json_compatible_value(asdict(self.config)), - "performance_result": self.performance_result.to_dict(), - "candidate_dataset_path": self.candidate_dataset_path, - "baseline_dataset_path": self.baseline_dataset_path, - "period": self.period, - "stage_timings": dict(self.stage_timings), - "total_seconds": float(self.total_seconds), - "benchmark_reports": { - name: report.to_dict() for name, report in self.benchmark_reports.items() - }, - } - - def save(self, path: str | Path) -> Path: - destination = Path(path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True)) - return destination - - -@dataclass(frozen=True) -class USMicroplexReducedCalibrationReport: - """Pre/post reduced benchmark comparison around a small household reweight step.""" - - calibration_spec: USMicroplexReducedBenchmarkSpec - evaluation_specs: tuple[USMicroplexReducedBenchmarkSpec, ...] 
- candidate_dataset: str - baseline_dataset: str - period: int - target_count: int - reweighting_summary: dict[str, Any] - pre_reports: dict[str, USMicroplexReducedBenchmarkReport] - post_reports: dict[str, USMicroplexReducedBenchmarkReport] - benchmark_deltas: dict[str, dict[str, Any]] - reweighted_dataset_path: str | None = None - - def to_dict(self) -> dict[str, Any]: - return { - "calibration_spec": _json_compatible_value(asdict(self.calibration_spec)), - "evaluation_specs": _json_compatible_value( - [asdict(spec) for spec in self.evaluation_specs] - ), - "candidate_dataset": self.candidate_dataset, - "baseline_dataset": self.baseline_dataset, - "period": self.period, - "target_count": self.target_count, - "reweighting_summary": _json_compatible_value(self.reweighting_summary), - "benchmark_deltas": _json_compatible_value(self.benchmark_deltas), - "reweighted_dataset_path": self.reweighted_dataset_path, - "pre_reports": { - name: report.to_dict() for name, report in self.pre_reports.items() - }, - "post_reports": { - name: report.to_dict() for name, report in self.post_reports.items() - }, - } - - -@dataclass(frozen=True) -class USMicroplexReducedMultiCalibrationReport: - """Pre/post reduced benchmark comparison around a multi-surface household reweight step.""" - - calibration_specs: tuple[USMicroplexReducedBenchmarkSpec, ...] - evaluation_specs: tuple[USMicroplexReducedBenchmarkSpec, ...] 
- candidate_dataset: str - baseline_dataset: str - period: int - target_count: int - calibration_target_counts: dict[str, int] - reweighting_summary: dict[str, Any] - pre_reports: dict[str, USMicroplexReducedBenchmarkReport] - post_reports: dict[str, USMicroplexReducedBenchmarkReport] - benchmark_deltas: dict[str, dict[str, Any]] - reweighted_dataset_path: str | None = None - - def to_dict(self) -> dict[str, Any]: - return { - "calibration_specs": _json_compatible_value( - [asdict(spec) for spec in self.calibration_specs] - ), - "evaluation_specs": _json_compatible_value( - [asdict(spec) for spec in self.evaluation_specs] - ), - "candidate_dataset": self.candidate_dataset, - "baseline_dataset": self.baseline_dataset, - "period": self.period, - "target_count": self.target_count, - "calibration_target_counts": _json_compatible_value( - self.calibration_target_counts - ), - "reweighting_summary": _json_compatible_value(self.reweighting_summary), - "benchmark_deltas": _json_compatible_value(self.benchmark_deltas), - "reweighted_dataset_path": self.reweighted_dataset_path, - "pre_reports": { - name: report.to_dict() for name, report in self.pre_reports.items() - }, - "post_reports": { - name: report.to_dict() for name, report in self.post_reports.items() - }, - } - - -def default_us_atomic_rung0_benchmarks() -> tuple[USMicroplexReducedBenchmarkSpec, ...]: - """A minimal first rung: counts by state, then by state x age.""" - - return ( - USMicroplexReducedBenchmarkSpec( - name="household_count_by_state", - entity="household", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="state_fips", - zero_pad=2, - ), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_household_count"), - ), - ), - USMicroplexReducedBenchmarkSpec( - name="person_count_by_state_age", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="state_fips", - zero_pad=2, - ), - USMicroplexReducedDimensionSpec( - variable="age", - label="age_bucket", - 
bins=DEFAULT_ATOMIC_AGE_BINS, - bin_labels=DEFAULT_ATOMIC_AGE_LABELS, - ), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_person_count"), - ), - ), - ) - - -def default_us_atomic_rung1_benchmarks() -> tuple[USMicroplexReducedBenchmarkSpec, ...]: - """A slightly richer CPS-like rung on person demographics and earnings.""" - - return ( - USMicroplexReducedBenchmarkSpec( - name="person_count_by_state_sex", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="state_fips", - zero_pad=2, - ), - USMicroplexReducedDimensionSpec( - variable="is_female", - ), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_person_count"), - ), - ), - USMicroplexReducedBenchmarkSpec( - name="employment_income_sum_by_state", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="state_fips", - zero_pad=2, - ), - ), - measures=( - USMicroplexReducedMeasureSpec( - name="weighted_employment_income_sum", - aggregation="weighted_sum", - variable="employment_income_before_lsr", - ), - ), - ), - USMicroplexReducedBenchmarkSpec( - name="employment_income_mean_by_state_sex", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="state_fips", - zero_pad=2, - ), - USMicroplexReducedDimensionSpec( - variable="is_female", - ), - ), - measures=( - USMicroplexReducedMeasureSpec( - name="weighted_employment_income_mean", - aggregation="weighted_mean", - variable="employment_income_before_lsr", - ), - ), - ), - ) - - -def default_us_atomic_rung2_calibration() -> tuple[ - USMicroplexReducedBenchmarkSpec, - tuple[USMicroplexReducedBenchmarkSpec, ...], -]: - """Default reduced calibration comparison: fit state household counts, then evaluate rung 0+1.""" - - rung0 = default_us_atomic_rung0_benchmarks() - return ( - rung0[0], - rung0 + default_us_atomic_rung1_benchmarks(), - ) - - -def default_us_atomic_rung3_calibration() -> tuple[ - USMicroplexReducedBenchmarkSpec, - 
tuple[USMicroplexReducedBenchmarkSpec, ...], -]: - """Reduced calibration comparison: fit state-by-age person counts, then evaluate rung 0+1.""" - - rung0 = default_us_atomic_rung0_benchmarks() - return ( - rung0[1], - rung0 + default_us_atomic_rung1_benchmarks(), - ) - - -def default_us_atomic_rung4_calibration() -> tuple[ - USMicroplexReducedBenchmarkSpec, - tuple[USMicroplexReducedBenchmarkSpec, ...], -]: - """Reduced calibration comparison: fit age-by-income person counts, then evaluate rung 0+1.""" - - rung0 = default_us_atomic_rung0_benchmarks() - calibration_spec = USMicroplexReducedBenchmarkSpec( - name="person_count_by_age_employment_income_bucket", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec( - variable="age", - label="age_bucket", - bins=DEFAULT_ATOMIC_AGE_BINS, - bin_labels=DEFAULT_ATOMIC_AGE_LABELS, - ), - USMicroplexReducedDimensionSpec( - variable="employment_income_before_lsr", - label="employment_income_bucket", - bins=DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS, - bin_labels=DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS, - ), - ), - measures=(USMicroplexReducedMeasureSpec(name="weighted_person_count"),), - ) - return ( - calibration_spec, - rung0 + default_us_atomic_rung1_benchmarks(), - ) - - -def default_us_atomic_rung5_calibration() -> tuple[ - tuple[USMicroplexReducedBenchmarkSpec, ...], - tuple[USMicroplexReducedBenchmarkSpec, ...], -]: - """Reduced calibration comparison: fit state-age and age-income counts jointly.""" - - rung0 = default_us_atomic_rung0_benchmarks() - rung1 = default_us_atomic_rung1_benchmarks() - rung3_spec, _ = default_us_atomic_rung3_calibration() - rung4_spec, _ = default_us_atomic_rung4_calibration() - return ( - (rung3_spec, rung4_spec), - rung0 + rung1 + (rung4_spec,), - ) - - -def evaluate_us_reduced_benchmark( - candidate_dataset: str | Path, - baseline_dataset: str | Path, - spec: USMicroplexReducedBenchmarkSpec, - *, - period: int = 2024, -) -> USMicroplexReducedBenchmarkReport: - """Compare one 
candidate H5 to one baseline H5 on a small grouped target surface.""" - - _validate_reduced_benchmark_spec(spec) - requested_variables = _required_reduced_benchmark_variables(spec) - candidate_bundle = load_policyengine_us_entity_tables( - candidate_dataset, - period=period, - variables=tuple(sorted(requested_variables)), - ) - baseline_bundle = load_policyengine_us_entity_tables( - baseline_dataset, - period=period, - variables=tuple(sorted(requested_variables)), - ) - return _evaluate_us_reduced_benchmark_bundles( - candidate_bundle, - baseline_bundle, - spec, - candidate_dataset=candidate_dataset, - baseline_dataset=baseline_dataset, - period=period, - ) - - -def _evaluate_us_reduced_benchmark_bundles( - candidate_bundle: PolicyEngineUSEntityTableBundle, - baseline_bundle: PolicyEngineUSEntityTableBundle, - spec: USMicroplexReducedBenchmarkSpec, - *, - candidate_dataset: str | Path, - baseline_dataset: str | Path, - period: int, -) -> USMicroplexReducedBenchmarkReport: - """Compare one candidate bundle to one baseline bundle on a small grouped target surface.""" - - _validate_reduced_benchmark_spec(spec) - candidate_grouped, candidate_row_count = _group_reduced_benchmark_bundle( - candidate_bundle, - spec, - ) - baseline_grouped, baseline_row_count = _group_reduced_benchmark_bundle( - baseline_bundle, - spec, - ) - - dimension_names = [dimension.output_name for dimension in spec.dimensions] - merged = candidate_grouped.merge( - baseline_grouped, - on=dimension_names, - how="outer", - suffixes=("_candidate", "_baseline"), - ) - # Fill NaN with 0.0 only for count/sum measures (missing cell = zero). - # Leave weighted_mean NaN so the error metric correctly reflects missing cells. 
- fill_zero_columns = [] - for measure in spec.measures: - if measure.aggregation in {"weighted_count", "weighted_sum"}: - fill_zero_columns.append(f"{measure.name}_candidate") - fill_zero_columns.append(f"{measure.name}_baseline") - if fill_zero_columns: - merged[fill_zero_columns] = merged[fill_zero_columns].fillna(0.0) - - measure_summaries: dict[str, dict[str, Any]] = {} - top_cell_gaps: dict[str, list[dict[str, Any]]] = {} - for measure in spec.measures: - candidate_column = f"{measure.name}_candidate" - baseline_column = f"{measure.name}_baseline" - candidate_values = merged[candidate_column].to_numpy(dtype=float) - baseline_values = merged[baseline_column].to_numpy(dtype=float) - delta = candidate_values - baseline_values - abs_relative_error = np.abs(delta) / (np.abs(baseline_values) + 1.0) - baseline_nonzero = np.abs(baseline_values) > 1e-9 - candidate_nonzero = np.abs(candidate_values) > 1e-9 - shared_nonzero = baseline_nonzero & candidate_nonzero - measure_summaries[measure.name] = { - "candidate_total": float(candidate_values.sum()), - "baseline_total": float(baseline_values.sum()), - "total_delta": float(delta.sum()), - "n_cells": int(len(candidate_values)), - "baseline_nonzero_cell_count": int(baseline_nonzero.sum()), - "candidate_nonzero_cell_count": int(candidate_nonzero.sum()), - "shared_nonzero_cell_count": int(shared_nonzero.sum()), - "support_recall": _safe_ratio( - int(shared_nonzero.sum()), - int(baseline_nonzero.sum()), - ), - "mean_abs_relative_error": float(np.nanmean(abs_relative_error)), - "max_abs_relative_error": float(np.nanmax(abs_relative_error)), - } - sort_frame = merged[dimension_names + [candidate_column, baseline_column]].copy() - sort_frame["delta"] = delta - sort_frame["abs_relative_error"] = abs_relative_error - sort_frame["abs_delta"] = np.abs(delta) - sort_frame = sort_frame.sort_values( - by=["abs_relative_error", "abs_delta"], - ascending=False, - kind="mergesort", - ) - rows: list[dict[str, Any]] = [] - for _, row in 
sort_frame.head(spec.top_k).iterrows(): - payload = { - dimension_name: _json_compatible_value(row[dimension_name]) - for dimension_name in dimension_names - } - payload.update( - { - "candidate_value": float(row[candidate_column]), - "baseline_value": float(row[baseline_column]), - "delta": float(row["delta"]), - "abs_relative_error": float(row["abs_relative_error"]), - } - ) - rows.append(payload) - top_cell_gaps[measure.name] = rows - - summary = { - "entity": spec.entity, - "n_candidate_rows": candidate_row_count, - "n_baseline_rows": baseline_row_count, - "n_dimensions": len(spec.dimensions), - "n_measures": len(spec.measures), - "n_cells": int(len(merged)), - "mean_measure_mare": float( - np.nanmean( - [ - measure_summaries[measure.name]["mean_abs_relative_error"] - for measure in spec.measures - ] - ) - ), - } - return USMicroplexReducedBenchmarkReport( - spec=spec, - candidate_dataset=str(Path(candidate_dataset).expanduser().resolve()), - baseline_dataset=str(Path(baseline_dataset).expanduser().resolve()), - period=int(period), - summary=summary, - measure_summaries=measure_summaries, - top_cell_gaps=top_cell_gaps, - ) - - -def run_us_microplex_reduced_benchmark_harness( - providers: list[SourceProvider], - *, - config: USMicroplexReducedBenchmarkHarnessConfig, - queries: dict[str, SourceQuery] | None = None, - **performance_kwargs: Any, -) -> USMicroplexReducedBenchmarkHarnessResult: - """Build one candidate dataset, then score reduced benchmark rungs against a baseline H5.""" - - if not providers: - raise ValueError("US reduced benchmark harness requires at least one provider") - if not config.benchmark_specs: - raise ValueError("US reduced benchmark harness requires at least one benchmark spec") - - baseline_dataset = ( - Path(config.baseline_dataset).expanduser().resolve() - if config.baseline_dataset is not None - else ( - Path(config.performance_config.baseline_dataset).expanduser().resolve() - if config.performance_config.baseline_dataset is not None 
- else None - ) - ) - if baseline_dataset is None: - raise ValueError( - "US reduced benchmark harness requires baseline_dataset either directly or via performance_config" - ) - period = int(config.period or config.performance_config.target_period) - - stage_timings: dict[str, float] = {} - total_start = perf_counter() - with TemporaryDirectory(prefix="microplex-us-reduced-benchmark-") as temp_dir: - candidate_output = ( - Path(config.output_policyengine_dataset_path).expanduser().resolve() - if config.output_policyengine_dataset_path is not None - else (Path(temp_dir) / "candidate_policyengine_us.h5") - ) - inner_config = replace( - config.performance_config, - evaluate_parity=False, - evaluate_pe_native_loss=False, - evaluate_matched_pe_native_loss=False, - reweight_matched_pe_native_loss=False, - optimize_pe_native_loss=False, - output_json_path=None, - output_pe_native_target_delta_path=None, - output_pe_native_support_audit_path=None, - output_matched_baseline_dataset_path=None, - output_policyengine_dataset_path=str(candidate_output), - ) - build_start = perf_counter() - performance_result = run_us_microplex_performance_harness( - providers, - config=inner_config, - queries=queries, - **performance_kwargs, - ) - stage_timings["build_candidate_dataset"] = perf_counter() - build_start - candidate_dataset_path = ( - performance_result.policyengine_dataset_path or str(candidate_output.resolve()) - ) - - benchmark_reports: dict[str, USMicroplexReducedBenchmarkReport] = {} - reduced_start = perf_counter() - for spec in config.benchmark_specs: - benchmark_reports[spec.name] = evaluate_us_reduced_benchmark( - candidate_dataset_path, - baseline_dataset, - spec, - period=period, - ) - stage_timings["evaluate_reduced_benchmarks"] = perf_counter() - reduced_start - - result = USMicroplexReducedBenchmarkHarnessResult( - config=config, - performance_result=performance_result, - benchmark_reports=benchmark_reports, - candidate_dataset_path=candidate_dataset_path, - 
baseline_dataset_path=str(baseline_dataset), - period=period, - stage_timings=stage_timings, - total_seconds=perf_counter() - total_start, - ) - if config.output_json_path is not None: - result.save(config.output_json_path) - return result - - -def reduced_benchmark_to_calibration_targets( - spec: USMicroplexReducedBenchmarkSpec, - baseline_dataset: str | Path, - *, - period: int = 2024, -) -> list[TargetSpec]: - """Convert a reduced weighted-count spec into household reweighting targets from baseline cells.""" - - _validate_reduced_benchmark_spec(spec) - unsupported = [ - measure.name - for measure in spec.measures - if measure.aggregation != "weighted_count" - ] - if unsupported: - raise ValueError( - "Reduced calibration currently supports weighted_count measures only: " - + ", ".join(unsupported) - ) - - requested_variables = _required_reduced_benchmark_variables(spec) - baseline_bundle = load_policyengine_us_entity_tables( - baseline_dataset, - period=period, - variables=tuple(sorted(requested_variables)), - ) - grouped, _ = _group_reduced_benchmark_bundle(baseline_bundle, spec) - entity = _entity_type_for_reduced_entity(spec.entity) - targets: list[TargetSpec] = [] - for _, row in grouped.iterrows(): - filters: list[TargetFilter] = [] - name_parts: list[str] = [] - for dimension in spec.dimensions: - value = row[dimension.output_name] - filters.extend(_dimension_filters_for_value(dimension, value)) - name_parts.append(f"{dimension.output_name}={_json_compatible_value(value)}") - measure = spec.measures[0] - target_value = float(row[measure.name]) - targets.append( - TargetSpec( - name=f"{spec.name}::{'|'.join(name_parts)}", - entity=entity, - value=target_value, - period=period, - aggregation=TargetAggregation.COUNT, - filters=tuple(filters), - ) - ) - return targets - - -def reduced_benchmark_specs_to_calibration_targets( - specs: tuple[USMicroplexReducedBenchmarkSpec, ...], - baseline_dataset: str | Path, - *, - period: int = 2024, -) -> 
tuple[list[TargetSpec], dict[str, int]]: - """Convert multiple reduced weighted-count specs into one target list.""" - - if not specs: - raise ValueError("Reduced calibration requires at least one calibration spec") - - targets: list[TargetSpec] = [] - target_counts: dict[str, int] = {} - for spec in specs: - spec_targets = reduced_benchmark_to_calibration_targets( - spec, - baseline_dataset, - period=period, - ) - targets.extend(spec_targets) - target_counts[spec.name] = len(spec_targets) - return targets, target_counts - - -def calibrate_and_evaluate_us_reduced_benchmarks( - candidate_dataset: str | Path, - baseline_dataset: str | Path, - calibration_spec: USMicroplexReducedBenchmarkSpec, - evaluation_specs: tuple[USMicroplexReducedBenchmarkSpec, ...], - *, - period: int = 2024, - max_iter: int = 8, - tol: float = 1e-4, - factor_bounds: tuple[float, float] = (0.5, 2.0), - output_reweighted_dataset_path: str | Path | None = None, -) -> USMicroplexReducedCalibrationReport: - """Reweight candidate household weights to a reduced surface, then compare pre/post rung errors.""" - - if not evaluation_specs: - raise ValueError("Reduced calibration requires at least one evaluation spec") - - candidate_path = Path(candidate_dataset).expanduser().resolve() - baseline_path = Path(baseline_dataset).expanduser().resolve() - calibration_targets = reduced_benchmark_to_calibration_targets( - calibration_spec, - baseline_path, - period=period, - ) - requested_variables = set().union( - _required_reduced_benchmark_variables(calibration_spec), - *(_required_reduced_benchmark_variables(spec) for spec in evaluation_specs), - ) - candidate_bundle = load_policyengine_us_entity_tables( - candidate_path, - period=period, - variables=tuple(sorted(requested_variables)), - ) - baseline_bundle = load_policyengine_us_entity_tables( - baseline_path, - period=period, - variables=tuple(sorted(requested_variables)), - ) - - pre_reports = { - spec.name: _evaluate_us_reduced_benchmark_bundles( - 
candidate_bundle, - baseline_bundle, - spec, - candidate_dataset=candidate_path, - baseline_dataset=baseline_path, - period=period, - ) - for spec in evaluation_specs - } - - calibration_bundle = _bundle_with_materialized_calibration_variables( - candidate_bundle, - calibration_spec, - ) - reweight_result = reweight_us_household_targets( - calibration_bundle, - targets=calibration_targets, - max_iter=max_iter, - tol=tol, - factor_bounds=factor_bounds, - ) - reweighted_dataset_path: str | None = None - if output_reweighted_dataset_path is not None: - destination = Path(output_reweighted_dataset_path).expanduser().resolve() - destination.parent.mkdir(parents=True, exist_ok=True) - write_policyengine_us_time_period_dataset( - build_policyengine_us_time_period_arrays( - reweight_result.tables, - period=period, - **_infer_reduced_export_maps(reweight_result.tables), - ), - destination, - ) - reweighted_dataset_path = str(destination) - - post_candidate_label = reweighted_dataset_path or f"{candidate_path}#reweighted" - post_reports = { - spec.name: _evaluate_us_reduced_benchmark_bundles( - reweight_result.tables, - baseline_bundle, - spec, - candidate_dataset=post_candidate_label, - baseline_dataset=baseline_path, - period=period, - ) - for spec in evaluation_specs - } - - benchmark_deltas = { - spec.name: { - "pre_mean_measure_mare": float(pre_reports[spec.name].summary["mean_measure_mare"]), - "post_mean_measure_mare": float( - post_reports[spec.name].summary["mean_measure_mare"] - ), - "delta_mean_measure_mare": float( - post_reports[spec.name].summary["mean_measure_mare"] - - pre_reports[spec.name].summary["mean_measure_mare"] - ), - } - for spec in evaluation_specs - } - reweighting_summary = { - "target_count": reweight_result.diagnostics.target_count, - "constraint_count": reweight_result.diagnostics.constraint_count, - "iterations": reweight_result.diagnostics.iterations, - "converged": reweight_result.diagnostics.converged, - "mean_abs_relative_error": 
reweight_result.diagnostics.mean_abs_relative_error, - "max_abs_relative_error": reweight_result.diagnostics.max_abs_relative_error, - "skipped_targets": list(reweight_result.compilation.skipped_targets), - } - return USMicroplexReducedCalibrationReport( - calibration_spec=calibration_spec, - evaluation_specs=evaluation_specs, - candidate_dataset=str(candidate_path), - baseline_dataset=str(baseline_path), - period=int(period), - target_count=len(calibration_targets), - reweighting_summary=reweighting_summary, - pre_reports=pre_reports, - post_reports=post_reports, - benchmark_deltas=benchmark_deltas, - reweighted_dataset_path=reweighted_dataset_path, - ) - - -def calibrate_and_evaluate_us_reduced_benchmark_specs( - candidate_dataset: str | Path, - baseline_dataset: str | Path, - calibration_specs: tuple[USMicroplexReducedBenchmarkSpec, ...], - evaluation_specs: tuple[USMicroplexReducedBenchmarkSpec, ...], - *, - period: int = 2024, - max_iter: int = 8, - tol: float = 1e-4, - factor_bounds: tuple[float, float] = (0.5, 2.0), - output_reweighted_dataset_path: str | Path | None = None, -) -> USMicroplexReducedMultiCalibrationReport: - """Reweight candidate household weights to multiple reduced surfaces, then compare pre/post rung errors.""" - - if not calibration_specs: - raise ValueError("Reduced multi-calibration requires at least one calibration spec") - if not evaluation_specs: - raise ValueError("Reduced multi-calibration requires at least one evaluation spec") - - candidate_path = Path(candidate_dataset).expanduser().resolve() - baseline_path = Path(baseline_dataset).expanduser().resolve() - calibration_targets, target_counts = reduced_benchmark_specs_to_calibration_targets( - calibration_specs, - baseline_path, - period=period, - ) - requested_variables = set().union( - *(_required_reduced_benchmark_variables(spec) for spec in calibration_specs), - *(_required_reduced_benchmark_variables(spec) for spec in evaluation_specs), - ) - candidate_bundle = 
load_policyengine_us_entity_tables( - candidate_path, - period=period, - variables=tuple(sorted(requested_variables)), - ) - baseline_bundle = load_policyengine_us_entity_tables( - baseline_path, - period=period, - variables=tuple(sorted(requested_variables)), - ) - - pre_reports = { - spec.name: _evaluate_us_reduced_benchmark_bundles( - candidate_bundle, - baseline_bundle, - spec, - candidate_dataset=candidate_path, - baseline_dataset=baseline_path, - period=period, - ) - for spec in evaluation_specs - } - - calibration_bundle = _bundle_with_materialized_calibration_specs( - candidate_bundle, - calibration_specs, - ) - reweight_result = reweight_us_household_targets( - calibration_bundle, - targets=calibration_targets, - max_iter=max_iter, - tol=tol, - factor_bounds=factor_bounds, - ) - reweighted_dataset_path: str | None = None - if output_reweighted_dataset_path is not None: - destination = Path(output_reweighted_dataset_path).expanduser().resolve() - destination.parent.mkdir(parents=True, exist_ok=True) - write_policyengine_us_time_period_dataset( - build_policyengine_us_time_period_arrays( - reweight_result.tables, - period=period, - **_infer_reduced_export_maps(reweight_result.tables), - ), - destination, - ) - reweighted_dataset_path = str(destination) - - post_candidate_label = reweighted_dataset_path or f"{candidate_path}#reweighted" - post_reports = { - spec.name: _evaluate_us_reduced_benchmark_bundles( - reweight_result.tables, - baseline_bundle, - spec, - candidate_dataset=post_candidate_label, - baseline_dataset=baseline_path, - period=period, - ) - for spec in evaluation_specs - } - - benchmark_deltas = { - spec.name: { - "pre_mean_measure_mare": float(pre_reports[spec.name].summary["mean_measure_mare"]), - "post_mean_measure_mare": float( - post_reports[spec.name].summary["mean_measure_mare"] - ), - "delta_mean_measure_mare": float( - post_reports[spec.name].summary["mean_measure_mare"] - - pre_reports[spec.name].summary["mean_measure_mare"] - ), - } 
- for spec in evaluation_specs - } - reweighting_summary = { - "target_count": reweight_result.diagnostics.target_count, - "constraint_count": reweight_result.diagnostics.constraint_count, - "iterations": reweight_result.diagnostics.iterations, - "converged": reweight_result.diagnostics.converged, - "mean_abs_relative_error": reweight_result.diagnostics.mean_abs_relative_error, - "max_abs_relative_error": reweight_result.diagnostics.max_abs_relative_error, - "skipped_targets": list(reweight_result.compilation.skipped_targets), - } - return USMicroplexReducedMultiCalibrationReport( - calibration_specs=calibration_specs, - evaluation_specs=evaluation_specs, - candidate_dataset=str(candidate_path), - baseline_dataset=str(baseline_path), - period=int(period), - target_count=len(calibration_targets), - calibration_target_counts=target_counts, - reweighting_summary=reweighting_summary, - pre_reports=pre_reports, - post_reports=post_reports, - benchmark_deltas=benchmark_deltas, - reweighted_dataset_path=reweighted_dataset_path, - ) - - -def _validate_reduced_benchmark_spec(spec: USMicroplexReducedBenchmarkSpec) -> None: - if spec.top_k <= 0: - raise ValueError("Reduced benchmark top_k must be positive") - if not spec.measures: - raise ValueError("Reduced benchmark spec requires at least one measure") - for measure in spec.measures: - if measure.aggregation in {"weighted_sum", "weighted_mean"} and not measure.variable: - raise ValueError( - f"Reduced benchmark measure '{measure.name}' requires variable for {measure.aggregation}" - ) - seen_output_names: set[str] = set() - for dimension in spec.dimensions: - output_name = dimension.output_name - if output_name in seen_output_names: - raise ValueError( - f"Reduced benchmark has duplicate dimension output name '{output_name}'" - ) - seen_output_names.add(output_name) - if dimension.bins is not None and len(dimension.bins) < 2: - raise ValueError( - f"Reduced benchmark dimension '{dimension.variable}' requires at least two bin 
edges" - ) - if ( - dimension.bins is not None - and dimension.bin_labels is not None - and len(dimension.bin_labels) != len(dimension.bins) - 1 - ): - raise ValueError( - f"Reduced benchmark dimension '{dimension.variable}' has mismatched bin labels" - ) - - -def _required_reduced_benchmark_variables( - spec: USMicroplexReducedBenchmarkSpec, -) -> set[str]: - variables = {dimension.variable for dimension in spec.dimensions} - variables.update( - measure.variable for measure in spec.measures if measure.variable is not None - ) - return variables - - -def _group_reduced_benchmark_bundle( - bundle: PolicyEngineUSEntityTableBundle, - spec: USMicroplexReducedBenchmarkSpec, -) -> tuple[pd.DataFrame, int]: - required_variables = _required_reduced_benchmark_variables(spec) - table, weights = _materialize_entity_frame( - bundle, - spec.entity, - required_variables, - ) - frame = pd.DataFrame({"__weight__": weights}) - for dimension in spec.dimensions: - if dimension.variable not in table.columns: - raise KeyError( - f"Reduced benchmark dimension '{dimension.variable}' is missing from {spec.entity} table" - ) - frame[dimension.output_name] = _materialize_dimension( - table[dimension.variable], - dimension, - ) - for measure in spec.measures: - if measure.variable is None: - continue - if measure.variable not in table.columns: - raise KeyError( - f"Reduced benchmark measure variable '{measure.variable}' is missing from {spec.entity} table" - ) - frame[f"__value__{measure.name}"] = pd.to_numeric( - table[measure.variable], - errors="coerce", - ).fillna(0.0) - - group_columns = [dimension.output_name for dimension in spec.dimensions] - if not group_columns: - frame["__all__"] = "all" - group_columns = ["__all__"] - grouped = ( - frame.groupby(group_columns, dropna=False, observed=True)["__weight__"] - .sum() - .reset_index(name="__group_weight__") - ) - result = grouped.copy() - for measure in spec.measures: - if measure.aggregation == "weighted_count": - result[measure.name] 
= grouped["__group_weight__"] - continue - weighted_sum = ( - frame.assign( - __weighted_value__=frame["__weight__"] * frame[f"__value__{measure.name}"] - ) - .groupby(group_columns, dropna=False, observed=True)["__weighted_value__"] - .sum() - .reset_index(name="__weighted_sum__") - ) - result = result.merge(weighted_sum, on=group_columns, how="left") - if measure.aggregation == "weighted_sum": - result[measure.name] = result["__weighted_sum__"].fillna(0.0) - else: - denominator = result["__group_weight__"].to_numpy(dtype=float) - numerator = result["__weighted_sum__"].fillna(0.0).to_numpy(dtype=float) - result[measure.name] = np.where( - np.abs(denominator) > 1e-12, - numerator / denominator, - 0.0, - ) - result = result.drop(columns=["__weighted_sum__"]) - if "__all__" in result.columns: - result = result.drop(columns=["__all__"]) - return result, int(len(table)) - - -def _materialize_entity_frame( - bundle: PolicyEngineUSEntityTableBundle, - entity: USReducedBenchmarkEntity, - variables: set[str], -) -> tuple[pd.DataFrame, pd.Series]: - table, weights = _resolve_entity_table_and_weights(bundle, entity) - table = table.copy() - missing = sorted(variable for variable in variables if variable not in table.columns) - if not missing: - return table, weights - - household_lookup = bundle.households.set_index("household_id") - if "household_id" in table.columns: - for variable in missing: - if variable in household_lookup.columns: - table[variable] = table["household_id"].map(household_lookup[variable]) - return table, weights - - -def _bundle_with_materialized_calibration_variables( - bundle: PolicyEngineUSEntityTableBundle, - spec: USMicroplexReducedBenchmarkSpec, -) -> PolicyEngineUSEntityTableBundle: - required_variables = _required_reduced_benchmark_variables(spec) - table, _ = _materialize_entity_frame(bundle, spec.entity, required_variables) - if spec.entity == "household": - return replace(bundle, households=table) - if spec.entity == "person": - return 
replace(bundle, persons=table) - if spec.entity == "tax_unit": - return replace(bundle, tax_units=table) - if spec.entity == "spm_unit": - return replace(bundle, spm_units=table) - if spec.entity == "family": - return replace(bundle, families=table) - if spec.entity == "marital_unit": - return replace(bundle, marital_units=table) - return bundle - - -def _bundle_with_materialized_calibration_specs( - bundle: PolicyEngineUSEntityTableBundle, - specs: tuple[USMicroplexReducedBenchmarkSpec, ...], -) -> PolicyEngineUSEntityTableBundle: - materialized_bundle = bundle - for spec in specs: - materialized_bundle = _bundle_with_materialized_calibration_variables( - materialized_bundle, - spec, - ) - return materialized_bundle - - -def _entity_type_for_reduced_entity(entity: USReducedBenchmarkEntity) -> EntityType: - mapping: dict[USReducedBenchmarkEntity, EntityType] = { - "household": EntityType.HOUSEHOLD, - "person": EntityType.PERSON, - "tax_unit": EntityType.TAX_UNIT, - "spm_unit": EntityType.SPM_UNIT, - "family": EntityType.FAMILY, - } - if entity == "marital_unit": - raise ValueError( - "Reduced calibration targets do not support marital_unit " - "(no corresponding EntityType in microplex core)" - ) - return mapping[entity] - - -def _dimension_filters_for_value( - dimension: USMicroplexReducedDimensionSpec, - value: Any, -) -> tuple[TargetFilter, ...]: - if dimension.bins is None: - filter_value = value - if dimension.zero_pad is not None: - numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] - if pd.notna(numeric): - filter_value = int(round(float(numeric))) - return ( - TargetFilter( - feature=dimension.variable, - operator=FilterOperator.EQ, - value=filter_value, - ), - ) - - if dimension.bin_labels is None: - raise ValueError( - f"Reduced calibration requires explicit bin labels for '{dimension.variable}'" - ) - if value not in dimension.bin_labels: - raise ValueError( - f"Reduced calibration cannot map bucket '{value}' for '{dimension.variable}'" 
- ) - index = dimension.bin_labels.index(value) - lower = dimension.bins[index] - upper = dimension.bins[index + 1] - filters = [TargetFilter(dimension.variable, FilterOperator.GTE, lower)] - if dimension.right: - filters.append(TargetFilter(dimension.variable, FilterOperator.LTE, upper)) - else: - filters.append(TargetFilter(dimension.variable, FilterOperator.LT, upper)) - return tuple(filters) - - -def _infer_reduced_export_maps( - tables: PolicyEngineUSEntityTableBundle, -) -> dict[str, dict[str, str] | None]: - return { - "household_variable_map": _infer_export_map( - tables.households, - excluded_columns={"household_id", "household_weight", "weight"}, - ), - "person_variable_map": _infer_export_map( - tables.persons, - excluded_columns={ - "person_id", - "household_id", - "weight", - "tax_unit_id", - "spm_unit_id", - "family_id", - "marital_unit_id", - }, - ), - "tax_unit_variable_map": _infer_export_map( - tables.tax_units, - excluded_columns={"tax_unit_id", "household_id", "household_weight", "weight"}, - ), - "spm_unit_variable_map": _infer_export_map( - tables.spm_units, - excluded_columns={"spm_unit_id", "household_id", "household_weight", "weight"}, - ), - "family_variable_map": _infer_export_map( - tables.families, - excluded_columns={"family_id", "household_id", "household_weight", "weight"}, - ), - "marital_unit_variable_map": _infer_export_map( - tables.marital_units, - excluded_columns={ - "marital_unit_id", - "household_id", - "household_weight", - "weight", - }, - ), - } - - -def _infer_export_map( - table: pd.DataFrame | None, - *, - excluded_columns: set[str], -) -> dict[str, str] | None: - if table is None: - return None - return { - column: column - for column in table.columns - if column not in excluded_columns - } or None - - -def _resolve_entity_table_and_weights( - bundle: PolicyEngineUSEntityTableBundle, - entity: USReducedBenchmarkEntity, -) -> tuple[pd.DataFrame, pd.Series]: - household_weights = 
bundle.households.set_index("household_id")["household_weight"] - if entity == "household": - table = bundle.households.copy() - return table, pd.to_numeric(table["household_weight"], errors="coerce").fillna(0.0) - - table_map: dict[USReducedBenchmarkEntity, pd.DataFrame | None] = { - "person": bundle.persons, - "tax_unit": bundle.tax_units, - "spm_unit": bundle.spm_units, - "family": bundle.families, - "marital_unit": bundle.marital_units, - } - table = table_map[entity] - if table is None: - raise ValueError(f"Reduced benchmark entity '{entity}' is unavailable") - table = table.copy() - if entity == "person" and "weight" in table.columns: - weights = pd.to_numeric(table["weight"], errors="coerce").fillna(0.0) - else: - if "household_id" not in table.columns: - raise ValueError( - f"Reduced benchmark entity '{entity}' requires household_id to infer weights" - ) - weights = pd.to_numeric( - table["household_id"].map(household_weights), - errors="coerce", - ).fillna(0.0) - return table, weights - - -def _materialize_dimension( - values: pd.Series, - spec: USMicroplexReducedDimensionSpec, -) -> pd.Series: - series = values.copy() - if spec.bins is not None: - series = pd.cut( - pd.to_numeric(series, errors="coerce"), - bins=spec.bins, - labels=spec.bin_labels, - right=spec.right, - include_lowest=spec.include_lowest, - ) - elif spec.zero_pad is not None: - numeric = pd.to_numeric(series, errors="coerce") - padded = pd.Series(pd.NA, index=series.index, dtype="object") - valid = numeric.notna() - if valid.any(): - padded.loc[valid] = ( - numeric.loc[valid].round().astype(int).astype(str).str.zfill(spec.zero_pad) - ) - series = padded - else: - series = series.astype(object) - series = series.astype(object) - if spec.missing_label is not None: - series = series.where(pd.notna(series), spec.missing_label) - return series - - -def _json_compatible_value(value: Any) -> Any: - if isinstance(value, Path): - return str(value) - if isinstance(value, dict): - return { - 
str(key): _json_compatible_value(item) - for key, item in value.items() - } - if isinstance(value, (list, tuple)): - return [_json_compatible_value(item) for item in value] - if hasattr(value, "item") and callable(getattr(value, "item")): - try: - return _json_compatible_value(value.item()) - except (TypeError, ValueError): - pass - if isinstance(value, pd.Interval): - return str(value) - if isinstance(value, (str, int, float, bool)) or value is None: - return value - return str(value) - - -def _safe_ratio(numerator: int, denominator: int) -> float | None: - if denominator == 0: - return None - return float(numerator) / float(denominator) - - -__all__ = [ - "DEFAULT_ATOMIC_AGE_BINS", - "DEFAULT_ATOMIC_AGE_LABELS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_BINS", - "DEFAULT_ATOMIC_EMPLOYMENT_INCOME_LABELS", - "USMicroplexReducedCalibrationReport", - "USMicroplexReducedBenchmarkHarnessConfig", - "USMicroplexReducedBenchmarkHarnessResult", - "USMicroplexReducedBenchmarkReport", - "USMicroplexReducedBenchmarkSpec", - "USMicroplexReducedDimensionSpec", - "USMicroplexReducedMeasureSpec", - "calibrate_and_evaluate_us_reduced_benchmarks", - "default_us_atomic_rung0_benchmarks", - "default_us_atomic_rung1_benchmarks", - "default_us_atomic_rung2_calibration", - "default_us_atomic_rung3_calibration", - "default_us_atomic_rung4_calibration", - "evaluate_us_reduced_benchmark", - "reduced_benchmark_to_calibration_targets", - "run_us_microplex_reduced_benchmark_harness", -] diff --git a/src/microplex_us/pipelines/registry.py b/src/microplex_us/pipelines/registry.py deleted file mode 100644 index d267193d..00000000 --- a/src/microplex_us/pipelines/registry.py +++ /dev/null @@ -1,493 +0,0 @@ -"""Persistent run-registry helpers for saved US microplex artifacts.""" - -from __future__ import annotations - -import hashlib -import json -from dataclasses import dataclass, field -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Literal - -FrontierMetric = 
Literal[ - "full_oracle_capped_mean_abs_relative_error", - "full_oracle_mean_abs_relative_error", - "candidate_composite_parity_loss", - "candidate_mean_abs_relative_error", - "mean_abs_relative_error_delta", - "candidate_enhanced_cps_native_loss", - "enhanced_cps_native_loss_delta", -] - - -@dataclass(frozen=True) -class USMicroplexRunRegistryEntry: - """Compact cross-build summary for one saved artifact bundle.""" - - created_at: str - artifact_id: str - artifact_dir: str - manifest_path: str - policyengine_harness_path: str | None = None - config_hash: str | None = None - synthesis_backend: str | None = None - calibration_backend: str | None = None - calibration_converged: bool | None = None - weight_collapse_suspected: bool | None = None - source_names: tuple[str, ...] = () - rows: dict[str, int] = field(default_factory=dict) - weights: dict[str, float | int] = field(default_factory=dict) - full_oracle_capped_mean_abs_relative_error: float | None = None - full_oracle_mean_abs_relative_error: float | None = None - candidate_mean_abs_relative_error: float | None = None - baseline_mean_abs_relative_error: float | None = None - mean_abs_relative_error_delta: float | None = None - candidate_composite_parity_loss: float | None = None - baseline_composite_parity_loss: float | None = None - composite_parity_loss_delta: float | None = None - candidate_enhanced_cps_native_loss: float | None = None - baseline_enhanced_cps_native_loss: float | None = None - enhanced_cps_native_loss_delta: float | None = None - candidate_beats_baseline_native_loss: bool | None = None - candidate_unweighted_msre: float | None = None - baseline_unweighted_msre: float | None = None - unweighted_msre_delta: float | None = None - slice_win_rate: float | None = None - target_win_rate: float | None = None - supported_target_rate: float | None = None - tag_summaries: dict[str, dict[str, float | None]] = field(default_factory=dict) - parity_scorecard: dict[str, dict[str, float | bool | None]] = 
field( - default_factory=dict - ) - baseline_dataset: str | None = None - targets_db: str | None = None - target_period: int | None = None - target_variables: tuple[str, ...] = () - target_domains: tuple[str, ...] = () - target_geo_levels: tuple[str, ...] = () - target_reform_id: int | None = None - policyengine_us_runtime_version: str | None = None - improved_candidate_frontier: bool | None = None - improved_delta_frontier: bool | None = None - improved_composite_frontier: bool | None = None - improved_native_frontier: bool | None = None - metadata: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - """Serialize the registry entry to a JSON-compatible dict.""" - return { - "created_at": self.created_at, - "artifact_id": self.artifact_id, - "artifact_dir": self.artifact_dir, - "manifest_path": self.manifest_path, - "policyengine_harness_path": self.policyengine_harness_path, - "config_hash": self.config_hash, - "synthesis_backend": self.synthesis_backend, - "calibration_backend": self.calibration_backend, - "calibration_converged": self.calibration_converged, - "weight_collapse_suspected": self.weight_collapse_suspected, - "source_names": list(self.source_names), - "rows": dict(self.rows), - "weights": dict(self.weights), - "full_oracle_capped_mean_abs_relative_error": ( - self.full_oracle_capped_mean_abs_relative_error - ), - "full_oracle_mean_abs_relative_error": ( - self.full_oracle_mean_abs_relative_error - ), - "candidate_mean_abs_relative_error": self.candidate_mean_abs_relative_error, - "baseline_mean_abs_relative_error": self.baseline_mean_abs_relative_error, - "mean_abs_relative_error_delta": self.mean_abs_relative_error_delta, - "candidate_composite_parity_loss": self.candidate_composite_parity_loss, - "baseline_composite_parity_loss": self.baseline_composite_parity_loss, - "composite_parity_loss_delta": self.composite_parity_loss_delta, - "candidate_enhanced_cps_native_loss": self.candidate_enhanced_cps_native_loss, - 
"baseline_enhanced_cps_native_loss": self.baseline_enhanced_cps_native_loss, - "enhanced_cps_native_loss_delta": self.enhanced_cps_native_loss_delta, - "candidate_beats_baseline_native_loss": ( - self.candidate_beats_baseline_native_loss - ), - "candidate_unweighted_msre": self.candidate_unweighted_msre, - "baseline_unweighted_msre": self.baseline_unweighted_msre, - "unweighted_msre_delta": self.unweighted_msre_delta, - "slice_win_rate": self.slice_win_rate, - "target_win_rate": self.target_win_rate, - "supported_target_rate": self.supported_target_rate, - "tag_summaries": dict(self.tag_summaries), - "parity_scorecard": dict(self.parity_scorecard), - "baseline_dataset": self.baseline_dataset, - "targets_db": self.targets_db, - "target_period": self.target_period, - "target_variables": list(self.target_variables), - "target_domains": list(self.target_domains), - "target_geo_levels": list(self.target_geo_levels), - "target_reform_id": self.target_reform_id, - "policyengine_us_runtime_version": self.policyengine_us_runtime_version, - "improved_candidate_frontier": self.improved_candidate_frontier, - "improved_delta_frontier": self.improved_delta_frontier, - "improved_composite_frontier": self.improved_composite_frontier, - "improved_native_frontier": self.improved_native_frontier, - "metadata": dict(self.metadata), - } - - @classmethod - def from_dict(cls, payload: dict[str, Any]) -> USMicroplexRunRegistryEntry: - """Restore a registry entry from JSON payload.""" - return cls( - created_at=payload["created_at"], - artifact_id=payload["artifact_id"], - artifact_dir=payload["artifact_dir"], - manifest_path=payload["manifest_path"], - policyengine_harness_path=payload.get("policyengine_harness_path"), - config_hash=payload.get("config_hash"), - synthesis_backend=payload.get("synthesis_backend"), - calibration_backend=payload.get("calibration_backend"), - calibration_converged=payload.get("calibration_converged"), - 
weight_collapse_suspected=payload.get("weight_collapse_suspected"), - source_names=tuple(payload.get("source_names", [])), - rows=dict(payload.get("rows", {})), - weights=dict(payload.get("weights", {})), - full_oracle_capped_mean_abs_relative_error=payload.get( - "full_oracle_capped_mean_abs_relative_error" - ), - full_oracle_mean_abs_relative_error=payload.get( - "full_oracle_mean_abs_relative_error" - ), - candidate_mean_abs_relative_error=payload.get( - "candidate_mean_abs_relative_error" - ), - baseline_mean_abs_relative_error=payload.get( - "baseline_mean_abs_relative_error" - ), - mean_abs_relative_error_delta=payload.get("mean_abs_relative_error_delta"), - candidate_composite_parity_loss=payload.get( - "candidate_composite_parity_loss" - ), - baseline_composite_parity_loss=payload.get( - "baseline_composite_parity_loss" - ), - composite_parity_loss_delta=payload.get("composite_parity_loss_delta"), - candidate_enhanced_cps_native_loss=payload.get( - "candidate_enhanced_cps_native_loss" - ), - baseline_enhanced_cps_native_loss=payload.get( - "baseline_enhanced_cps_native_loss" - ), - enhanced_cps_native_loss_delta=payload.get( - "enhanced_cps_native_loss_delta" - ), - candidate_beats_baseline_native_loss=payload.get( - "candidate_beats_baseline_native_loss" - ), - candidate_unweighted_msre=payload.get("candidate_unweighted_msre"), - baseline_unweighted_msre=payload.get("baseline_unweighted_msre"), - unweighted_msre_delta=payload.get("unweighted_msre_delta"), - slice_win_rate=payload.get("slice_win_rate"), - target_win_rate=payload.get("target_win_rate"), - supported_target_rate=payload.get("supported_target_rate"), - tag_summaries={ - key: dict(value) - for key, value in dict(payload.get("tag_summaries", {})).items() - }, - parity_scorecard={ - key: dict(value) - for key, value in dict(payload.get("parity_scorecard", {})).items() - }, - baseline_dataset=payload.get("baseline_dataset"), - targets_db=payload.get("targets_db"), - 
target_period=payload.get("target_period"), - target_variables=tuple(payload.get("target_variables", [])), - target_domains=tuple(payload.get("target_domains", [])), - target_geo_levels=tuple(payload.get("target_geo_levels", [])), - target_reform_id=payload.get("target_reform_id"), - policyengine_us_runtime_version=payload.get( - "policyengine_us_runtime_version" - ), - improved_candidate_frontier=payload.get("improved_candidate_frontier"), - improved_delta_frontier=payload.get("improved_delta_frontier"), - improved_composite_frontier=payload.get("improved_composite_frontier"), - improved_native_frontier=payload.get("improved_native_frontier"), - metadata=dict(payload.get("metadata", {})), - ) - - -def load_us_microplex_run_registry( - path: str | Path, -) -> list[USMicroplexRunRegistryEntry]: - """Load a JSONL run registry from disk.""" - registry_path = Path(path) - if not registry_path.exists(): - return [] - entries: list[USMicroplexRunRegistryEntry] = [] - for line in registry_path.read_text().splitlines(): - line = line.strip() - if not line: - continue - entries.append(USMicroplexRunRegistryEntry.from_dict(json.loads(line))) - return entries - - -def select_us_microplex_frontier_entry( - path: str | Path, - *, - metric: FrontierMetric = "candidate_composite_parity_loss", -) -> USMicroplexRunRegistryEntry | None: - """Select the current best run from the registry using one summary metric.""" - entries = load_us_microplex_run_registry(_resolve_run_registry_path(path)) - metric_values = { - "full_oracle_capped_mean_abs_relative_error": ( - lambda entry: entry.full_oracle_capped_mean_abs_relative_error - ), - "full_oracle_mean_abs_relative_error": lambda entry: entry.full_oracle_mean_abs_relative_error, - "candidate_composite_parity_loss": lambda entry: entry.candidate_composite_parity_loss, - "candidate_mean_abs_relative_error": lambda entry: entry.candidate_mean_abs_relative_error, - "mean_abs_relative_error_delta": lambda entry: 
entry.mean_abs_relative_error_delta, - "candidate_enhanced_cps_native_loss": lambda entry: entry.candidate_enhanced_cps_native_loss, - "enhanced_cps_native_loss_delta": lambda entry: entry.enhanced_cps_native_loss_delta, - } - value_fn = metric_values[metric] - comparable_entries = [ - entry for entry in entries if value_fn(entry) is not None and _is_frontier_eligible(entry) - ] - if not comparable_entries: - return None - return min(comparable_entries, key=value_fn) - - -def resolve_us_microplex_frontier_artifact_dir( - path: str | Path, - *, - metric: FrontierMetric = "candidate_composite_parity_loss", -) -> Path | None: - """Return the artifact directory for the current frontier run, if any.""" - frontier = select_us_microplex_frontier_entry(path, metric=metric) - if frontier is None: - return None - return Path(frontier.artifact_dir) - - -def append_us_microplex_run_registry_entry( - path: str | Path, - entry: USMicroplexRunRegistryEntry, -) -> USMicroplexRunRegistryEntry: - """Append one registry entry, computing frontier flags from prior history.""" - registry_path = Path(path) - registry_path.parent.mkdir(parents=True, exist_ok=True) - existing_entries = load_us_microplex_run_registry(registry_path) - entry_to_write = USMicroplexRunRegistryEntry( - **{ - **entry.to_dict(), - "improved_candidate_frontier": _improves_candidate_frontier( - existing_entries, - entry, - ), - "improved_delta_frontier": _improves_delta_frontier(existing_entries, entry), - "improved_composite_frontier": _improves_composite_frontier( - existing_entries, - entry, - ), - "improved_native_frontier": _improves_native_frontier( - existing_entries, - entry, - ), - } - ) - with registry_path.open("a", encoding="utf-8") as handle: - handle.write(json.dumps(entry_to_write.to_dict(), sort_keys=True) + "\n") - return entry_to_write - - -def build_us_microplex_run_registry_entry( - *, - artifact_dir: str | Path, - manifest_path: str | Path, - manifest: dict[str, Any], - 
policyengine_harness_path: str | Path | None = None, - policyengine_harness_payload: dict[str, Any] | None = None, - metadata: dict[str, Any] | None = None, -) -> USMicroplexRunRegistryEntry: - """Build a compact registry entry from a saved artifact manifest.""" - harness_payload = dict(policyengine_harness_payload or {}) - harness_summary = dict(manifest.get("policyengine_harness", {})) - harness_metadata = dict(harness_payload.get("metadata", {})) - calibration_summary = dict(manifest.get("calibration", {})) - native_summary = dict(manifest.get("policyengine_native_scores", {})) - diagnostics = dict(manifest.get("diagnostics", {})) - created_at = manifest.get("created_at") or datetime.now(UTC).isoformat() - config = dict(manifest.get("config", {})) - synthesis = dict(manifest.get("synthesis", {})) - calibration_oracle_loss = dict(calibration_summary.get("oracle_loss", {})) - full_oracle_summary = dict(calibration_oracle_loss.get("full_oracle", {})) - merged_metadata = dict(metadata or {}) - child_tax_unit_agi_drift = diagnostics.get("child_tax_unit_agi_drift") - if child_tax_unit_agi_drift is not None: - merged_metadata.setdefault("child_tax_unit_agi_drift", child_tax_unit_agi_drift) - - return USMicroplexRunRegistryEntry( - created_at=created_at, - artifact_id=Path(artifact_dir).name, - artifact_dir=str(Path(artifact_dir).resolve()), - manifest_path=str(Path(manifest_path).resolve()), - policyengine_harness_path=( - str(Path(policyengine_harness_path).resolve()) - if policyengine_harness_path is not None - else None - ), - config_hash=_stable_config_hash(config), - synthesis_backend=config.get("synthesis_backend"), - calibration_backend=config.get("calibration_backend"), - calibration_converged=calibration_summary.get("converged"), - weight_collapse_suspected=calibration_summary.get("weight_collapse_suspected"), - source_names=tuple(synthesis.get("source_names", [])), - rows={key: int(value) for key, value in dict(manifest.get("rows", {})).items()}, - 
weights=dict(manifest.get("weights", {})), - full_oracle_capped_mean_abs_relative_error=calibration_summary.get( - "full_oracle_capped_mean_abs_relative_error", - full_oracle_summary.get("capped_mean_abs_relative_error"), - ), - full_oracle_mean_abs_relative_error=calibration_summary.get( - "full_oracle_mean_abs_relative_error", - full_oracle_summary.get("mean_abs_relative_error"), - ), - candidate_mean_abs_relative_error=harness_summary.get( - "candidate_mean_abs_relative_error" - ), - baseline_mean_abs_relative_error=harness_summary.get( - "baseline_mean_abs_relative_error" - ), - mean_abs_relative_error_delta=harness_summary.get( - "mean_abs_relative_error_delta" - ), - candidate_composite_parity_loss=harness_summary.get( - "candidate_composite_parity_loss" - ), - baseline_composite_parity_loss=harness_summary.get( - "baseline_composite_parity_loss" - ), - composite_parity_loss_delta=harness_summary.get( - "composite_parity_loss_delta" - ), - candidate_enhanced_cps_native_loss=native_summary.get( - "candidate_enhanced_cps_native_loss" - ), - baseline_enhanced_cps_native_loss=native_summary.get( - "baseline_enhanced_cps_native_loss" - ), - enhanced_cps_native_loss_delta=native_summary.get( - "enhanced_cps_native_loss_delta" - ), - candidate_beats_baseline_native_loss=native_summary.get( - "candidate_beats_baseline" - ), - candidate_unweighted_msre=native_summary.get("candidate_unweighted_msre"), - baseline_unweighted_msre=native_summary.get("baseline_unweighted_msre"), - unweighted_msre_delta=native_summary.get("unweighted_msre_delta"), - slice_win_rate=harness_summary.get("slice_win_rate"), - target_win_rate=harness_summary.get("target_win_rate"), - supported_target_rate=harness_summary.get("supported_target_rate"), - tag_summaries={ - key: dict(value) - for key, value in dict(harness_summary.get("tag_summaries", {})).items() - }, - parity_scorecard={ - key: dict(value) - for key, value in dict(harness_summary.get("parity_scorecard", {})).items() - }, - 
baseline_dataset=harness_metadata.get("baseline_dataset"), - targets_db=harness_metadata.get("targets_db"), - target_period=harness_metadata.get("target_period"), - target_variables=tuple(harness_metadata.get("target_variables", [])), - target_domains=tuple(harness_metadata.get("target_domains", [])), - target_geo_levels=tuple(harness_metadata.get("target_geo_levels", [])), - target_reform_id=harness_metadata.get("target_reform_id"), - policyengine_us_runtime_version=harness_metadata.get( - "policyengine_us_runtime_version" - ), - metadata=merged_metadata, - ) - - -def _stable_config_hash(config: dict[str, Any]) -> str | None: - if not config: - return None - payload = json.dumps(config, sort_keys=True, separators=(",", ":")) - return hashlib.sha256(payload.encode("utf-8")).hexdigest() - - -def _resolve_run_registry_path(path: str | Path) -> Path: - candidate_path = Path(path) - if candidate_path.suffix == ".jsonl": - return candidate_path - return candidate_path / "run_registry.jsonl" - - -def _is_frontier_eligible(entry: USMicroplexRunRegistryEntry) -> bool: - return entry.weight_collapse_suspected is not True - - -def _improves_candidate_frontier( - entries: list[USMicroplexRunRegistryEntry], - entry: USMicroplexRunRegistryEntry, -) -> bool | None: - candidate_error = entry.candidate_mean_abs_relative_error - if candidate_error is None or not _is_frontier_eligible(entry): - return None - prior_errors = [ - item.candidate_mean_abs_relative_error - for item in entries - if item.candidate_mean_abs_relative_error is not None and _is_frontier_eligible(item) - ] - if not prior_errors: - return True - return candidate_error < min(prior_errors) - - -def _improves_delta_frontier( - entries: list[USMicroplexRunRegistryEntry], - entry: USMicroplexRunRegistryEntry, -) -> bool | None: - error_delta = entry.mean_abs_relative_error_delta - if error_delta is None or not _is_frontier_eligible(entry): - return None - prior_deltas = [ - item.mean_abs_relative_error_delta - for 
item in entries - if item.mean_abs_relative_error_delta is not None and _is_frontier_eligible(item) - ] - if not prior_deltas: - return True - return error_delta < min(prior_deltas) - - -def _improves_composite_frontier( - entries: list[USMicroplexRunRegistryEntry], - entry: USMicroplexRunRegistryEntry, -) -> bool | None: - composite_loss = entry.candidate_composite_parity_loss - if composite_loss is None or not _is_frontier_eligible(entry): - return None - prior_losses = [ - item.candidate_composite_parity_loss - for item in entries - if item.candidate_composite_parity_loss is not None and _is_frontier_eligible(item) - ] - if not prior_losses: - return True - return composite_loss < min(prior_losses) - - -def _improves_native_frontier( - entries: list[USMicroplexRunRegistryEntry], - entry: USMicroplexRunRegistryEntry, -) -> bool | None: - native_delta = entry.enhanced_cps_native_loss_delta - if native_delta is None or not _is_frontier_eligible(entry): - return None - prior_deltas = [ - item.enhanced_cps_native_loss_delta - for item in entries - if item.enhanced_cps_native_loss_delta is not None and _is_frontier_eligible(item) - ] - if not prior_deltas: - return True - return native_delta < min(prior_deltas) diff --git a/src/microplex_us/pipelines/seed_stage_parity.py b/src/microplex_us/pipelines/seed_stage_parity.py deleted file mode 100644 index 545d79b5..00000000 --- a/src/microplex_us/pipelines/seed_stage_parity.py +++ /dev/null @@ -1,953 +0,0 @@ -"""Audit seed/source-impute rows before synthesis and calibration.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.core import EntityType - -from microplex_us.pipelines.source_stage_parity import ( - _bundle_table, - _compare_series, - _entity_weights, - _safe_ratio, - _stringify_id_series, - _summarize_series, -) -from microplex_us.policyengine.us import ( - 
PolicyEngineUSEntityTableBundle, - load_policyengine_us_entity_tables, -) - - -@dataclass(frozen=True) -class SeedStageFocusVariableSpec: - """One seed-stage variable comparison against the PE reference surface.""" - - label: str - seed_variable: str - reference_variable: str | None = None - value_kind: str = "auto" - - @property - def resolved_reference_variable(self) -> str: - return self.reference_variable or self.seed_variable - - -@dataclass(frozen=True) -class SeedStageBooleanLandingFeatureSpec: - """One positive-support feature share among seed-stage positive rows.""" - - label: str - seed_variable: str - reference_variable: str | None = None - - @property - def resolved_reference_variable(self) -> str: - return self.reference_variable or self.seed_variable - - -@dataclass(frozen=True) -class SeedStageCategoricalLandingFeatureSpec: - """One categorical landing profile among seed-stage positive rows.""" - - label: str - seed_variable: str - reference_variable: str | None = None - transform: str = "identity" - top_n: int = 10 - - @property - def resolved_reference_variable(self) -> str: - return self.reference_variable or self.seed_variable - - -DEFAULT_SEED_STAGE_FOCUS_VARIABLES: tuple[SeedStageFocusVariableSpec, ...] 
= ( - SeedStageFocusVariableSpec( - "self_employment_income", - "self_employment_income", - "self_employment_income_before_lsr", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "partnership_s_corp_income", - "partnership_s_corp_income", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "health_savings_account_ald", - "health_savings_account_ald", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "self_employed_health_insurance_ald", - "self_employed_health_insurance_ald", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "self_employed_pension_contribution_ald", - "self_employed_pension_contribution_ald", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "taxable_interest_income", - "taxable_interest_income", - value_kind="numeric", - ), - SeedStageFocusVariableSpec( - "non_qualified_dividend_income", - "non_qualified_dividend_income", - value_kind="numeric", - ), -) - -DEFAULT_SEED_STAGE_BOOLEAN_LANDING_FEATURES: tuple[ - SeedStageBooleanLandingFeatureSpec, ... -] = ( - SeedStageBooleanLandingFeatureSpec( - "positive_self_employment_income", - "self_employment_income", - "self_employment_income_before_lsr", - ), - SeedStageBooleanLandingFeatureSpec( - "positive_wage_income", - "employment_income", - "employment_income_before_lsr", - ), - SeedStageBooleanLandingFeatureSpec("has_esi", "has_esi"), - SeedStageBooleanLandingFeatureSpec( - "has_marketplace_health_coverage", - "has_marketplace_health_coverage", - ), -) - -DEFAULT_SEED_STAGE_CATEGORICAL_LANDING_FEATURES: tuple[ - SeedStageCategoricalLandingFeatureSpec, ... -] = ( - SeedStageCategoricalLandingFeatureSpec( - "age_bin", - "age", - "age", - transform="age_bin", - ), - SeedStageCategoricalLandingFeatureSpec( - "state_fips", - "state_fips", - "state_fips", - ), -) - -DEFAULT_SEED_STAGE_CANDIDATE_ONLY_LANDING_FEATURES: tuple[ - SeedStageCategoricalLandingFeatureSpec, ... 
-] = ( - SeedStageCategoricalLandingFeatureSpec( - "employment_status", - "employment_status", - ), -) - -_AGE_BIN_LABELS: tuple[str, ...] = tuple( - [f"{start}-{start + 4}" for start in range(0, 85, 5)] + ["85+"] -) -_AGE_BIN_EDGES = np.array(list(range(0, 90, 5)) + [200], dtype=float) -_PERSON_PROJECTION_ORDER: tuple[tuple[EntityType, str], ...] = ( - (EntityType.HOUSEHOLD, "household_id"), - (EntityType.TAX_UNIT, "tax_unit_id"), - (EntityType.SPM_UNIT, "spm_unit_id"), - (EntityType.FAMILY, "family_id"), -) - - -def build_us_seed_stage_parity_audit( - seed_data: str | Path, - reference_dataset: str | Path, - *, - period: int = 2024, - focus_variables: tuple[SeedStageFocusVariableSpec | str, ...] - | list[SeedStageFocusVariableSpec | str] = DEFAULT_SEED_STAGE_FOCUS_VARIABLES, - boolean_landing_features: tuple[SeedStageBooleanLandingFeatureSpec, ...] - | list[SeedStageBooleanLandingFeatureSpec] = DEFAULT_SEED_STAGE_BOOLEAN_LANDING_FEATURES, - categorical_landing_features: tuple[SeedStageCategoricalLandingFeatureSpec, ...] - | list[SeedStageCategoricalLandingFeatureSpec] = DEFAULT_SEED_STAGE_CATEGORICAL_LANDING_FEATURES, - candidate_only_landing_features: tuple[ - SeedStageCategoricalLandingFeatureSpec, ... 
- ] - | list[SeedStageCategoricalLandingFeatureSpec] = DEFAULT_SEED_STAGE_CANDIDATE_ONLY_LANDING_FEATURES, -) -> dict[str, Any]: - """Compare seed-stage donor landing against PE's person-level reference surface.""" - - seed_path = Path(seed_data).resolve() - reference_path = Path(reference_dataset).resolve() - seed_rows = pd.read_parquet(seed_path) - seed_weights = _seed_weight_series(seed_rows) - - focus_specs = _normalize_focus_variable_specs(focus_variables) - boolean_specs = tuple(boolean_landing_features) - categorical_specs = tuple(categorical_landing_features) - candidate_only_specs = tuple(candidate_only_landing_features) - - reference_bundle = load_policyengine_us_entity_tables(reference_path, period=period) - required_reference_variables = { - spec.resolved_reference_variable for spec in focus_specs - } | { - spec.resolved_reference_variable for spec in boolean_specs - } | { - spec.resolved_reference_variable for spec in categorical_specs - } - reference_person_rows = _build_reference_person_projection( - reference_bundle, - required_reference_variables, - ) - reference_weights = pd.to_numeric( - reference_person_rows["weight"], - errors="coerce", - ).fillna(0.0) - seed_total_weight = float(seed_weights.sum()) - reference_total_weight = float(reference_weights.sum()) - - return { - "schemaVersion": 1, - "comparisonStage": "seed_source_impute", - "period": int(period), - "seedData": str(seed_path), - "referenceDataset": str(reference_path), - "weightScale": { - "seed_total_weight": seed_total_weight, - "reference_total_weight": reference_total_weight, - "reference_to_seed_weight_scale": _safe_ratio( - reference_total_weight, - seed_total_weight, - ), - }, - "seedStructure": _seed_structure_summary(seed_rows, seed_weights), - "referenceStructure": _reference_person_structure_summary( - reference_person_rows, - reference_weights, - bundle=reference_bundle, - ), - "focusVariables": { - spec.label: _seed_focus_variable_audit( - seed_rows=seed_rows, - 
seed_weights=seed_weights, - reference_rows=reference_person_rows, - reference_weights=reference_weights, - focus_spec=spec, - boolean_specs=boolean_specs, - categorical_specs=categorical_specs, - candidate_only_specs=candidate_only_specs, - ) - for spec in focus_specs - }, - } - - -def write_us_seed_stage_parity_audit( - seed_data: str | Path, - reference_dataset: str | Path, - output_path: str | Path, - *, - period: int = 2024, - focus_variables: tuple[SeedStageFocusVariableSpec | str, ...] - | list[SeedStageFocusVariableSpec | str] = DEFAULT_SEED_STAGE_FOCUS_VARIABLES, - boolean_landing_features: tuple[SeedStageBooleanLandingFeatureSpec, ...] - | list[SeedStageBooleanLandingFeatureSpec] = DEFAULT_SEED_STAGE_BOOLEAN_LANDING_FEATURES, - categorical_landing_features: tuple[SeedStageCategoricalLandingFeatureSpec, ...] - | list[SeedStageCategoricalLandingFeatureSpec] = DEFAULT_SEED_STAGE_CATEGORICAL_LANDING_FEATURES, - candidate_only_landing_features: tuple[ - SeedStageCategoricalLandingFeatureSpec, ... 
- ] - | list[SeedStageCategoricalLandingFeatureSpec] = DEFAULT_SEED_STAGE_CANDIDATE_ONLY_LANDING_FEATURES, -) -> Path: - """Persist one seed/source-impute parity audit as JSON.""" - - output = Path(output_path).resolve() - payload = build_us_seed_stage_parity_audit( - seed_data, - reference_dataset, - period=period, - focus_variables=focus_variables, - boolean_landing_features=boolean_landing_features, - categorical_landing_features=categorical_landing_features, - candidate_only_landing_features=candidate_only_landing_features, - ) - output.parent.mkdir(parents=True, exist_ok=True) - output.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return output - - -def build_us_seed_tax_unit_support_audit( - seed_data: str | Path, - reference_dataset: str | Path, - *, - period: int = 2024, -) -> dict[str, Any]: - """Compare seed-derived tax-unit support against a PE reference dataset.""" - - seed_path = Path(seed_data).resolve() - reference_path = Path(reference_dataset).resolve() - support_audit = _compute_seed_tax_unit_support_audit( - pd.read_parquet(seed_path), - reference_path=reference_path, - period=period, - ) - return _seed_tax_unit_support_payload( - seed_path=seed_path, - reference_path=reference_path, - period=period, - support_audit=support_audit, - ) - - -def write_us_seed_tax_unit_support_audit( - seed_data: str | Path, - reference_dataset: str | Path, - output_path: str | Path, - *, - period: int = 2024, -) -> Path: - """Persist one seed tax-unit support audit as JSON.""" - - output = Path(output_path).resolve() - payload = build_us_seed_tax_unit_support_audit( - seed_data, - reference_dataset, - period=period, - ) - output.parent.mkdir(parents=True, exist_ok=True) - output.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return output - - -def _seed_focus_variable_audit( - *, - seed_rows: pd.DataFrame, - seed_weights: pd.Series, - reference_rows: pd.DataFrame, - reference_weights: pd.Series, - focus_spec: 
SeedStageFocusVariableSpec, - boolean_specs: tuple[SeedStageBooleanLandingFeatureSpec, ...], - categorical_specs: tuple[SeedStageCategoricalLandingFeatureSpec, ...], - candidate_only_specs: tuple[SeedStageCategoricalLandingFeatureSpec, ...], -) -> dict[str, Any]: - seed_series = seed_rows.get(focus_spec.seed_variable) - reference_series = reference_rows.get(focus_spec.resolved_reference_variable) - payload: dict[str, Any] = { - "seed_variable": focus_spec.seed_variable, - "reference_variable": focus_spec.resolved_reference_variable, - "seed_present": seed_series is not None, - "reference_present": reference_series is not None, - } - if seed_series is None or reference_series is None: - return payload - - payload["seed"] = _summarize_series( - seed_series, - weights=seed_weights, - value_kind=focus_spec.value_kind, - ) - payload["reference"] = _summarize_series( - reference_series, - weights=reference_weights, - value_kind=focus_spec.value_kind, - ) - payload["comparison"] = _compare_series( - seed_series, - reference_series, - candidate_weights=seed_weights, - reference_weights=reference_weights, - value_kind=focus_spec.value_kind, - ) - if payload["comparison"].get("type") == "numeric": - weighted_sum_case = _undefined_ratio_case( - payload["seed"]["weighted_sum"], - payload["reference"]["weighted_sum"], - ) - weighted_positive_share_case = _undefined_ratio_case( - payload["seed"]["weighted_positive_share"], - payload["reference"]["weighted_positive_share"], - ) - payload["comparison"]["weighted_sum_ratio_defined"] = weighted_sum_case == "defined" - payload["comparison"]["weighted_sum_ratio_case"] = weighted_sum_case - payload["comparison"]["weighted_positive_share_ratio_defined"] = ( - weighted_positive_share_case == "defined" - ) - payload["comparison"]["weighted_positive_share_ratio_case"] = ( - weighted_positive_share_case - ) - payload["comparison"]["reference_scaled_weighted_sum_ratio"] = _safe_ratio( - payload["seed"]["weighted_sum"] - * 
_safe_ratio(float(reference_weights.sum()), float(seed_weights.sum())), - payload["reference"]["weighted_sum"], - ) - payload["comparison"]["reference_scaled_weighted_sum_ratio_defined"] = ( - weighted_sum_case == "defined" - ) - payload["comparison"]["reference_scaled_weighted_sum_ratio_case"] = ( - weighted_sum_case - ) - - seed_positive = _positive_mask(seed_series) - reference_positive = _positive_mask(reference_series) - payload["positiveSupport"] = { - "seed_positive_row_count": int(seed_positive.sum()), - "reference_positive_row_count": int(reference_positive.sum()), - "seed_positive_weight_share": _weighted_share(seed_positive, seed_weights), - "reference_positive_weight_share": _weighted_share( - reference_positive, - reference_weights, - ), - } - payload["positiveBooleanProfiles"] = { - spec.label: _boolean_positive_profile( - seed_rows=seed_rows, - seed_weights=seed_weights, - seed_positive=seed_positive, - reference_rows=reference_rows, - reference_weights=reference_weights, - reference_positive=reference_positive, - spec=spec, - ) - for spec in boolean_specs - } - payload["positiveCategoricalProfiles"] = { - spec.label: _categorical_positive_profile( - seed_rows=seed_rows, - seed_weights=seed_weights, - seed_positive=seed_positive, - reference_rows=reference_rows, - reference_weights=reference_weights, - reference_positive=reference_positive, - spec=spec, - ) - for spec in categorical_specs - } - payload["positiveCandidateOnlyProfiles"] = { - spec.label: _candidate_only_positive_profile( - seed_rows=seed_rows, - seed_weights=seed_weights, - seed_positive=seed_positive, - spec=spec, - ) - for spec in candidate_only_specs - } - return payload - - -def _build_reference_person_projection( - bundle: PolicyEngineUSEntityTableBundle, - required_variables: set[str], -) -> pd.DataFrame: - persons = bundle.persons - if persons is None: - raise ValueError("Reference dataset must contain person rows") - projected = persons.copy() - projected["weight"] = 
_entity_weights(bundle, EntityType.PERSON).to_numpy(dtype=float) - for variable in sorted(required_variables): - if variable in projected.columns: - continue - for entity, id_column in _PERSON_PROJECTION_ORDER: - table = _bundle_table(bundle, entity) - if table is None or variable not in table.columns: - continue - if id_column not in projected.columns or id_column not in table.columns: - continue - lookup = pd.Series( - table[variable].to_numpy(), - index=_stringify_id_series(table[id_column]), - ) - projected[variable] = _stringify_id_series(projected[id_column]).map(lookup) - break - return projected - - -def _seed_structure_summary(rows: pd.DataFrame, weights: pd.Series) -> dict[str, Any]: - summary: dict[str, Any] = { - "row_count": int(len(rows)), - } - for column in ("person_id", "household_id", "tax_unit_id"): - if column in rows.columns: - summary[f"{column}_count"] = int(rows[column].nunique(dropna=True)) - if "household_id" in rows.columns: - household_sizes = rows.groupby("household_id", observed=True).size() - household_sizes.index = pd.Index( - _stringify_id_series(pd.Series(household_sizes.index)).tolist() - ) - household_weights = _seed_household_weights(rows).reindex(household_sizes.index) - summary["mean_rows_per_household"] = float(household_sizes.mean()) - summary["weighted_mean_rows_per_household"] = _weighted_mean( - household_sizes.astype(float).to_numpy(), - pd.to_numeric(household_weights, errors="coerce").fillna(0.0).to_numpy( - dtype=float - ), - ) - summary["total_weight"] = float(weights.sum()) - return summary - - -def _reference_person_structure_summary( - rows: pd.DataFrame, - weights: pd.Series, - *, - bundle: PolicyEngineUSEntityTableBundle, -) -> dict[str, Any]: - summary: dict[str, Any] = {"person_row_count": int(len(rows))} - for column in ("person_id", "household_id", "tax_unit_id"): - if column in rows.columns: - summary[f"{column}_count"] = int(rows[column].nunique(dropna=True)) - if "household_id" in rows.columns: - 
household_sizes = rows.groupby("household_id", observed=True).size() - household_sizes.index = pd.Index( - _stringify_id_series(pd.Series(household_sizes.index)).tolist() - ) - household_weights = _reference_household_weights(bundle).reindex(household_sizes.index) - summary["mean_rows_per_household"] = float(household_sizes.mean()) - summary["weighted_mean_rows_per_household"] = _weighted_mean( - household_sizes.astype(float).to_numpy(), - pd.to_numeric(household_weights, errors="coerce").fillna(0.0).to_numpy( - dtype=float - ), - ) - summary["total_weight"] = float(weights.sum()) - return summary - - -def _boolean_positive_profile( - *, - seed_rows: pd.DataFrame, - seed_weights: pd.Series, - seed_positive: pd.Series, - reference_rows: pd.DataFrame, - reference_weights: pd.Series, - reference_positive: pd.Series, - spec: SeedStageBooleanLandingFeatureSpec, -) -> dict[str, Any]: - payload: dict[str, Any] = { - "seed_present": spec.seed_variable in seed_rows.columns, - "reference_present": spec.resolved_reference_variable in reference_rows.columns, - } - if spec.seed_variable in seed_rows.columns: - seed_feature = _positive_mask(seed_rows[spec.seed_variable]) - payload["seed_positive_share"] = _conditional_weight_share( - seed_feature, - seed_positive, - seed_weights, - ) - if spec.resolved_reference_variable in reference_rows.columns: - reference_feature = _positive_mask(reference_rows[spec.resolved_reference_variable]) - payload["reference_positive_share"] = _conditional_weight_share( - reference_feature, - reference_positive, - reference_weights, - ) - if ( - "seed_positive_share" in payload - and "reference_positive_share" in payload - ): - payload["share_delta"] = ( - payload["seed_positive_share"] - payload["reference_positive_share"] - ) - return payload - - -def _categorical_positive_profile( - *, - seed_rows: pd.DataFrame, - seed_weights: pd.Series, - seed_positive: pd.Series, - reference_rows: pd.DataFrame, - reference_weights: pd.Series, - 
reference_positive: pd.Series, - spec: SeedStageCategoricalLandingFeatureSpec, -) -> dict[str, Any]: - payload: dict[str, Any] = { - "seed_present": spec.seed_variable in seed_rows.columns, - "reference_present": spec.resolved_reference_variable in reference_rows.columns, - } - if spec.seed_variable in seed_rows.columns: - seed_series = _transform_profile_series(seed_rows[spec.seed_variable], spec.transform) - payload["seed"] = _categorical_subset_summary( - seed_series, - seed_weights, - seed_positive, - top_n=spec.top_n, - ) - if spec.resolved_reference_variable in reference_rows.columns: - reference_series = _transform_profile_series( - reference_rows[spec.resolved_reference_variable], - spec.transform, - ) - payload["reference"] = _categorical_subset_summary( - reference_series, - reference_weights, - reference_positive, - top_n=spec.top_n, - ) - return payload - - -def _candidate_only_positive_profile( - *, - seed_rows: pd.DataFrame, - seed_weights: pd.Series, - seed_positive: pd.Series, - spec: SeedStageCategoricalLandingFeatureSpec, -) -> dict[str, Any]: - payload: dict[str, Any] = {"seed_present": spec.seed_variable in seed_rows.columns} - if spec.seed_variable in seed_rows.columns: - seed_series = _transform_profile_series(seed_rows[spec.seed_variable], spec.transform) - payload["seed"] = _categorical_subset_summary( - seed_series, - seed_weights, - seed_positive, - top_n=spec.top_n, - ) - return payload - - -def _categorical_subset_summary( - series: pd.Series, - weights: pd.Series, - mask: pd.Series, - *, - top_n: int, -) -> dict[str, Any]: - subset_series = series.loc[mask].reset_index(drop=True) - subset_weights = pd.to_numeric( - weights.loc[mask], - errors="coerce", - ).fillna(0.0).reset_index(drop=True) - summary = _summarize_series( - subset_series, - weights=subset_weights, - value_kind="categorical", - ) - summary["top_values"] = list(summary.get("top_values", []))[: int(top_n)] - return summary - - -def _normalize_focus_variable_specs( - specs: 
tuple[SeedStageFocusVariableSpec | str, ...] - | list[SeedStageFocusVariableSpec | str], -) -> tuple[SeedStageFocusVariableSpec, ...]: - result: list[SeedStageFocusVariableSpec] = [] - for spec in specs: - if isinstance(spec, SeedStageFocusVariableSpec): - result.append(spec) - continue - result.append( - SeedStageFocusVariableSpec( - label=str(spec), - seed_variable=str(spec), - ) - ) - return tuple(result) - - -def _seed_weight_column(rows: pd.DataFrame) -> str: - for candidate in ("hh_weight", "household_weight", "weight"): - if candidate in rows.columns: - return candidate - raise ValueError("Seed rows must contain hh_weight, household_weight, or weight") - - -def _seed_weight_series(rows: pd.DataFrame) -> pd.Series: - return pd.to_numeric(rows[_seed_weight_column(rows)], errors="coerce").fillna(0.0) - - -def _seed_household_weights(rows: pd.DataFrame) -> pd.Series: - if "household_id" not in rows.columns: - raise ValueError("Seed rows must contain household_id to summarize households") - household_ids = _stringify_id_series(rows["household_id"]) - if "hh_weight" in rows.columns: - values = pd.to_numeric(rows["hh_weight"], errors="coerce").fillna(0.0) - elif "household_weight" in rows.columns: - values = pd.to_numeric(rows["household_weight"], errors="coerce").fillna(0.0) - else: - values = pd.to_numeric(rows["weight"], errors="coerce").fillna(0.0) - grouped = ( - pd.DataFrame({"household_id": household_ids, "weight": values.to_numpy(dtype=float)}) - .groupby("household_id", observed=True)["weight"] - .mean() - ) - return grouped - - -def _reference_household_weights(bundle: PolicyEngineUSEntityTableBundle) -> pd.Series: - households = bundle.households - if households is None or "household_id" not in households.columns: - return pd.Series(dtype=float) - household_ids = _stringify_id_series(households["household_id"]) - weights = _entity_weights(bundle, EntityType.HOUSEHOLD) - return pd.Series(weights.to_numpy(dtype=float), index=household_ids) - - -def 
_positive_mask(values: pd.Series) -> pd.Series: - return pd.to_numeric(values, errors="coerce").fillna(0.0).gt(0.0) - - -def _conditional_weight_share( - feature_mask: pd.Series, - positive_mask: pd.Series, - weights: pd.Series, -) -> float: - aligned_weights = pd.to_numeric(weights, errors="coerce").fillna(0.0) - positive_weight = float(aligned_weights.loc[positive_mask].sum()) - if positive_weight <= 0.0: - return 0.0 - return float(aligned_weights.loc[positive_mask & feature_mask].sum() / positive_weight) - - -def _weighted_share(mask: pd.Series, weights: pd.Series) -> float: - return _safe_ratio( - float(pd.to_numeric(weights, errors="coerce").fillna(0.0).loc[mask].sum()), - float(pd.to_numeric(weights, errors="coerce").fillna(0.0).sum()), - ) - - -def _weighted_mean(values: np.ndarray, weights: np.ndarray) -> float: - total_weight = float(weights.sum()) - if total_weight <= 0.0: - return 0.0 - return float(np.dot(values, weights) / total_weight) - - -def _undefined_ratio_case(candidate_value: float, reference_value: float) -> str: - if float(reference_value) != 0.0: - return "defined" - if float(candidate_value) != 0.0: - return "candidate_nonzero_reference_zero" - return "both_zero" - - -def _compute_seed_tax_unit_support_audit( - seed_rows: pd.DataFrame, - *, - reference_path: Path, - period: int, -) -> dict[str, Any]: - import tempfile - - from microplex_us.pipelines.pe_native_scores import ( - compute_us_pe_native_support_audit, - ) - from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - USMicroplexTargets, - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=int(period)) - ) - working_seed = _normalize_seed_ids_for_policyengine_support(seed_rows) - tables = pipeline.build_policyengine_entity_tables(working_seed) - result = USMicroplexBuildResult( - config=pipeline.config, - seed_data=working_seed, - synthetic_data=working_seed.copy(), - 
calibrated_data=working_seed.copy(), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={}, - policyengine_tables=tables, - ) - with tempfile.TemporaryDirectory(prefix="microplex-seed-tax-unit-") as tmpdir: - dataset_path = Path(tmpdir) / "seed_policyengine_us.h5" - pipeline.export_policyengine_dataset( - result, - dataset_path, - period=int(period), - ) - return compute_us_pe_native_support_audit( - candidate_dataset_path=dataset_path, - baseline_dataset_path=reference_path, - period=int(period), - ) - - -def _normalize_seed_ids_for_policyengine_support(seed_rows: pd.DataFrame) -> pd.DataFrame: - """Normalize saved seed IDs into PE-exportable integer keys for auditing.""" - - normalized = seed_rows.copy() - for column in ( - "person_id", - "household_id", - "tax_unit_id", - "family_id", - "spm_unit_id", - "marital_unit_id", - ): - if column not in normalized.columns: - continue - values = normalized[column] - if pd.api.types.is_integer_dtype(values): - continue - codes, _ = pd.factorize(values.astype("string").fillna(""), sort=False) - normalized[column] = codes.astype(np.int64) - return normalized - - -def _seed_tax_unit_support_payload( - *, - seed_path: Path, - reference_path: Path, - period: int, - support_audit: dict[str, Any], -) -> dict[str, Any]: - candidate_snapshot = support_audit["candidate"] - reference_snapshot = support_audit["baseline"] - candidate_total_weight = float( - sum( - float(row.get("weighted_count", 0.0)) - for row in candidate_snapshot["filing_status_weighted_counts"].values() - ) - ) - reference_total_weight = float( - sum( - float(row.get("weighted_count", 0.0)) - for row in reference_snapshot["filing_status_weighted_counts"].values() - ) - ) - reference_scale = _safe_ratio(reference_total_weight, candidate_total_weight) - filing_status_rows = [] - for status in ( - "SINGLE", - "JOINT", - "SEPARATE", - "HEAD_OF_HOUSEHOLD", - "SURVIVING_SPOUSE", - ): - candidate_row = 
candidate_snapshot["filing_status_weighted_counts"].get( - status, - {"weighted_count": 0.0}, - ) - reference_row = reference_snapshot["filing_status_weighted_counts"].get( - status, - {"weighted_count": 0.0}, - ) - candidate_weighted_count = float(candidate_row.get("weighted_count", 0.0)) - candidate_reference_scaled_weighted_count = ( - candidate_weighted_count * reference_scale - ) - baseline_weighted_count = float(reference_row.get("weighted_count", 0.0)) - filing_status_rows.append( - { - "filing_status": status, - "candidate_weighted_count": candidate_weighted_count, - "candidate_reference_scaled_weighted_count": ( - candidate_reference_scaled_weighted_count - ), - "baseline_weighted_count": baseline_weighted_count, - "weighted_count_delta": ( - candidate_reference_scaled_weighted_count - - baseline_weighted_count - ), - } - ) - filing_status_rows.sort( - key=lambda row: abs(float(row["weighted_count_delta"])), - reverse=True, - ) - - baseline_mfs = { - row["agi_bin"]: row for row in reference_snapshot["mfs_high_agi_support"] - } - mfs_rows = [] - for row in candidate_snapshot["mfs_high_agi_support"]: - baseline_row = baseline_mfs.get( - row["agi_bin"], - {"weighted_count": 0.0, "weighted_agi": 0.0}, - ) - candidate_weighted_count = float(row.get("weighted_count", 0.0)) - candidate_weighted_agi = float(row.get("weighted_agi", 0.0)) - candidate_reference_scaled_weighted_count = ( - candidate_weighted_count * reference_scale - ) - candidate_reference_scaled_weighted_agi = ( - candidate_weighted_agi * reference_scale - ) - baseline_weighted_count = float(baseline_row.get("weighted_count", 0.0)) - baseline_weighted_agi = float(baseline_row.get("weighted_agi", 0.0)) - mfs_rows.append( - { - "agi_bin": row["agi_bin"], - "candidate_weighted_count": candidate_weighted_count, - "candidate_reference_scaled_weighted_count": ( - candidate_reference_scaled_weighted_count - ), - "baseline_weighted_count": baseline_weighted_count, - "weighted_count_delta": ( - 
candidate_reference_scaled_weighted_count - - baseline_weighted_count - ), - "candidate_weighted_agi": candidate_weighted_agi, - "candidate_reference_scaled_weighted_agi": ( - candidate_reference_scaled_weighted_agi - ), - "baseline_weighted_agi": baseline_weighted_agi, - "weighted_agi_delta": ( - candidate_reference_scaled_weighted_agi - baseline_weighted_agi - ), - } - ) - mfs_rows.sort( - key=lambda row: abs(float(row["weighted_agi_delta"])), - reverse=True, - ) - return { - "schemaVersion": 1, - "comparisonStage": "seed_tax_unit_support", - "period": int(period), - "seedData": str(seed_path), - "referenceDataset": str(reference_path), - "weightScale": { - "candidate_total_tax_unit_weight": candidate_total_weight, - "reference_total_tax_unit_weight": reference_total_weight, - "reference_to_candidate_tax_unit_scale": reference_scale, - }, - "candidateSnapshot": { - "filing_status_weighted_counts": candidate_snapshot[ - "filing_status_weighted_counts" - ], - "mfs_high_agi_support": candidate_snapshot["mfs_high_agi_support"], - }, - "referenceSnapshot": { - "filing_status_weighted_counts": reference_snapshot[ - "filing_status_weighted_counts" - ], - "mfs_high_agi_support": reference_snapshot["mfs_high_agi_support"], - }, - "comparisons": { - "filing_status_weighted_delta": filing_status_rows, - "mfs_high_agi_delta": mfs_rows, - }, - "verdictHints": { - "largestFilingStatusGap": ( - filing_status_rows[0]["filing_status"] if filing_status_rows else None - ), - "largestMFSAgiGap": ( - mfs_rows[0]["agi_bin"] if mfs_rows else None - ), - }, - } - - -def _transform_profile_series(series: pd.Series, transform: str) -> pd.Series: - if transform == "identity": - return series - if transform == "age_bin": - numeric = pd.to_numeric(series, errors="coerce") - binned = pd.cut( - numeric, - bins=_AGE_BIN_EDGES, - labels=_AGE_BIN_LABELS, - right=False, - include_lowest=True, - ) - return binned.astype("string") - raise ValueError(f"Unsupported profile transform: {transform}") 
diff --git a/src/microplex_us/pipelines/site_snapshot.py b/src/microplex_us/pipelines/site_snapshot.py deleted file mode 100644 index 51410530..00000000 --- a/src/microplex_us/pipelines/site_snapshot.py +++ /dev/null @@ -1,233 +0,0 @@ -"""Canonical site-snapshot helpers for saved US microplex artifacts.""" - -from __future__ import annotations - -import json -import os -from pathlib import Path -from typing import Any - -from microplex.targets import assert_valid_benchmark_artifact_manifest - -from microplex_us.pipelines.data_flow_snapshot import ( - require_saved_us_microplex_data_flow_snapshot, - write_us_microplex_data_flow_snapshot, -) -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_run import ( - resolve_us_manifest_or_contract_artifact_path, -) - -FOCUS_TAG_PRIORITY: tuple[str, ...] = ( - "state", - "local", - "parity", - "all_targets", - "national", - "tax", - "benchmark", -) - - -def build_us_microplex_site_snapshot( - artifact_dir: str | Path, - *, - snapshot_path: str | Path | None = None, -) -> dict[str, Any]: - """Build one site-facing snapshot from a versioned US artifact bundle.""" - artifact_root = Path(artifact_dir) - manifest = json.loads((artifact_root / "manifest.json").read_text()) - assert_valid_benchmark_artifact_manifest( - manifest, - artifact_dir=artifact_root, - manifest_path=artifact_root / "manifest.json", - summary_section="policyengine_harness", - required_artifact_keys=( - "seed_data", - "synthetic_data", - "calibrated_data", - "targets", - "policyengine_harness", - ), - required_summary_keys=( - "candidate_mean_abs_relative_error", - "baseline_mean_abs_relative_error", - "mean_abs_relative_error_delta", - ), - ) - harness_path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - "policyengine_harness", - stage_id="09_validation_benchmarking", - ) - harness = json.loads(harness_path.read_text()) - summary = 
dict(harness.get("summary", {})) - tag_summaries = { - key: dict(value) - for key, value in dict(summary.get("tag_summaries", {})).items() - } - focus_tag = _select_focus_tag(tag_summaries) - focus_summary = tag_summaries.get(focus_tag, summary) - synthesis = dict(manifest.get("synthesis", {})) - calibration = dict(manifest.get("calibration", {})) - config = dict(manifest.get("config", {})) - data_flow_path = resolve_us_manifest_or_contract_artifact_path( - artifact_root, - manifest, - "data_flow_snapshot", - stage_id="08_dataset_assembly", - ) - data_flow_snapshot = require_saved_us_microplex_data_flow_snapshot(artifact_root) - - source_artifact = { - "artifactRef": _artifact_ref(artifact_root), - "manifestFile": "manifest.json", - "harnessFile": _artifact_path_for_manifest(artifact_root, harness_path), - "dataFlowFile": _artifact_path_for_manifest(artifact_root, data_flow_path), - "versionId": artifact_root.name, - } - if snapshot_path is not None: - source_artifact["artifactPath"] = _artifact_path_from_snapshot( - artifact_root, - Path(snapshot_path), - ) - - return { - "generatedAt": manifest.get("created_at"), - "sourceArtifact": source_artifact, - "currentRun": { - "id": artifact_root.name, - "benchmarkTag": focus_tag, - "nSynthetic": config.get("n_synthetic"), - "scaffoldSource": synthesis.get("scaffold_source"), - "candidateMeanAbsRelativeError": focus_summary.get( - "candidate_mean_abs_relative_error" - ), - "baselineMeanAbsRelativeError": focus_summary.get( - "baseline_mean_abs_relative_error" - ), - "meanAbsRelativeErrorDelta": focus_summary.get( - "mean_abs_relative_error_delta" - ), - "candidateCompositeParityLoss": focus_summary.get( - "candidate_composite_parity_loss" - ), - "baselineCompositeParityLoss": focus_summary.get( - "baseline_composite_parity_loss" - ), - "targetWinRate": focus_summary.get("target_win_rate"), - "sliceWinRate": focus_summary.get("slice_win_rate"), - "supportedTargetRate": focus_summary.get("supported_target_rate"), - 
"calibration": { - "loadedTargets": calibration.get("n_loaded_targets"), - "supportedTargets": calibration.get("n_supported_targets"), - "converged": calibration.get("converged"), - "weightCollapseSuspected": calibration.get( - "weight_collapse_suspected" - ), - "householdEffectiveSampleSize": _nested_metric( - calibration, - "household_weight_diagnostics", - "effective_sample_size", - ), - "personEffectiveSampleSize": _nested_metric( - calibration, - "person_weight_diagnostics", - "effective_sample_size", - ), - "householdTinyWeightShare": _nested_metric( - calibration, - "household_weight_diagnostics", - "tiny_share", - ), - "personTinyWeightShare": _nested_metric( - calibration, - "person_weight_diagnostics", - "tiny_share", - ), - }, - "supportProxies": dict( - synthesis.get( - "state_program_support_proxies", - {"available": [], "missing": []}, - ) - ), - "availableTags": list(tag_summaries.keys()), - }, - "summary": summary, - "tagSummaries": tag_summaries, - "parityScorecard": { - key: dict(value) - for key, value in dict(summary.get("parity_scorecard", {})).items() - }, - "attributeCellSummaries": { - key: dict(value) - for key, value in dict(summary.get("attribute_cell_summaries", {})).items() - }, - "dataFlow": data_flow_snapshot, - } - - -def write_us_microplex_site_snapshot( - artifact_dir: str | Path, - output_path: str | Path, -) -> Path: - """Write the canonical US site snapshot JSON for one saved artifact bundle.""" - artifact_root = Path(artifact_dir) - write_us_microplex_data_flow_snapshot( - artifact_root, - resolve_us_stage_artifact_contract_path( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ), - ) - snapshot = build_us_microplex_site_snapshot( - artifact_root, - snapshot_path=output_path, - ) - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(snapshot, indent=2, sort_keys=True)) - return destination - - -def _artifact_ref(artifact_root: Path) -> 
str: - for parent in artifact_root.parents: - if parent.name == "artifacts": - return str(artifact_root.relative_to(parent)) - return artifact_root.name - - -def _artifact_path_for_manifest(artifact_root: Path, path: Path) -> str: - try: - return str(path.relative_to(artifact_root)) - except ValueError: - return str(path) - - -def _artifact_path_from_snapshot(artifact_root: Path, snapshot_path: Path) -> str: - return os.path.relpath(artifact_root, snapshot_path.parent) - - -def _nested_metric( - payload: dict[str, Any], - section: str, - key: str, -) -> float | int | None: - section_payload = payload.get(section) - if not isinstance(section_payload, dict): - return None - return section_payload.get(key) - - -def _select_focus_tag(tag_summaries: dict[str, dict[str, Any]]) -> str: - for candidate in FOCUS_TAG_PRIORITY: - if candidate in tag_summaries: - return candidate - if tag_summaries: - return next(iter(tag_summaries)) - return "summary" diff --git a/src/microplex_us/pipelines/source_stage_parity.py b/src/microplex_us/pipelines/source_stage_parity.py deleted file mode 100644 index 11a99bff..00000000 --- a/src/microplex_us/pipelines/source_stage_parity.py +++ /dev/null @@ -1,1316 +0,0 @@ -"""Stage-matched raw source parity audits for CPS and PUF.""" - -from __future__ import annotations - -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, cast - -import h5py -import numpy as np -import pandas as pd -from microplex.core import ( - EntityType, - ObservationFrame, - SourceQuery, -) - -from microplex_us.data_sources.cps import CPSASECSourceProvider -from microplex_us.data_sources.puf import ( - PUF_UPRATING_MODE_INTERPOLATED, - SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - PUFSourceProvider, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - _build_group_household_map, - _decode_policyengine_array, - _infer_policyengine_array_entity, - _normalize_id_value, - 
_normalize_weight_value, - _policyengine_group_entity_type, - _resolve_policyengine_us_tax_benefit_system, - _resolve_prefixed_policyengine_table, - load_policyengine_us_entity_tables, -) - -_ENTITY_ORDER: tuple[EntityType, ...] = ( - EntityType.HOUSEHOLD, - EntityType.PERSON, - EntityType.TAX_UNIT, - EntityType.SPM_UNIT, - EntityType.FAMILY, -) -_ENTITY_ID_COLUMNS: dict[EntityType, str] = { - EntityType.HOUSEHOLD: "household_id", - EntityType.PERSON: "person_id", - EntityType.TAX_UNIT: "tax_unit_id", - EntityType.SPM_UNIT: "spm_unit_id", - EntityType.FAMILY: "family_id", -} -_ENTITY_BASE_COLUMNS: dict[EntityType, set[str]] = { - EntityType.HOUSEHOLD: {"household_id", "household_weight"}, - EntityType.PERSON: {"person_id", "household_id", "weight"}, - EntityType.TAX_UNIT: {"tax_unit_id", "household_id"}, - EntityType.SPM_UNIT: {"spm_unit_id", "household_id"}, - EntityType.FAMILY: {"family_id", "household_id"}, -} -_HOUSEHOLD_SIZE_BUCKETS: tuple[str, ...] = ("1", "2", "3", "4", "5", "6", "7+") - - -@dataclass(frozen=True) -class SourceStageParityVariableSpec: - """One semantic variable comparison between candidate and reference stages.""" - - label: str - candidate_variable: str - reference_variable: str | None = None - value_kind: str = "auto" - - @property - def resolved_reference_variable(self) -> str: - return self.reference_variable or self.candidate_variable - - -DEFAULT_CPS_SOURCE_STAGE_FOCUS_VARIABLES: tuple[SourceStageParityVariableSpec, ...] 
= ( - SourceStageParityVariableSpec("age", "age", value_kind="numeric"), - SourceStageParityVariableSpec("state_fips", "state_fips", value_kind="categorical"), - SourceStageParityVariableSpec("county_fips", "county_fips", value_kind="categorical"), - SourceStageParityVariableSpec("cps_race", "cps_race", value_kind="categorical"), - SourceStageParityVariableSpec("is_hispanic", "is_hispanic", value_kind="categorical"), - SourceStageParityVariableSpec("is_disabled", "is_disabled", value_kind="categorical"), - SourceStageParityVariableSpec("has_esi", "has_esi", value_kind="categorical"), - SourceStageParityVariableSpec( - "has_marketplace_health_coverage", - "has_marketplace_health_coverage", - value_kind="categorical", - ), - SourceStageParityVariableSpec( - "employment_income", - "wage_income", - "employment_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "self_employment_income", - "self_employment_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "taxable_interest_income", - "interest_income", - "taxable_interest_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec("rental_income", "rental_income", value_kind="numeric"), - SourceStageParityVariableSpec( - "medicare_part_b_premiums", - "medicare_part_b_premiums", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "other_medical_expenses", - "other_medical_expenses", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "over_the_counter_health_expenses", - "over_the_counter_health_expenses", - value_kind="numeric", - ), - SourceStageParityVariableSpec("receives_wic", "receives_wic", value_kind="categorical"), - SourceStageParityVariableSpec("is_separated", "is_separated", value_kind="categorical"), - SourceStageParityVariableSpec( - "is_surviving_spouse", - "is_surviving_spouse", - value_kind="categorical", - ), - SourceStageParityVariableSpec( - "social_security_retirement", - "social_security_retirement", - 
value_kind="numeric", - ), - SourceStageParityVariableSpec( - "social_security_disability", - "social_security_disability", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "social_security_survivors", - "social_security_survivors", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "social_security_dependents", - "social_security_dependents", - value_kind="numeric", - ), -) - -DEFAULT_PUF_SOURCE_STAGE_FOCUS_VARIABLES: tuple[SourceStageParityVariableSpec, ...] = ( - SourceStageParityVariableSpec("age", "age", value_kind="numeric"), - SourceStageParityVariableSpec("employment_income", "employment_income", value_kind="numeric"), - SourceStageParityVariableSpec( - "self_employment_income", - "self_employment_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "taxable_interest_income", - "taxable_interest_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "qualified_dividend_income", - "qualified_dividend_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "non_qualified_dividend_income", - "non_qualified_dividend_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "partnership_s_corp_income", - "partnership_s_corp_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec("farm_income", "farm_income", value_kind="numeric"), - SourceStageParityVariableSpec( - "farm_operations_income", - "farm_operations_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "farm_rent_income", - "farm_rent_income", - value_kind="numeric", - ), - SourceStageParityVariableSpec("rental_income", "rental_income", value_kind="numeric"), - SourceStageParityVariableSpec("filing_status", "filing_status", value_kind="categorical"), - SourceStageParityVariableSpec( - "health_savings_account_ald", - "health_savings_account_ald", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "self_employed_health_insurance_ald", - 
"self_employed_health_insurance_ald", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "self_employed_pension_contribution_ald", - "self_employed_pension_contribution_ald", - value_kind="numeric", - ), - SourceStageParityVariableSpec( - "pre_tax_contributions", - "pre_tax_contributions", - value_kind="numeric", - ), -) - - -def observation_frame_to_policyengine_entity_bundle( - frame: ObservationFrame, -) -> PolicyEngineUSEntityTableBundle: - """Project a provider observation frame into a PE-style entity bundle.""" - - households = _table_from_frame(frame, EntityType.HOUSEHOLD) - persons = _table_from_frame(frame, EntityType.PERSON) - if households is None or persons is None: - raise ValueError( - "Source-stage parity requires both household and person tables in the observation frame" - ) - persons = persons.copy() - households = households.copy() - for entity in _ENTITY_ORDER: - id_column = _ENTITY_ID_COLUMNS[entity] - table = households if entity is EntityType.HOUSEHOLD else persons - if id_column in table.columns: - table[id_column] = _stringify_id_series(table[id_column]) - if "household_id" in persons.columns: - persons["household_id"] = _stringify_id_series(persons["household_id"]) - if "household_id" in households.columns: - households["household_id"] = _stringify_id_series(households["household_id"]) - - tax_units = _group_table_from_persons(persons, "tax_unit_id") - spm_units = _group_table_from_persons(persons, "spm_unit_id") - families = _group_table_from_persons(persons, "family_id") - marital_units = _group_table_from_persons(persons, "marital_unit_id") - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=tax_units, - spm_units=spm_units, - families=families, - marital_units=marital_units, - ) - - -def build_us_source_stage_parity_audit( - candidate_bundle: PolicyEngineUSEntityTableBundle, - reference_dataset: str | Path, - *, - source_id: str, - period: int, - focus_variables: 
tuple[SourceStageParityVariableSpec, ...] - | list[SourceStageParityVariableSpec], - metadata: dict[str, Any] | None = None, -) -> dict[str, Any]: - """Compare one raw-source provider stage to a PE saved stage artifact.""" - - reference_path = Path(reference_dataset).expanduser().resolve() - reference_bundle = _load_reference_entity_bundle(reference_path, period=period) - focus_specs = tuple(focus_variables) - return { - "schemaVersion": 1, - "comparisonStage": "raw_source_provider", - "sourceId": source_id, - "period": int(period), - "candidate": { - "metadata": dict(metadata or {}), - }, - "reference": { - "datasetPath": str(reference_path), - }, - "schema": _build_schema_summary(candidate_bundle, reference_bundle), - "entityStructure": { - "candidate": _entity_structure_summary(candidate_bundle), - "reference": _entity_structure_summary(reference_bundle), - "deltas": _numeric_deltas( - _entity_structure_summary(candidate_bundle), - _entity_structure_summary(reference_bundle), - ), - }, - "householdSizeDistribution": { - "candidate": _weighted_household_size_distribution(candidate_bundle), - "reference": _weighted_household_size_distribution(reference_bundle), - "deltas": _distribution_deltas( - _weighted_household_size_distribution(candidate_bundle), - _weighted_household_size_distribution(reference_bundle), - ), - }, - "focusVariables": { - spec.label: _focus_variable_comparison( - candidate_bundle=candidate_bundle, - reference_bundle=reference_bundle, - spec=spec, - ) - for spec in focus_specs - }, - } - - -def write_us_source_stage_parity_audit( - candidate_bundle: PolicyEngineUSEntityTableBundle, - reference_dataset: str | Path, - output_path: str | Path, - *, - source_id: str, - period: int, - focus_variables: tuple[SourceStageParityVariableSpec, ...] 
- | list[SourceStageParityVariableSpec], - metadata: dict[str, Any] | None = None, -) -> Path: - """Persist one raw-source parity audit JSON.""" - - destination = Path(output_path).expanduser().resolve() - payload = build_us_source_stage_parity_audit( - candidate_bundle, - reference_dataset, - source_id=source_id, - period=period, - focus_variables=focus_variables, - metadata=metadata, - ) - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def build_us_cps_source_stage_parity_audit( - reference_dataset: str | Path, - *, - year: int = 2023, - cache_dir: str | Path | None = None, - download: bool = True, - sample_n: int | None = None, - random_seed: int = 0, - focus_variables: tuple[SourceStageParityVariableSpec, ...] - | list[SourceStageParityVariableSpec] = DEFAULT_CPS_SOURCE_STAGE_FOCUS_VARIABLES, -) -> dict[str, Any]: - """Run the raw CPS provider and compare it to a PE saved CPS artifact.""" - - provider = CPSASECSourceProvider( - year=year, - cache_dir=Path(cache_dir) if cache_dir is not None else None, - download=download, - ) - provider_filters: dict[str, Any] = { - "year": int(year), - "download": bool(download), - "random_seed": int(random_seed), - } - if cache_dir is not None: - provider_filters["cache_dir"] = str(Path(cache_dir).expanduser()) - if sample_n is not None: - provider_filters["sample_n"] = int(sample_n) - frame = provider.load_frame(SourceQuery(provider_filters=provider_filters)) - return build_us_source_stage_parity_audit( - observation_frame_to_policyengine_entity_bundle(frame), - reference_dataset, - source_id="cps_asec", - period=year, - focus_variables=focus_variables, - metadata={ - "candidateSourceName": frame.source.name, - "providerFilters": provider_filters, - }, - ) - - -def write_us_cps_source_stage_parity_audit( - reference_dataset: str | Path, - output_path: str | Path, - *, - year: int = 2023, - cache_dir: str | Path | 
None = None, - download: bool = True, - sample_n: int | None = None, - random_seed: int = 0, - focus_variables: tuple[SourceStageParityVariableSpec, ...] - | list[SourceStageParityVariableSpec] = DEFAULT_CPS_SOURCE_STAGE_FOCUS_VARIABLES, -) -> Path: - """Persist one raw CPS source-stage parity audit JSON.""" - - payload = build_us_cps_source_stage_parity_audit( - reference_dataset, - year=year, - cache_dir=cache_dir, - download=download, - sample_n=sample_n, - random_seed=random_seed, - focus_variables=focus_variables, - ) - destination = Path(output_path).expanduser().resolve() - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def build_us_puf_source_stage_parity_audit( - reference_dataset: str | Path, - *, - target_year: int = 2024, - cache_dir: str | Path | None = None, - puf_path: str | Path | None = None, - demographics_path: str | Path | None = None, - sample_n: int | None = None, - random_seed: int = 0, - uprating_mode: str = PUF_UPRATING_MODE_INTERPOLATED, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - impute_pre_tax_contributions: bool = False, - pre_tax_training_year: int = 2024, - require_pre_tax_contribution_model: bool = False, - social_security_split_strategy: str = SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - focus_variables: tuple[SourceStageParityVariableSpec, ...] 
- | list[SourceStageParityVariableSpec] = DEFAULT_PUF_SOURCE_STAGE_FOCUS_VARIABLES, -) -> dict[str, Any]: - """Run the raw PUF provider and compare it to a PE saved PUF artifact.""" - - provider = PUFSourceProvider( - target_year=target_year, - cache_dir=Path(cache_dir) if cache_dir is not None else None, - puf_path=puf_path, - demographics_path=demographics_path, - uprating_mode=uprating_mode, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - impute_pre_tax_contributions=impute_pre_tax_contributions, - pre_tax_training_year=pre_tax_training_year, - require_pre_tax_contribution_model=require_pre_tax_contribution_model, - social_security_split_strategy=social_security_split_strategy, - ) - provider_filters: dict[str, Any] = { - "target_year": int(target_year), - "random_seed": int(random_seed), - "uprating_mode": uprating_mode, - "social_security_split_strategy": social_security_split_strategy, - "impute_pre_tax_contributions": bool(impute_pre_tax_contributions), - "pre_tax_training_year": int(pre_tax_training_year), - "require_pre_tax_contribution_model": bool(require_pre_tax_contribution_model), - } - for key, value in ( - ("cache_dir", cache_dir), - ("puf_path", puf_path), - ("demographics_path", demographics_path), - ("policyengine_us_data_repo", policyengine_us_data_repo), - ("policyengine_us_data_python", policyengine_us_data_python), - ): - if value is not None: - provider_filters[key] = str(Path(value).expanduser()) - if sample_n is not None: - provider_filters["sample_n"] = int(sample_n) - frame = provider.load_frame(SourceQuery(provider_filters=provider_filters)) - return build_us_source_stage_parity_audit( - observation_frame_to_policyengine_entity_bundle(frame), - reference_dataset, - source_id="irs_soi_puf", - period=target_year, - focus_variables=focus_variables, - metadata={ - "candidateSourceName": frame.source.name, - "providerFilters": provider_filters, - }, - ) - - -def 
write_us_puf_source_stage_parity_audit( - reference_dataset: str | Path, - output_path: str | Path, - *, - target_year: int = 2024, - cache_dir: str | Path | None = None, - puf_path: str | Path | None = None, - demographics_path: str | Path | None = None, - sample_n: int | None = None, - random_seed: int = 0, - uprating_mode: str = PUF_UPRATING_MODE_INTERPOLATED, - policyengine_us_data_repo: str | Path | None = None, - policyengine_us_data_python: str | Path | None = None, - impute_pre_tax_contributions: bool = False, - pre_tax_training_year: int = 2024, - require_pre_tax_contribution_model: bool = False, - social_security_split_strategy: str = SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - focus_variables: tuple[SourceStageParityVariableSpec, ...] - | list[SourceStageParityVariableSpec] = DEFAULT_PUF_SOURCE_STAGE_FOCUS_VARIABLES, -) -> Path: - """Persist one raw PUF source-stage parity audit JSON.""" - - payload = build_us_puf_source_stage_parity_audit( - reference_dataset, - target_year=target_year, - cache_dir=cache_dir, - puf_path=puf_path, - demographics_path=demographics_path, - sample_n=sample_n, - random_seed=random_seed, - uprating_mode=uprating_mode, - policyengine_us_data_repo=policyengine_us_data_repo, - policyengine_us_data_python=policyengine_us_data_python, - impute_pre_tax_contributions=impute_pre_tax_contributions, - pre_tax_training_year=pre_tax_training_year, - require_pre_tax_contribution_model=require_pre_tax_contribution_model, - social_security_split_strategy=social_security_split_strategy, - focus_variables=focus_variables, - ) - destination = Path(output_path).expanduser().resolve() - destination.parent.mkdir(parents=True, exist_ok=True) - destination.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - return destination - - -def _table_from_frame( - frame: ObservationFrame, - entity: EntityType, -) -> pd.DataFrame | None: - table = frame.tables.get(entity) - if table is None: - return None - if isinstance(table, 
pd.DataFrame): - return table - to_pandas = getattr(table, "to_pandas", None) - if callable(to_pandas): - return cast(pd.DataFrame, to_pandas()) - raise TypeError(f"Unsupported table type for entity '{entity.value}'") - - -def _load_reference_entity_bundle( - reference_dataset: str | Path, - *, - period: int, -) -> PolicyEngineUSEntityTableBundle: - reference_path = Path(reference_dataset).expanduser().resolve() - if _is_flat_policyengine_h5(reference_path): - return _load_flat_policyengine_us_entity_tables(reference_path) - return load_policyengine_us_entity_tables(reference_path, period=period) - - -def _is_flat_policyengine_h5(path: Path) -> bool: - with h5py.File(path, "r") as handle: - for value in handle.values(): - return isinstance(value, h5py.Dataset) - return False - - -def _load_flat_policyengine_us_entity_tables( - dataset: str | Path, -) -> PolicyEngineUSEntityTableBundle: - source = Path(dataset).expanduser().resolve() - with h5py.File(source, "r") as handle: - arrays = { - variable: np.asarray(values) - for variable, values in handle.items() - if isinstance(values, h5py.Dataset) - } - - required_structural = { - "household_id", - "person_id", - "person_household_id", - } - missing = sorted(required_structural - set(arrays)) - if missing: - raise ValueError( - "PolicyEngine flat dataset is missing required structural arrays: " - + ", ".join(missing) - ) - - households = pd.DataFrame( - {"household_id": _normalize_id_value(arrays["household_id"])} - ) - household_weight = arrays.get("household_weight") - households["household_weight"] = ( - _normalize_weight_value(household_weight) - if household_weight is not None - else np.ones(len(households), dtype=float) - ) - - persons = pd.DataFrame( - { - "person_id": _normalize_id_value(arrays["person_id"]), - "household_id": _normalize_id_value(arrays["person_household_id"]), - } - ) - if "person_weight" in arrays: - persons["weight"] = _normalize_weight_value(arrays["person_weight"]) - - group_specs = ( - 
("tax_unit", "tax_unit_id", "person_tax_unit_id"), - ("spm_unit", "spm_unit_id", "person_spm_unit_id"), - ("family", "family_id", "person_family_id"), - ("marital_unit", "marital_unit_id", "person_marital_unit_id"), - ) - group_tables: dict[str, pd.DataFrame | None] = {} - entity_lengths = { - EntityType.HOUSEHOLD: len(households), - EntityType.PERSON: len(persons), - } - excluded_variable_names = { - "household_id", - "household_weight", - "person_id", - "person_household_id", - "person_weight", - } - for group_name, id_column, membership_column in group_specs: - group_ids = arrays.get(id_column) - membership = arrays.get(membership_column) - if membership is not None: - persons[id_column] = _normalize_id_value(membership) - if group_ids is None: - group_tables[group_name] = None - continue - group_table = pd.DataFrame({id_column: _normalize_id_value(group_ids)}) - if membership is not None: - group_table["household_id"] = group_table[id_column].map( - _build_group_household_map( - group_name=group_name, - group_ids=pd.Series(_normalize_id_value(membership)), - household_ids=persons["household_id"], - ) - ) - group_tables[group_name] = group_table - entity_type = _policyengine_group_entity_type(group_name) - if entity_type is not None: - entity_lengths[entity_type] = len(group_table) - excluded_variable_names.add(id_column) - excluded_variable_names.add(membership_column) - - group_entity_to_table = { - EntityType.TAX_UNIT: group_tables["tax_unit"], - EntityType.SPM_UNIT: group_tables["spm_unit"], - EntityType.FAMILY: group_tables["family"], - } - try: - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system( - simulation_cls=None - ) - except (ImportError, ValueError): - tax_benefit_system = None - for variable_name, values in arrays.items(): - if variable_name in excluded_variable_names: - continue - decoded = _decode_policyengine_array(values) - prefixed_table = _resolve_prefixed_policyengine_table( - variable_name=variable_name, - 
households=households, - persons=persons, - group_tables=group_tables, - ) - if prefixed_table is not None: - prefixed_table[variable_name] = decoded - continue - try: - entity = _infer_policyengine_array_entity( - variable_name=variable_name, - values=values, - entity_lengths=entity_lengths, - tax_benefit_system=tax_benefit_system, - ) - except ValueError: - continue - if entity is EntityType.HOUSEHOLD: - households[variable_name] = decoded - continue - if entity is EntityType.PERSON: - persons[variable_name] = decoded - continue - group_table = group_entity_to_table.get(entity) - if group_table is None: - raise ValueError( - f"Loaded variable '{variable_name}' for entity '{entity.value}' " - "but no structural table exists for that entity" - ) - group_table[variable_name] = decoded - - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=group_tables["tax_unit"], - spm_units=group_tables["spm_unit"], - families=group_tables["family"], - marital_units=group_tables["marital_unit"], - ) - - -def _group_table_from_persons( - persons: pd.DataFrame, - id_column: str, -) -> pd.DataFrame | None: - if id_column not in persons.columns: - return None - grouped = persons.dropna(subset=[id_column]).copy() - if grouped.empty: - return None - grouped[id_column] = _stringify_id_series(grouped[id_column]) - return ( - grouped.groupby(id_column, observed=True)["household_id"] - .first() - .reset_index() - .rename(columns={id_column: id_column}) - ) - - -def _stringify_id_series(values: pd.Series) -> pd.Series: - return values.astype(str) - - -def _build_schema_summary( - candidate_bundle: PolicyEngineUSEntityTableBundle, - reference_bundle: PolicyEngineUSEntityTableBundle, -) -> dict[str, Any]: - entities: dict[str, Any] = {} - for entity in _ENTITY_ORDER: - candidate_table = _bundle_table(candidate_bundle, entity) - reference_table = _bundle_table(reference_bundle, entity) - entities[entity.value] = _entity_schema_summary( - 
entity=entity, - candidate_table=candidate_table, - reference_table=reference_table, - ) - return {"entities": entities} - - -def _entity_schema_summary( - *, - entity: EntityType, - candidate_table: pd.DataFrame | None, - reference_table: pd.DataFrame | None, -) -> dict[str, Any]: - candidate_variables = ( - sorted(_variable_columns(candidate_table, entity)) - if candidate_table is not None - else [] - ) - reference_variables = ( - sorted(_variable_columns(reference_table, entity)) - if reference_table is not None - else [] - ) - common = sorted(set(candidate_variables) & set(reference_variables)) - missing = sorted(set(reference_variables) - set(candidate_variables)) - extra = sorted(set(candidate_variables) - set(reference_variables)) - return { - "candidate_rows": int(len(candidate_table)) if candidate_table is not None else 0, - "reference_rows": int(len(reference_table)) if reference_table is not None else 0, - "candidate_variable_count": len(candidate_variables), - "reference_variable_count": len(reference_variables), - "common_variable_count": len(common), - "missing_in_candidate_count": len(missing), - "extra_in_candidate_count": len(extra), - "missing_in_candidate": missing, - "extra_in_candidate": extra, - } - - -def _entity_structure_summary(bundle: PolicyEngineUSEntityTableBundle) -> dict[str, Any]: - households = bundle.households - persons = bundle.persons - tax_units = bundle.tax_units - summary: dict[str, Any] = { - "household_rows": int(len(households)), - "person_rows": int(len(persons)) if persons is not None else 0, - "tax_unit_rows": int(len(tax_units)) if tax_units is not None else 0, - } - if persons is None or persons.empty: - return summary - - household_sizes = persons.groupby("household_id", observed=True).size() - household_weights = _household_weight_map(bundle) - aligned_household_weights = ( - _stringify_id_series(household_sizes.index.to_series()) - .map(household_weights) - .fillna(0.0) - ) - summary["mean_household_size"] = 
float(household_sizes.mean()) - summary["weighted_mean_household_size"] = _weighted_mean( - household_sizes.astype(float).to_numpy(), - aligned_household_weights.to_numpy(dtype=float), - ) - summary["share_multi_person_households"] = float((household_sizes >= 2).mean()) - summary["weighted_share_multi_person_households"] = _weighted_mean( - (household_sizes >= 2).astype(float).to_numpy(), - aligned_household_weights.to_numpy(dtype=float), - ) - - if tax_units is not None and not tax_units.empty: - tax_units_per_household = tax_units.groupby("household_id", observed=True).size() - tax_unit_weights = ( - _stringify_id_series(tax_units_per_household.index.to_series()) - .map(household_weights) - .fillna(0.0) - ) - summary["mean_tax_units_per_household"] = float(tax_units_per_household.mean()) - summary["weighted_mean_tax_units_per_household"] = _weighted_mean( - tax_units_per_household.astype(float).to_numpy(), - tax_unit_weights.to_numpy(dtype=float), - ) - - if "tax_unit_id" in persons.columns: - person_tax_units = persons.dropna(subset=["tax_unit_id"]).copy() - if not person_tax_units.empty: - tax_unit_sizes = person_tax_units.groupby("tax_unit_id", observed=True).size() - tax_unit_household_ids = ( - person_tax_units.groupby("tax_unit_id", observed=True)["household_id"].first() - ) - tax_unit_weights = ( - _stringify_id_series(tax_unit_household_ids) - .map(household_weights) - .fillna(0.0) - ) - summary["mean_tax_unit_size"] = float(tax_unit_sizes.mean()) - summary["weighted_mean_tax_unit_size"] = _weighted_mean( - tax_unit_sizes.astype(float).to_numpy(), - tax_unit_weights.to_numpy(dtype=float), - ) - summary["share_multi_person_tax_units"] = float((tax_unit_sizes >= 2).mean()) - summary["weighted_share_multi_person_tax_units"] = _weighted_mean( - (tax_unit_sizes >= 2).astype(float).to_numpy(), - tax_unit_weights.to_numpy(dtype=float), - ) - - return summary - - -def _weighted_household_size_distribution( - bundle: PolicyEngineUSEntityTableBundle, -) -> 
dict[str, Any]: - persons = bundle.persons - if persons is None or persons.empty: - return {"shares": {}, "weighted_mean_household_size": 0.0} - household_sizes = persons.groupby("household_id", observed=True).size() - household_weights = ( - _stringify_id_series(household_sizes.index.to_series()) - .map(_household_weight_map(bundle)) - .fillna(0.0) - ) - bucketed = household_sizes.apply(_household_size_bucket) - totals = pd.DataFrame({"bucket": bucketed, "weight": household_weights}).groupby( - "bucket", - observed=True, - )["weight"].sum() - weight_sum = float(household_weights.sum()) - shares = { - bucket: _safe_ratio(float(totals.get(bucket, 0.0)), weight_sum) - for bucket in _HOUSEHOLD_SIZE_BUCKETS - } - return { - "shares": shares, - "weighted_mean_household_size": _weighted_mean( - household_sizes.astype(float).to_numpy(), - household_weights.to_numpy(dtype=float), - ), - } - - -def _focus_variable_comparison( - *, - candidate_bundle: PolicyEngineUSEntityTableBundle, - reference_bundle: PolicyEngineUSEntityTableBundle, - spec: SourceStageParityVariableSpec, -) -> dict[str, Any]: - reference_entry = _resolve_bundle_variable( - reference_bundle, - spec.resolved_reference_variable, - ) - candidate_entry = _resolve_bundle_variable( - candidate_bundle, - spec.candidate_variable, - preferred_entity=reference_entry["entity"] if reference_entry is not None else None, - ) - payload: dict[str, Any] = { - "candidate_variable": spec.candidate_variable, - "reference_variable": spec.resolved_reference_variable, - "candidate_present": candidate_entry is not None, - "reference_present": reference_entry is not None, - } - if candidate_entry is not None: - payload["candidate_entity"] = candidate_entry["entity"].value - payload["candidate"] = _summarize_series( - candidate_entry["series"], - weights=candidate_entry["weights"], - value_kind=spec.value_kind, - ) - if reference_entry is not None: - payload["reference_entity"] = reference_entry["entity"].value - 
payload["reference"] = _summarize_series( - reference_entry["series"], - weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - if candidate_entry is not None and reference_entry is not None: - payload["comparison"] = _compare_series( - candidate_entry["series"], - reference_entry["series"], - candidate_weights=candidate_entry["weights"], - reference_weights=reference_entry["weights"], - value_kind=spec.value_kind, - ) - return payload - - -def _resolve_bundle_variable( - bundle: PolicyEngineUSEntityTableBundle, - variable: str, - *, - preferred_entity: EntityType | None = None, -) -> dict[str, Any] | None: - search_order = ( - (preferred_entity,) + tuple(entity for entity in _ENTITY_ORDER if entity is not preferred_entity) - if preferred_entity is not None - else _ENTITY_ORDER - ) - for entity in search_order: - table = _bundle_table(bundle, entity) - if table is None or variable not in table.columns: - continue - return { - "entity": entity, - "series": table[variable], - "weights": _entity_weights(bundle, entity), - } - return None - - -def _bundle_table( - bundle: PolicyEngineUSEntityTableBundle, - entity: EntityType, -) -> pd.DataFrame | None: - if entity is EntityType.HOUSEHOLD: - return bundle.households - if entity is EntityType.PERSON: - return bundle.persons - if entity is EntityType.TAX_UNIT: - return bundle.tax_units - if entity is EntityType.SPM_UNIT: - return bundle.spm_units - if entity is EntityType.FAMILY: - return bundle.families - return None - - -def _variable_columns(table: pd.DataFrame | None, entity: EntityType) -> set[str]: - if table is None: - return set() - excluded = _ENTITY_BASE_COLUMNS.get(entity, set()) - return { - column - for column in table.columns - if column not in excluded and not column.endswith("_id") - } - - -def _entity_weights( - bundle: PolicyEngineUSEntityTableBundle, - entity: EntityType, -) -> pd.Series: - table = _bundle_table(bundle, entity) - if table is None: - return pd.Series(dtype=float) - if 
entity is EntityType.HOUSEHOLD: - if "household_weight" in table.columns: - return pd.to_numeric(table["household_weight"], errors="coerce").fillna(0.0) - return pd.Series(np.ones(len(table), dtype=float), index=table.index) - if entity is EntityType.PERSON and "weight" in table.columns: - return pd.to_numeric(table["weight"], errors="coerce").fillna(0.0) - if "household_id" in table.columns: - household_weights = _household_weight_map(bundle) - return _stringify_id_series(table["household_id"]).map(household_weights).fillna(0.0) - return pd.Series(np.ones(len(table), dtype=float), index=table.index) - - -def _household_weight_map(bundle: PolicyEngineUSEntityTableBundle) -> pd.Series: - households = bundle.households.copy() - households["household_id"] = _stringify_id_series(households["household_id"]) - if "household_weight" in households.columns: - weights = pd.to_numeric(households["household_weight"], errors="coerce").fillna(0.0) - else: - weights = pd.Series(np.ones(len(households), dtype=float), index=households.index) - return pd.Series(weights.to_numpy(dtype=float), index=households["household_id"]) - - -def _summarize_series( - values: pd.Series, - *, - weights: pd.Series, - value_kind: str = "auto", -) -> dict[str, Any]: - series = values.reset_index(drop=True) - weight_series = pd.to_numeric(weights, errors="coerce").fillna(0.0).reset_index(drop=True) - if len(weight_series) != len(series): - weight_series = pd.Series(np.ones(len(series), dtype=float)) - resolved_value_kind = _resolve_value_kind(series, value_kind) - nonnull = series.notna() - total_weight = float(weight_series.sum()) - nonnull_weight = float(weight_series[nonnull].sum()) - if resolved_value_kind == "categorical": - return _summarize_categorical(series.astype("string"), weight_series, total_weight, nonnull_weight) - numeric = pd.to_numeric(series, errors="coerce") - if numeric.notna().sum() == 0: - return _summarize_categorical(series.astype("string"), weight_series, total_weight, 
nonnull_weight) - if resolved_value_kind == "auto": - unique_count = int(numeric.dropna().nunique()) - if unique_count <= 64 and pd.api.types.is_integer_dtype(numeric.dropna()): - return _summarize_categorical( - numeric.round().astype("Int64").astype("string"), - weight_series, - total_weight, - nonnull_weight, - ) - if resolved_value_kind != "numeric" and ( - pd.api.types.is_bool_dtype(series) - or pd.api.types.is_object_dtype(series) - or pd.api.types.is_string_dtype(series) - ): - return _summarize_categorical( - numeric.round().astype("Int64").astype("string"), - weight_series, - total_weight, - nonnull_weight, - ) - numeric_values = numeric.dropna().astype(float) - numeric_weights = weight_series[numeric.notna()].astype(float) - return { - "kind": "numeric", - "n": int(len(series)), - "nonnull_share": _safe_ratio(int(numeric.notna().sum()), len(series)), - "weighted_nonnull_share": _safe_ratio(nonnull_weight, total_weight), - "zero_share": float((numeric_values == 0.0).mean()) if not numeric_values.empty else 0.0, - "weighted_zero_share": _weighted_mean( - (numeric_values == 0.0).astype(float).to_numpy(), - numeric_weights.to_numpy(dtype=float), - ), - "positive_share": float((numeric_values > 0.0).mean()) if not numeric_values.empty else 0.0, - "weighted_positive_share": _weighted_mean( - (numeric_values > 0.0).astype(float).to_numpy(), - numeric_weights.to_numpy(dtype=float), - ), - "negative_share": float((numeric_values < 0.0).mean()) if not numeric_values.empty else 0.0, - "weighted_negative_share": _weighted_mean( - (numeric_values < 0.0).astype(float).to_numpy(), - numeric_weights.to_numpy(dtype=float), - ), - "mean": float(numeric_values.mean()) if not numeric_values.empty else 0.0, - "weighted_mean": _weighted_mean( - numeric_values.to_numpy(dtype=float), - numeric_weights.to_numpy(dtype=float), - ), - "sum": float(numeric_values.sum()) if not numeric_values.empty else 0.0, - "weighted_sum": float((numeric_values * numeric_weights).sum()), - } - - 
-def _summarize_categorical( - values: pd.Series, - weights: pd.Series, - total_weight: float, - nonnull_weight: float, -) -> dict[str, Any]: - normalized_values = _normalize_categorical_series(values) - normalized = normalized_values.dropna() - if normalized.empty: - return { - "kind": "categorical", - "n": int(len(values)), - "nonnull_share": 0.0, - "weighted_nonnull_share": _safe_ratio(nonnull_weight, total_weight), - "unique_count": 0, - "top_values": [], - } - aligned_weights = weights[normalized_values.notna()].astype(float) - grouped = ( - pd.DataFrame({"value": normalized.astype(str), "weight": aligned_weights.to_numpy(dtype=float)}) - .groupby("value", observed=True)["weight"] - .sum() - .sort_values(ascending=False) - ) - return { - "kind": "categorical", - "n": int(len(values)), - "nonnull_share": _safe_ratio(int(normalized.notna().sum()), len(values)), - "weighted_nonnull_share": _safe_ratio(nonnull_weight, total_weight), - "unique_count": int(normalized.nunique(dropna=True)), - "top_values": [ - { - "value": str(index), - "weighted_sum": float(weight), - "weighted_share": _safe_ratio(float(weight), nonnull_weight), - } - for index, weight in grouped.head(10).items() - ], - } - - -def _compare_series( - candidate: pd.Series, - reference: pd.Series, - *, - candidate_weights: pd.Series, - reference_weights: pd.Series, - value_kind: str = "auto", -) -> dict[str, Any]: - candidate_summary = _summarize_series( - candidate, - weights=candidate_weights, - value_kind=value_kind, - ) - reference_summary = _summarize_series( - reference, - weights=reference_weights, - value_kind=value_kind, - ) - if candidate_summary["kind"] != reference_summary["kind"]: - return { - "type": "mismatched", - "candidate_kind": candidate_summary["kind"], - "reference_kind": reference_summary["kind"], - } - if candidate_summary["kind"] == "categorical": - candidate_support = _categorical_support(candidate) - reference_support = _categorical_support(reference) - missing = 
sorted(reference_support - candidate_support) - return { - "type": "categorical", - "support_recall": _safe_ratio( - len(candidate_support & reference_support), - len(reference_support), - ), - "support_precision": _safe_ratio( - len(candidate_support & reference_support), - len(candidate_support), - ), - "missing_reference_values": missing[:20], - } - return { - "type": "numeric", - "weighted_mean_ratio": _safe_ratio( - candidate_summary["weighted_mean"], - reference_summary["weighted_mean"], - ), - "weighted_sum_ratio": _safe_ratio( - candidate_summary["weighted_sum"], - reference_summary["weighted_sum"], - ), - "weighted_positive_share_ratio": _safe_ratio( - candidate_summary["weighted_positive_share"], - reference_summary["weighted_positive_share"], - ), - "weighted_nonnull_share_delta": float( - candidate_summary["weighted_nonnull_share"] - - reference_summary["weighted_nonnull_share"] - ), - } - - -def _categorical_support(values: pd.Series) -> set[str]: - normalized = _normalize_categorical_series(values).dropna().astype(str) - return set(normalized.tolist()) - - -def _normalize_categorical_series(values: pd.Series) -> pd.Series: - normalized = values.astype("string").replace({"": pd.NA, "nan": pd.NA}) - lowered = normalized.str.strip().str.lower() - nonnull = lowered.dropna() - truthy = {"1", "1.0", "true", "t", "yes", "y"} - falsy = {"0", "0.0", "false", "f", "no", "n"} - bool_tokens = truthy | falsy - if not nonnull.empty and set(nonnull.tolist()) <= bool_tokens: - mapped = normalized.copy() - mapped.loc[lowered.isin(truthy)] = "True" - mapped.loc[lowered.isin(falsy)] = "False" - return mapped - return normalized - - -def _resolve_value_kind(values: pd.Series, value_kind: str) -> str: - if value_kind in {"numeric", "categorical"}: - return value_kind - if value_kind != "auto": - raise ValueError( - "Source-stage parity value_kind must be one of: auto, numeric, categorical" - ) - if ( - pd.api.types.is_bool_dtype(values) - or 
pd.api.types.is_object_dtype(values) - or pd.api.types.is_string_dtype(values) - ): - return "categorical" - return "auto" - - -def _weighted_mean(values: np.ndarray, weights: np.ndarray) -> float: - if len(values) == 0: - return 0.0 - weight_sum = float(np.asarray(weights, dtype=float).sum()) - if weight_sum <= 0.0: - return float(np.asarray(values, dtype=float).mean()) - return float(np.average(np.asarray(values, dtype=float), weights=np.asarray(weights, dtype=float))) - - -def _numeric_deltas(candidate: dict[str, Any], reference: dict[str, Any]) -> dict[str, float]: - deltas: dict[str, float] = {} - for key, candidate_value in candidate.items(): - reference_value = reference.get(key) - if isinstance(candidate_value, (int, float)) and isinstance(reference_value, (int, float)): - deltas[f"{key}_delta"] = float(candidate_value) - float(reference_value) - return deltas - - -def _distribution_deltas(candidate: dict[str, Any], reference: dict[str, Any]) -> dict[str, float]: - candidate_shares = dict(candidate.get("shares", {})) - reference_shares = dict(reference.get("shares", {})) - deltas = { - f"share_{bucket}_delta": float(candidate_shares.get(bucket, 0.0)) - - float(reference_shares.get(bucket, 0.0)) - for bucket in _HOUSEHOLD_SIZE_BUCKETS - } - deltas["weighted_mean_household_size_delta"] = float( - candidate.get("weighted_mean_household_size", 0.0) - ) - float(reference.get("weighted_mean_household_size", 0.0)) - return deltas - - -def _household_size_bucket(size: int) -> str: - return str(size) if size <= 6 else "7+" - - -def _safe_ratio(numerator: int | float, denominator: int | float) -> float: - if not denominator: - return 0.0 - return float(numerator) / float(denominator) - - -def _build_cli_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers(dest="source", required=True) - - cps_parser = subparsers.add_parser("cps", help="Audit raw CPS provider output") - 
cps_parser.add_argument("reference_dataset") - cps_parser.add_argument("output_path") - cps_parser.add_argument("--year", type=int, default=2023) - cps_parser.add_argument("--cache-dir") - cps_parser.add_argument("--sample-n", type=int) - cps_parser.add_argument("--random-seed", type=int, default=0) - cps_parser.add_argument("--download", action=argparse.BooleanOptionalAction, default=True) - - puf_parser = subparsers.add_parser("puf", help="Audit raw PUF provider output") - puf_parser.add_argument("reference_dataset") - puf_parser.add_argument("output_path") - puf_parser.add_argument("--target-year", type=int, default=2024) - puf_parser.add_argument("--cache-dir") - puf_parser.add_argument("--puf-path") - puf_parser.add_argument("--demographics-path") - puf_parser.add_argument("--sample-n", type=int) - puf_parser.add_argument("--random-seed", type=int, default=0) - puf_parser.add_argument("--uprating-mode", default=PUF_UPRATING_MODE_INTERPOLATED) - puf_parser.add_argument("--policyengine-us-data-repo") - puf_parser.add_argument("--policyengine-us-data-python") - puf_parser.add_argument( - "--impute-pre-tax-contributions", - action=argparse.BooleanOptionalAction, - default=False, - ) - puf_parser.add_argument("--pre-tax-training-year", type=int, default=2024) - puf_parser.add_argument( - "--require-pre-tax-contribution-model", - action=argparse.BooleanOptionalAction, - default=False, - ) - puf_parser.add_argument( - "--social-security-split-strategy", - default=SOCIAL_SECURITY_SPLIT_STRATEGY_GROUPED_SHARE, - ) - return parser - - -def main() -> None: - parser = _build_cli_parser() - args = parser.parse_args() - if args.source == "cps": - output = write_us_cps_source_stage_parity_audit( - args.reference_dataset, - args.output_path, - year=args.year, - cache_dir=args.cache_dir, - download=args.download, - sample_n=args.sample_n, - random_seed=args.random_seed, - ) - else: - output = write_us_puf_source_stage_parity_audit( - args.reference_dataset, - args.output_path, 
- target_year=args.target_year, - cache_dir=args.cache_dir, - puf_path=args.puf_path, - demographics_path=args.demographics_path, - sample_n=args.sample_n, - random_seed=args.random_seed, - uprating_mode=args.uprating_mode, - policyengine_us_data_repo=args.policyengine_us_data_repo, - policyengine_us_data_python=args.policyengine_us_data_python, - impute_pre_tax_contributions=args.impute_pre_tax_contributions, - pre_tax_training_year=args.pre_tax_training_year, - require_pre_tax_contribution_model=args.require_pre_tax_contribution_model, - social_security_split_strategy=args.social_security_split_strategy, - ) - print(output) - - -if __name__ == "__main__": - main() diff --git a/src/microplex_us/pipelines/stage9_replay.py b/src/microplex_us/pipelines/stage9_replay.py deleted file mode 100644 index 65ac675e..00000000 --- a/src/microplex_us/pipelines/stage9_replay.py +++ /dev/null @@ -1,279 +0,0 @@ -"""Safe Stage 9 validation and benchmarking replay helpers.""" - -from __future__ import annotations - -import argparse -import json -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.pe_native_scores import compute_us_pe_native_scores -from microplex_us.pipelines.stage_manifest_io import write_json_atomically -from microplex_us.pipelines.stage_validation_evidence import ( - build_us_validation_evidence_manifest, -) - - -@dataclass(frozen=True) -class USStage9ReplayResult: - """Artifacts written by a Stage 9 replay.""" - - output_dir: Path - replay_manifest: Path - validation_evidence: Path - policyengine_harness: Path | None = None - policyengine_native_scores: Path | None = None - - -def replay_us_stage9_validation_benchmarking( - artifact_dir: str | Path, - *, - output_dir: str | Path | None = None, - baseline_dataset: str | Path | None = None, - policyengine_us_data_repo: str | Path | None = None, - period: int | None = None, - precomputed_policyengine_harness: str | 
Path | dict[str, Any] | None = None, - precomputed_policyengine_native_scores: str | Path | dict[str, Any] | None = None, - run_id: str | None = None, - allow_overwrite: bool = False, -) -> USStage9ReplayResult: - """Rerun safe Stage 9 evidence against an existing Stage 8 dataset. - - The original artifact bundle is left untouched. New evidence is written under - a replay directory and indexed by a replay-local evidence manifest. - """ - - artifact_root = Path(artifact_dir).expanduser().resolve() - manifest_path = artifact_root / "manifest.json" - if not manifest_path.exists(): - raise FileNotFoundError(f"Saved artifact manifest not found: {manifest_path}") - manifest = json.loads(manifest_path.read_text()) - dataset_path = _validated_stage8_dataset_path(artifact_root, manifest) - - resolved_output_dir = _resolve_replay_output_dir( - artifact_root, - output_dir=output_dir, - run_id=run_id, - ) - if resolved_output_dir.exists() and any(resolved_output_dir.iterdir()): - if not allow_overwrite: - raise FileExistsError( - f"Stage 9 replay output directory already exists and is not empty: " - f"{resolved_output_dir}" - ) - resolved_output_dir.mkdir(parents=True, exist_ok=True) - - replay_manifest_payload = dict(manifest) - replay_artifacts = dict(manifest.get("artifacts", {})) - summaries: dict[str, Any] = {} - - harness_path = None - harness_payload = _load_optional_payload(precomputed_policyengine_harness) - if harness_payload is not None: - harness_path = resolved_output_dir / "policyengine_harness.json" - write_json_atomically(harness_path, harness_payload) - replay_artifacts["policyengine_harness"] = _relative_to_root( - harness_path, - artifact_root, - ) - if isinstance(harness_payload.get("summary"), dict): - summaries["policyengine_harness"] = dict(harness_payload["summary"]) - - native_scores_path = None - native_scores_payload = _load_optional_payload( - precomputed_policyengine_native_scores - ) - if native_scores_payload is None and baseline_dataset is not 
None: - native_scores_payload = compute_us_pe_native_scores( - candidate_dataset_path=dataset_path, - baseline_dataset_path=baseline_dataset, - period=period - or int( - dict(manifest.get("config", {})).get( - "policyengine_dataset_year", - 2024, - ) - ), - policyengine_us_data_repo=policyengine_us_data_repo, - ) - if native_scores_payload is not None: - native_scores_path = resolved_output_dir / "policyengine_native_scores.json" - write_json_atomically(native_scores_path, native_scores_payload) - replay_artifacts["policyengine_native_scores"] = _relative_to_root( - native_scores_path, - artifact_root, - ) - if isinstance(native_scores_payload.get("summary"), dict): - summaries["policyengine_native_scores"] = dict( - native_scores_payload["summary"] - ) - - if not summaries: - raise ValueError( - "Stage 9 replay did not produce evidence. Supply precomputed evidence " - "or a baseline dataset for native scoring." - ) - - evidence_path = resolved_output_dir / "evidence_manifest.json" - replay_artifacts["validation_evidence"] = _relative_to_root( - evidence_path, - artifact_root, - ) - replay_manifest_payload["artifacts"] = replay_artifacts - replay_manifest_payload.update(summaries) - replay_manifest_payload["stage9_replay"] = { - "created_at": datetime.now(UTC).isoformat(), - "source_artifact_dir": str(artifact_root), - "source_manifest": str(manifest_path), - "source_policyengine_dataset": _relative_to_root(dataset_path, artifact_root), - "output_dir": _relative_to_root(resolved_output_dir, artifact_root), - } - write_json_atomically( - evidence_path, - build_us_validation_evidence_manifest( - artifact_root, - manifest_payload=replay_manifest_payload, - ), - ) - replay_manifest_path = resolved_output_dir / "replay_manifest.json" - write_json_atomically(replay_manifest_path, replay_manifest_payload) - return USStage9ReplayResult( - output_dir=resolved_output_dir, - replay_manifest=replay_manifest_path, - validation_evidence=evidence_path, - 
policyengine_harness=harness_path, - policyengine_native_scores=native_scores_path, - ) - - -def _validated_stage8_dataset_path( - artifact_root: Path, - manifest: dict[str, Any], -) -> Path: - artifacts = dict(manifest.get("artifacts", {})) - dataset_value = artifacts.get("policyengine_dataset") - if not dataset_value: - raise ValueError("Stage 8 policyengine_dataset artifact is not declared") - dataset_path = Path(str(dataset_value)) - if not dataset_path.is_absolute(): - dataset_path = artifact_root / dataset_path - dataset_path = dataset_path.expanduser().resolve() - if not dataset_path.exists(): - raise FileNotFoundError(f"Stage 8 dataset artifact is missing: {dataset_path}") - - stage_manifest_paths = dict(manifest.get("stage_output_manifests", {})) - stage8_manifest_value = stage_manifest_paths.get("08_dataset_assembly") - if not stage8_manifest_value: - raise ValueError("Stage 8 output manifest is not declared") - stage8_manifest_path = Path(str(stage8_manifest_value)) - if not stage8_manifest_path.is_absolute(): - stage8_manifest_path = artifact_root / stage8_manifest_path - if not stage8_manifest_path.exists(): - raise FileNotFoundError( - f"Stage 8 output manifest is missing: {stage8_manifest_path}" - ) - stage8_manifest = json.loads(stage8_manifest_path.read_text()) - if stage8_manifest.get("lifecycleStatus") != "complete": - raise ValueError("Stage 8 must be complete before Stage 9 replay") - stage8_outputs = stage8_manifest.get("outputs") - if isinstance(stage8_outputs, dict): - serialized_dataset = stage8_outputs.get("policyengine_dataset") - if isinstance(serialized_dataset, dict): - output_path = serialized_dataset.get("path") - if ( - output_path - and _resolve_artifact_path( - artifact_root, - output_path, - ) - != dataset_path - ): - raise ValueError( - "Stage 8 dataset output does not match the root manifest " - "policyengine_dataset artifact" - ) - return dataset_path - - -def _resolve_replay_output_dir( - artifact_root: Path, - *, - 
output_dir: str | Path | None, - run_id: str | None, -) -> Path: - if output_dir is not None: - return Path(output_dir).expanduser().resolve() - resolved_run_id = run_id or datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") - return ( - artifact_root - / "stage_artifacts" - / "09_validation_benchmarking" - / "replays" - / resolved_run_id - ) - - -def _load_optional_payload( - value: str | Path | dict[str, Any] | None, -) -> dict[str, Any] | None: - if value is None: - return None - if isinstance(value, dict): - return dict(value) - return json.loads(Path(value).expanduser().read_text()) - - -def _relative_to_root(path: Path, artifact_root: Path) -> str: - try: - return str(path.relative_to(artifact_root)) - except ValueError: - return str(path) - - -def _resolve_artifact_path(artifact_root: Path, value: object) -> Path: - path = Path(str(value)) - if not path.is_absolute(): - path = artifact_root / path - return path.expanduser().resolve() - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Rerun Stage 9 validation evidence against a saved Stage 8 dataset." 
- ) - parser.add_argument("artifact_dir") - parser.add_argument("--output-dir") - parser.add_argument("--run-id") - parser.add_argument("--baseline-dataset") - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--period", type=int) - parser.add_argument("--precomputed-policyengine-harness") - parser.add_argument("--precomputed-policyengine-native-scores") - parser.add_argument("--allow-overwrite", action="store_true") - args = parser.parse_args(argv) - result = replay_us_stage9_validation_benchmarking( - args.artifact_dir, - output_dir=args.output_dir, - baseline_dataset=args.baseline_dataset, - policyengine_us_data_repo=args.policyengine_us_data_repo, - period=args.period, - precomputed_policyengine_harness=args.precomputed_policyengine_harness, - precomputed_policyengine_native_scores=args.precomputed_policyengine_native_scores, - run_id=args.run_id, - allow_overwrite=args.allow_overwrite, - ) - print(result.validation_evidence) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) - - -__all__ = [ - "USStage9ReplayResult", - "main", - "replay_us_stage9_validation_benchmarking", -] diff --git a/src/microplex_us/pipelines/stage_artifacts.py b/src/microplex_us/pipelines/stage_artifacts.py deleted file mode 100644 index 0779eac4..00000000 --- a/src/microplex_us/pipelines/stage_artifacts.py +++ /dev/null @@ -1,842 +0,0 @@ -"""Artifact inventory helpers for US Microplex saved runs.""" - -from __future__ import annotations - -import hashlib -import json -from collections.abc import Iterable, Mapping -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast - -import pandas as pd - -from microplex_us.pipelines.stage_contracts import ( - US_STAGE_CONTRACT_VERSION, - StageArtifactFormat, - StageArtifactHashMode, - StageArtifactResumeRole, -) -from microplex_us.pipelines.stage_manifest import ( - USStageManifest, - build_us_stage_manifest, - 
load_us_policyengine_entity_stage_artifact, -) - -if TYPE_CHECKING: - from microplex_us.pipelines.us import USMicroplexTargets - from microplex_us.policyengine import PolicyEngineUSEntityTableBundle - -US_STAGE_ARTIFACT_INVENTORY_SCHEMA_VERSION = 1 -DEFAULT_US_STAGE_ARTIFACT_HASH_MAX_BYTES: int | None = None - -USStageArtifactClassification = Literal[ - "contract_only", - "diagnostic_only", - "manual_replay", - "manual_resume", - "post_artifact_evidence", - "missing_required", - "missing_optional", - "metadata_only", -] - -USStageArtifactHashStatus = Literal[ - "hashed", - "not_requested", - "missing", - "too_large", - "unsupported", - "error", -] - - -class USStageArtifactInventoryRecord(TypedDict): - """Inventory view of one canonical stage artifact.""" - - stageId: str - stageStep: str - stageTitle: str - key: str - description: str - path: str | None - exists: bool - referenced: bool - required: bool - resumeRole: StageArtifactResumeRole | None - format: StageArtifactFormat - hashMode: StageArtifactHashMode - classification: USStageArtifactClassification - sizeBytes: int | None - fileCount: int | None - contentHash: str | None - hashStatus: USStageArtifactHashStatus - - -class USStageArtifactInventory(TypedDict): - """Machine-readable artifact inventory for one saved run.""" - - schemaVersion: int - contractVersion: str - generatedAt: str | None - pipeline: str - artifactRoot: str - manifest: str - stageManifest: str | None - artifacts: list[USStageArtifactInventoryRecord] - - -@dataclass(frozen=True) -class USSeedScaffoldStageArtifacts: - """Reloaded Stage 4 seed/scaffold artifact.""" - - scaffold_seed_data: pd.DataFrame - artifact_paths: Mapping[str, Path] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USCandidateStageArtifacts: - """Reloaded Stage 5 candidate artifacts for manual downstream replay.""" - - seed_data: pd.DataFrame - synthetic_data: pd.DataFrame - artifact_paths: Mapping[str, Path] = field(default_factory=dict) - - 
-@dataclass(frozen=True) -class USCandidateCalibrationReplayArtifacts: - """Cross-stage artifacts for manually replaying candidate calibration.""" - - candidate: USCandidateStageArtifacts - targets: USMicroplexTargets - seed_scaffold: USSeedScaffoldStageArtifacts | None = None - artifact_paths: Mapping[str, Path] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USPolicyEngineEntityStageArtifacts: - """Reloaded Stage 6 PolicyEngine entity-table checkpoint.""" - - bundle: PolicyEngineUSEntityTableBundle - metadata: dict[str, Any] - metadata_path: Path - - -@dataclass(frozen=True) -class USCalibratedStageArtifacts: - """Reloaded Stage 7 calibrated data and target metadata.""" - - calibrated_data: pd.DataFrame - targets: USMicroplexTargets - calibration_summary: dict[str, Any] - artifact_paths: Mapping[str, Path] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USDatasetAssemblyArtifacts: - """Resolved Stage 8 dataset assembly artifacts.""" - - policyengine_dataset: Path - manifest: Path - stage_manifest: Path - data_flow_snapshot: Path - artifact_inventory: Path - conditional_readiness: Path - - -def build_us_stage_artifact_inventory( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - assume_existing_artifact_keys: Iterable[str] = (), - max_hash_bytes: int | None = DEFAULT_US_STAGE_ARTIFACT_HASH_MAX_BYTES, -) -> USStageArtifactInventory: - """Build an artifact inventory for one US Microplex saved-run directory.""" - - artifact_root = Path(artifact_dir) - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - stages = ( - dict(stage_manifest) - if stage_manifest is not None - else build_us_stage_manifest( - artifact_root, - manifest_payload=manifest, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - ) - artifacts: 
list[USStageArtifactInventoryRecord] = [] - for stage in stages.get("stages", ()): - if not isinstance(stage, dict): - continue - stage_id = str(stage.get("id", "")) - stage_step = str(stage.get("step", "")) - stage_title = str(stage.get("title", "")) - for artifact in stage.get("artifacts", ()): - if isinstance(artifact, dict): - artifacts.append( - _inventory_record( - artifact, - stage_id=stage_id, - stage_step=stage_step, - stage_title=stage_title, - artifact_root=artifact_root, - max_hash_bytes=max_hash_bytes, - ) - ) - - manifest_artifacts = dict(manifest.get("artifacts", {})) - return { - "schemaVersion": US_STAGE_ARTIFACT_INVENTORY_SCHEMA_VERSION, - "contractVersion": US_STAGE_CONTRACT_VERSION, - "generatedAt": _optional_str(manifest.get("created_at")), - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": str(manifest_artifacts.get("manifest", "manifest.json")), - "stageManifest": _optional_str(manifest_artifacts.get("stage_manifest")), - "artifacts": artifacts, - } - - -def write_us_stage_artifact_inventory( - artifact_dir: str | Path, - output_path: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - assume_existing_artifact_keys: Iterable[str] = (), - max_hash_bytes: int | None = DEFAULT_US_STAGE_ARTIFACT_HASH_MAX_BYTES, -) -> Path: - """Write an artifact inventory sidecar for one saved run.""" - - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - _write_json_atomically( - destination, - build_us_stage_artifact_inventory( - artifact_dir, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - assume_existing_artifact_keys=assume_existing_artifact_keys, - max_hash_bytes=max_hash_bytes, - ), - ) - return destination - - -def load_us_stage_artifact_inventory(path: str | Path) -> USStageArtifactInventory: - """Load a saved artifact inventory and validate its schema version.""" - - inventory_path = Path(path) 
- payload = json.loads(inventory_path.read_text()) - if payload.get("schemaVersion") != US_STAGE_ARTIFACT_INVENTORY_SCHEMA_VERSION: - raise RuntimeError( - "Unsupported US stage artifact inventory schema: " - f"{payload.get('schemaVersion')!r}" - ) - return cast(USStageArtifactInventory, payload) - - -def resolve_us_stage_artifact_from_inventory( - artifact_dir: str | Path, - inventory: USStageArtifactInventory | dict[str, Any], - stage_id: str, - artifact_key: str, -) -> Path: - """Resolve one artifact path from a stage artifact inventory.""" - - for artifact in inventory.get("artifacts", ()): - if not isinstance(artifact, dict): - continue - if artifact.get("stageId") != stage_id or artifact.get("key") != artifact_key: - continue - path_text = artifact.get("path") - if not path_text: - raise KeyError(f"Stage artifact has no path: {stage_id}.{artifact_key}") - path = Path(str(path_text)) - if not path.is_absolute(): - path = Path(artifact_dir) / path - return path - raise KeyError(f"Stage artifact not found: {stage_id}.{artifact_key}") - - -def resolve_us_stage_artifact_path_checked( - artifact_dir: str | Path, - stage_id: str, - artifact_key: str, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - expected_format: StageArtifactFormat | None = None, - require_exists: bool = True, -) -> Path: - """Resolve one stage artifact path and enforce format/existence checks.""" - - artifact_root = Path(artifact_dir) - record = _stage_artifact_record( - artifact_root, - stage_id, - artifact_key, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - ) - actual_format = cast(StageArtifactFormat, record.get("format") or "unknown") - if expected_format is not None and actual_format != expected_format: - raise ValueError( - f"Stage artifact {stage_id}.{artifact_key} has format " - f"{actual_format!r}, expected {expected_format!r}" - ) - path_text = record.get("path") - if not path_text: - 
raise KeyError(f"Stage artifact has no path: {stage_id}.{artifact_key}") - path = Path(str(path_text)) - if not path.is_absolute(): - path = artifact_root / path - if require_exists and not path.exists(): - raise FileNotFoundError(f"Stage artifact not found: {path}") - return path - - -def load_us_stage_parquet_artifact( - artifact_dir: str | Path, - stage_id: str, - artifact_key: str, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> pd.DataFrame: - """Load one stage-owned parquet dataframe artifact.""" - - path = resolve_us_stage_artifact_path_checked( - artifact_dir, - stage_id, - artifact_key, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="parquet_dataframe", - ) - return pd.read_parquet(path) - - -def load_us_stage_json_artifact( - artifact_dir: str | Path, - stage_id: str, - artifact_key: str, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> dict[str, Any]: - """Load one stage-owned JSON artifact.""" - - path = resolve_us_stage_artifact_path_checked( - artifact_dir, - stage_id, - artifact_key, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ) - payload = json.loads(path.read_text()) - if not isinstance(payload, dict): - raise ValueError(f"Expected JSON object in stage artifact: {path}") - return dict(payload) - - -def load_us_candidate_stage_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> USCandidateStageArtifacts: - """Load the saved Stage 5 candidate population artifacts.""" - - seed_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "05_donor_integration_synthesis", - "seed_data", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - 
expected_format="parquet_dataframe", - ) - synthetic_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "05_donor_integration_synthesis", - "synthetic_data", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="parquet_dataframe", - ) - return USCandidateStageArtifacts( - seed_data=pd.read_parquet(seed_path), - synthetic_data=pd.read_parquet(synthetic_path), - artifact_paths={ - "seed_data": seed_path, - "synthetic_data": synthetic_path, - }, - ) - - -def load_us_seed_scaffold_stage_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> USSeedScaffoldStageArtifacts: - """Load the saved Stage 4 seed/scaffold artifact.""" - - scaffold_seed_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "04_seed_scaffold", - "scaffold_seed_data", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="parquet_dataframe", - ) - return USSeedScaffoldStageArtifacts( - scaffold_seed_data=pd.read_parquet(scaffold_seed_path), - artifact_paths={"scaffold_seed_data": scaffold_seed_path}, - ) - - -def load_us_candidate_calibration_replay_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - include_seed_scaffold: bool = True, -) -> USCandidateCalibrationReplayArtifacts: - """Load the cross-stage artifacts needed to manually replay calibration.""" - - from microplex_us.pipelines.us import USMicroplexTargets - - candidate = load_us_candidate_stage_artifacts( - artifact_dir, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - ) - targets_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "07_calibration", - "targets", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ) - seed_scaffold = None - if 
include_seed_scaffold: - try: - seed_scaffold = load_us_seed_scaffold_stage_artifacts( - artifact_dir, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - ) - except (KeyError, FileNotFoundError): - seed_scaffold = None - targets_payload = json.loads(targets_path.read_text()) - artifact_paths = { - **dict(candidate.artifact_paths), - "targets": targets_path, - } - if seed_scaffold is not None: - artifact_paths.update(seed_scaffold.artifact_paths) - return USCandidateCalibrationReplayArtifacts( - candidate=candidate, - targets=USMicroplexTargets( - marginal=dict(targets_payload.get("marginal", {})), - continuous=dict(targets_payload.get("continuous", {})), - ), - seed_scaffold=seed_scaffold, - artifact_paths=artifact_paths, - ) - - -def load_us_policyengine_entity_stage_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> USPolicyEngineEntityStageArtifacts: - """Load the saved Stage 6 PolicyEngine entity-table bundle.""" - - try: - metadata_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="policyengine_entity_bundle", - ) - except (KeyError, FileNotFoundError): - metadata_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "06_policyengine_entities", - "policyengine_entity_tables", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="policyengine_entity_bundle", - ) - bundle, metadata = load_us_policyengine_entity_stage_artifact( - metadata_path, - expected_stage="post_microsim", - ) - return USPolicyEngineEntityStageArtifacts( - bundle=bundle, - metadata=metadata, - metadata_path=metadata_path, - ) - - -def load_us_calibrated_stage_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] 
| None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> USCalibratedStageArtifacts: - """Load saved Stage 7 calibrated outputs and calibration metadata.""" - - from microplex_us.pipelines.us import USMicroplexTargets - - calibrated_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "07_calibration", - "calibrated_data", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="parquet_dataframe", - ) - targets_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "07_calibration", - "targets", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ) - calibration_summary_path = resolve_us_stage_artifact_path_checked( - artifact_dir, - "07_calibration", - "calibration_summary", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ) - targets_payload = json.loads(targets_path.read_text()) - return USCalibratedStageArtifacts( - calibrated_data=pd.read_parquet(calibrated_path), - targets=USMicroplexTargets( - marginal=dict(targets_payload.get("marginal", {})), - continuous=dict(targets_payload.get("continuous", {})), - ), - calibration_summary=json.loads(calibration_summary_path.read_text()), - artifact_paths={ - "calibrated_data": calibrated_path, - "targets": targets_path, - "calibration_summary": calibration_summary_path, - }, - ) - - -def load_us_dataset_assembly_artifacts( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, -) -> USDatasetAssemblyArtifacts: - """Resolve saved Stage 8 dataset assembly artifacts.""" - - artifact_root = Path(artifact_dir) - return USDatasetAssemblyArtifacts( - policyengine_dataset=resolve_us_stage_artifact_path_checked( - artifact_root, - "08_dataset_assembly", - "policyengine_dataset", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - 
expected_format="h5_dataset", - ), - manifest=artifact_root / "manifest.json", - stage_manifest=resolve_us_stage_artifact_path_checked( - artifact_root, - "08_dataset_assembly", - "stage_manifest", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ), - data_flow_snapshot=resolve_us_stage_artifact_path_checked( - artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ), - artifact_inventory=resolve_us_stage_artifact_path_checked( - artifact_root, - "08_dataset_assembly", - "artifact_inventory", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ), - conditional_readiness=resolve_us_stage_artifact_path_checked( - artifact_root, - "08_dataset_assembly", - "conditional_readiness", - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format="json", - ), - ) - - -def _stage_artifact_record( - artifact_root: Path, - stage_id: str, - artifact_key: str, - *, - manifest_payload: dict[str, Any] | None, - stage_manifest: USStageManifest | dict[str, Any] | None, -) -> dict[str, Any]: - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - stages = ( - dict(stage_manifest) - if stage_manifest is not None - else build_us_stage_manifest(artifact_root, manifest_payload=manifest) - ) - for stage in stages.get("stages", ()): - if not isinstance(stage, dict) or stage.get("id") != stage_id: - continue - for artifact in stage.get("artifacts", ()): - if isinstance(artifact, dict) and artifact.get("key") == artifact_key: - return dict(artifact) - raise KeyError(f"Stage artifact not found: {stage_id}.{artifact_key}") - - -def _resolve_optional_stage_artifact_path( - artifact_dir: str | Path, - stage_id: str, - artifact_key: str, - *, - manifest_payload: dict[str, Any] | None, 
- stage_manifest: USStageManifest | dict[str, Any] | None, - expected_format: StageArtifactFormat, -) -> Path | None: - try: - return resolve_us_stage_artifact_path_checked( - artifact_dir, - stage_id, - artifact_key, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - expected_format=expected_format, - ) - except (KeyError, FileNotFoundError): - return None - - -def _inventory_record( - artifact: dict[str, Any], - *, - stage_id: str, - stage_step: str, - stage_title: str, - artifact_root: Path, - max_hash_bytes: int | None, -) -> USStageArtifactInventoryRecord: - path_text = _optional_str(artifact.get("path")) - resolved_path = _resolve_artifact_path(artifact_root, path_text) - artifact_format = cast( - StageArtifactFormat, - artifact.get("format") or "unknown", - ) - hash_mode = cast( - StageArtifactHashMode, - artifact.get("hash_mode") or "none", - ) - hash_target = _hash_target_path(resolved_path, artifact_format, hash_mode) - size_bytes, file_count = _artifact_size(hash_target) - content_hash, hash_status = _artifact_hash( - hash_target, - hash_mode=hash_mode, - max_hash_bytes=max_hash_bytes, - ) - return { - "stageId": stage_id, - "stageStep": stage_step, - "stageTitle": stage_title, - "key": str(artifact.get("key", "")), - "description": str(artifact.get("description", "")), - "path": path_text, - "exists": bool(artifact.get("exists")), - "referenced": bool(artifact.get("referenced")), - "required": bool(artifact.get("required")), - "resumeRole": cast(StageArtifactResumeRole | None, artifact.get("resume_role")), - "format": artifact_format, - "hashMode": hash_mode, - "classification": _artifact_classification(artifact), - "sizeBytes": size_bytes, - "fileCount": file_count, - "contentHash": content_hash, - "hashStatus": hash_status, - } - - -def _artifact_classification( - artifact: Mapping[str, Any], -) -> USStageArtifactClassification: - if not bool(artifact.get("exists")): - if bool(artifact.get("required")): - return "missing_required" 
- if bool(artifact.get("referenced")): - return "missing_optional" - return "contract_only" - resume_role = artifact.get("resume_role") - if resume_role == "diagnostic": - return "diagnostic_only" - if resume_role in {"manual_replay", "manual_resume", "post_artifact_evidence"}: - return cast(USStageArtifactClassification, resume_role) - return "metadata_only" - - -def _resolve_artifact_path(artifact_root: Path, path_text: str | None) -> Path | None: - if path_text is None: - return None - path = Path(path_text) - if not path.is_absolute(): - path = artifact_root / path - return path - - -def _hash_target_path( - path: Path | None, - artifact_format: StageArtifactFormat, - hash_mode: StageArtifactHashMode, -) -> Path | None: - if path is None or hash_mode != "directory_sha256": - return path - if artifact_format == "policyengine_entity_bundle" and path.name == "metadata.json": - return path.parent - return path - - -def _artifact_size(path: Path | None) -> tuple[int | None, int | None]: - if path is None or not path.exists(): - return None, None - if path.is_file(): - return path.stat().st_size, 1 - if path.is_dir(): - total = 0 - count = 0 - for child in _iter_directory_files(path): - total += child.stat().st_size - count += 1 - return total, count - return None, None - - -def _artifact_hash( - path: Path | None, - *, - hash_mode: StageArtifactHashMode, - max_hash_bytes: int | None, -) -> tuple[str | None, USStageArtifactHashStatus]: - if hash_mode == "none": - return None, "not_requested" - if path is None or not path.exists(): - return None, "missing" - try: - if hash_mode == "file_sha256": - if not path.is_file(): - return None, "unsupported" - size = path.stat().st_size - if max_hash_bytes is not None and size > max_hash_bytes: - return None, "too_large" - return _hash_file(path), "hashed" - if hash_mode == "directory_sha256": - if not path.is_dir(): - return None, "unsupported" - size, _ = _artifact_size(path) - if ( - max_hash_bytes is not None - and size is 
not None - and size > max_hash_bytes - ): - return None, "too_large" - return _hash_directory(path), "hashed" - except OSError: - return None, "error" - return None, "unsupported" - - -def _hash_file(path: Path) -> str: - hasher = hashlib.sha256() - with path.open("rb") as handle: - for chunk in iter(lambda: handle.read(1024 * 1024), b""): - hasher.update(chunk) - return hasher.hexdigest() - - -def _hash_directory(path: Path) -> str: - hasher = hashlib.sha256() - for child in _iter_directory_files(path): - relative = child.relative_to(path).as_posix() - hasher.update(relative.encode("utf-8")) - hasher.update(b"\0") - hasher.update(_hash_file(child).encode("ascii")) - hasher.update(b"\0") - return hasher.hexdigest() - - -def _iter_directory_files(path: Path) -> list[Path]: - return sorted(child for child in path.rglob("*") if child.is_file()) - - -def _optional_str(value: Any) -> str | None: - if value is None: - return None - return str(value) - - -def _write_json_atomically(path: Path, payload: Mapping[str, Any]) -> None: - temporary = path.with_suffix(path.suffix + ".tmp") - temporary.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temporary.replace(path) - - -__all__ = [ - "DEFAULT_US_STAGE_ARTIFACT_HASH_MAX_BYTES", - "US_STAGE_ARTIFACT_INVENTORY_SCHEMA_VERSION", - "USCalibratedStageArtifacts", - "USCandidateStageArtifacts", - "USCandidateCalibrationReplayArtifacts", - "USDatasetAssemblyArtifacts", - "USPolicyEngineEntityStageArtifacts", - "USSeedScaffoldStageArtifacts", - "USStageArtifactClassification", - "USStageArtifactHashStatus", - "USStageArtifactInventory", - "USStageArtifactInventoryRecord", - "build_us_stage_artifact_inventory", - "load_us_calibrated_stage_artifacts", - "load_us_candidate_calibration_replay_artifacts", - "load_us_candidate_stage_artifacts", - "load_us_dataset_assembly_artifacts", - "load_us_policyengine_entity_stage_artifacts", - "load_us_seed_scaffold_stage_artifacts", - "load_us_stage_json_artifact", - 
"load_us_stage_parquet_artifact", - "load_us_stage_artifact_inventory", - "resolve_us_stage_artifact_path_checked", - "resolve_us_stage_artifact_from_inventory", - "write_us_stage_artifact_inventory", -] diff --git a/src/microplex_us/pipelines/stage_contracts.py b/src/microplex_us/pipelines/stage_contracts.py deleted file mode 100644 index 49da47cc..00000000 --- a/src/microplex_us/pipelines/stage_contracts.py +++ /dev/null @@ -1,1258 +0,0 @@ -"""Canonical runtime stage contracts for the US Microplex build.""" - -from __future__ import annotations - -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Literal - -US_STAGE_CONTRACT_VERSION = "us-runtime-stages-v2" - -StageResumeMode = Literal[ - "none", - "metadata_only", - "manual_replay", - "manual_resume", - "post_artifact_evidence", -] - -StageArtifactResumeRole = Literal[ - "diagnostic", - "manual_replay", - "manual_resume", - "post_artifact_evidence", -] - -StageArtifactFormat = Literal[ - "json", - "parquet_dataframe", - "policyengine_entity_bundle", - "h5_dataset", - "model_file", - "sqlite", - "unknown", -] - -StageArtifactHashMode = Literal[ - "none", - "file_sha256", - "directory_sha256", -] - -StageResourceKind = Literal[ - "artifact", - "config", - "external_data", - "manifest", - "runtime_object", - "stage_output", -] - -US_CANONICAL_STAGE_IDS = ( - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - "06_policyengine_entities", - "07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", -) - -US_LEGACY_STAGE_ID_ALIASES = { - # Historical run_contract.py IDs from the US Microplex build path. 
- "preflight": "01_run_profile", - "source_loading": "02_source_loading", - "source_planning": "03_source_planning", - "seed_scaffold": "04_seed_scaffold", - "seed_build": "05_donor_integration_synthesis", - "donor_integration": "05_donor_integration_synthesis", - "synthesis": "05_donor_integration_synthesis", - "support_enforcement": "05_donor_integration_synthesis", - "policyengine_materialization": "06_policyengine_entities", - "target_build": "07_calibration", - "calibration": "07_calibration", - "dataset_assembly": "08_dataset_assembly", - "finalization": "08_dataset_assembly", - "validation": "09_validation_benchmarking", - "benchmark": "09_validation_benchmarking", - "scoring": "09_validation_benchmarking", - "policyengine_native_scores": "09_validation_benchmarking", - # Historical PE-US-data parity plan IDs used in Microplex docs/snapshots. - "source-contracts": "02_source_loading", - "cps-construction": "02_source_loading", - "puf-ingestion-uprating": "02_source_loading", - "extended-cps-qrf": "05_donor_integration_synthesis", - "family-imputation-parity": "05_donor_integration_synthesis", - "entity-export-parity": "06_policyengine_entities", - "weighting-backend": "07_calibration", - "targets-and-eval": "09_validation_benchmarking", -} - - -def canonicalize_us_pipeline_stage_id(stage_id: str) -> str: - """Return the canonical US runtime stage ID for a current or legacy ID.""" - - if stage_id in US_CANONICAL_STAGE_IDS: - return stage_id - return US_LEGACY_STAGE_ID_ALIASES.get(stage_id, stage_id) - - -@dataclass(frozen=True) -class USStageArtifactContract: - """One artifact expected or produced by a canonical build stage.""" - - key: str - description: str - path_hint: str | None = None - required: bool = False - resume_role: StageArtifactResumeRole | None = None - format: StageArtifactFormat = "unknown" - hash_mode: StageArtifactHashMode = "none" - - def to_dict(self) -> dict[str, object]: - return asdict(self) - - -@dataclass(frozen=True) -class 
USStageValidationContract: - """A future validation hook owned by a canonical build stage.""" - - key: str - description: str - status: Literal["planned", "manual", "implemented"] = "planned" - - def to_dict(self) -> dict[str, object]: - return asdict(self) - - -@dataclass(frozen=True) -class USStageResourceContract: - """Structured input or output dependency for one canonical build stage.""" - - key: str - description: str - kind: StageResourceKind - required: bool = True - stage_id: str | None = None - artifact_key: str | None = None - config_key: str | None = None - manifest_key: str | None = None - - def to_dict(self) -> dict[str, object]: - return asdict(self) - - -@dataclass(frozen=True) -class USPipelineStageContract: - """Stable contract for one canonical US Microplex runtime stage.""" - - id: str - step: str - title: str - purpose: str - consumes: tuple[str, ...] - produces: tuple[str, ...] - inputs: tuple[USStageResourceContract, ...] - outputs: tuple[USStageResourceContract, ...] - artifacts: tuple[USStageArtifactContract, ...] - diagnostics: tuple[str, ...] - validations: tuple[USStageValidationContract, ...] 
- resume_mode: StageResumeMode - resume_notes: str - - def to_dict(self) -> dict[str, object]: - payload = asdict(self) - payload["inputs"] = [resource.to_dict() for resource in self.inputs] - payload["outputs"] = [resource.to_dict() for resource in self.outputs] - payload["artifacts"] = [artifact.to_dict() for artifact in self.artifacts] - payload["validations"] = [ - validation.to_dict() for validation in self.validations - ] - return payload - - -def _artifact_resource( - key: str, - description: str, - *, - stage_id: str, - artifact_key: str | None = None, - required: bool = True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="artifact", - required=required, - stage_id=stage_id, - artifact_key=artifact_key or key, - ) - - -def _config_resource( - key: str, - description: str, - *, - config_key: str | None = None, - required: bool = True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="config", - required=required, - config_key=config_key or key, - ) - - -def _external_resource( - key: str, - description: str, - *, - required: bool = True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="external_data", - required=required, - ) - - -def _manifest_resource( - key: str, - description: str, - *, - manifest_key: str | None = None, - required: bool = True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="manifest", - required=required, - manifest_key=manifest_key or key, - ) - - -def _runtime_resource( - key: str, - description: str, - *, - required: bool = True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="runtime_object", - required=required, - ) - - -def _stage_output_resource( - key: str, - description: str, - *, - stage_id: str, - required: bool = 
True, -) -> USStageResourceContract: - return USStageResourceContract( - key=key, - description=description, - kind="stage_output", - required=required, - stage_id=stage_id, - ) - - -def default_us_pipeline_stage_contracts() -> tuple[USPipelineStageContract, ...]: - """Return the canonical 9-stage US Microplex runtime taxonomy.""" - - return ( - USPipelineStageContract( - id="01_run_profile", - step="01", - title="Run profile, config, and source bundle", - purpose="Resolve the build profile, runtime config, providers, queries, and run-level options.", - consumes=("user configuration", "provider defaults", "runtime overrides"), - produces=("resolved build config", "provider/query plan"), - inputs=( - _config_resource( - "build_profile", - "Selected build profile and runtime overrides.", - config_key="profile", - required=False, - ), - _config_resource( - "policyengine_target_period", - "Target period used by downstream PolicyEngine export and validation.", - ), - _config_resource( - "calibration_backend", - "Calibration backend selected for this run.", - ), - _config_resource( - "source_names", - "Requested source names or provider defaults.", - required=False, - ), - ), - outputs=( - _artifact_resource( - "manifest", - "Top-level manifest containing resolved configuration and artifact map.", - stage_id="01_run_profile", - ), - _stage_output_resource( - "resolved_config", - "Resolved build configuration recorded for downstream stages.", - stage_id="01_run_profile", - ), - _stage_output_resource( - "provider_query_plan", - "Resolved provider and source-query plan for source loading.", - stage_id="01_run_profile", - ), - ), - artifacts=( - USStageArtifactContract( - key="manifest", - description="Top-level artifact manifest with resolved config.", - path_hint="manifest.json", - required=True, - format="json", - hash_mode="file_sha256", - ), - ), - diagnostics=( - "resolved provider names", - "sample/query filters", - "target period", - "baseline dataset and target DB 
references", - ), - validations=( - USStageValidationContract( - key="config_context", - description="Check required paths and context for the selected profile.", - ), - ), - resume_mode="metadata_only", - resume_notes="The resolved config can be reused, but this stage does not contain reloadable data by itself.", - ), - USPipelineStageContract( - id="02_source_loading", - step="02", - title="Source contracts and source loading", - purpose="Load external datasets into validated Microplex observation frames.", - consumes=( - "resolved provider/query plan", - "source manifests", - "external datasets", - ), - produces=( - "observation frames", - "source descriptors", - "entity relationships", - ), - inputs=( - _stage_output_resource( - "provider_query_plan", - "Resolved provider and source-query plan from Stage 1.", - stage_id="01_run_profile", - ), - _external_resource( - "source_datasets", - "External source datasets requested by the provider/query plan.", - ), - ), - outputs=( - _stage_output_resource( - "observation_frame_summary", - "Saved summary of loaded Microplex observation frames with source metadata.", - stage_id="02_source_loading", - ), - _stage_output_resource( - "source_descriptors", - "Source descriptors attached to the loaded observation frames.", - stage_id="02_source_loading", - ), - _stage_output_resource( - "source_relationships", - "Validated entity relationships in loaded source frames.", - stage_id="02_source_loading", - ), - ), - artifacts=(), - diagnostics=( - "source row counts", - "entity coverage", - "relationship validity", - "cache/download provenance", - ), - validations=( - USStageValidationContract( - key="observation_frame_validity", - description="Validate required entity tables and relationships for each source.", - ), - ), - resume_mode="none", - resume_notes="Full source-frame snapshotting is not implemented yet.", - ), - USPipelineStageContract( - id="03_source_planning", - step="03", - title="Source planning, fusion planning, 
and scaffold selection", - purpose="Choose the scaffold source and map donor/source coverage before seed construction.", - consumes=("observation frames", "source descriptors"), - produces=("fusion plan", "scaffold selection", "donor/source plan"), - inputs=( - _runtime_resource( - "observation_frames", - "Loaded observation frames from Stage 2.", - ), - _runtime_resource( - "source_descriptors", - "Source descriptors attached to the loaded frames.", - ), - ), - outputs=( - _artifact_resource( - "source_plan", - "Saved scaffold and donor/source planning summary.", - stage_id="03_source_planning", - ), - _stage_output_resource( - "scaffold_selection", - "Selected scaffold/backbone source and donor plan.", - stage_id="03_source_planning", - ), - ), - artifacts=( - USStageArtifactContract( - key="source_plan", - description="Compact JSON summary of source names, scaffold, and donor variable plan.", - path_hint="stage_artifacts/03_source_planning/source_plan.json", - required=True, - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - ), - diagnostics=( - "source score summary", - "coverage matrix", - "scaffold source", - "donor source names", - ), - validations=( - USStageValidationContract( - key="scaffold_has_households_and_persons", - description="Check that the scaffold has household/person observations and a valid relationship.", - ), - ), - resume_mode="metadata_only", - resume_notes="The source plan explains the build route; raw source frames are not reloadable from this artifact yet.", - ), - USPipelineStageContract( - id="04_seed_scaffold", - step="04", - title="Seed/scaffold construction", - purpose="Project the selected scaffold source into the canonical seed structure.", - consumes=("source plan", "scaffold frame", "identifier rules"), - produces=("scaffold-derived seed frame", "seed schema metadata"), - inputs=( - _artifact_resource( - "source_plan", - "Saved scaffold and donor/source planning summary from Stage 3.", - 
stage_id="03_source_planning", - ), - _stage_output_resource( - "scaffold_selection", - "Selected scaffold/backbone source from Stage 3.", - stage_id="03_source_planning", - ), - _runtime_resource( - "scaffold_frame", - "Loaded source frame selected as the population scaffold.", - ), - ), - outputs=( - _artifact_resource( - "scaffold_seed_data", - "Scaffold-projected seed population before donor integration.", - stage_id="04_seed_scaffold", - ), - _stage_output_resource( - "seed_schema_metadata", - "Canonical identifier and required-column metadata for the seed.", - stage_id="04_seed_scaffold", - ), - ), - artifacts=( - USStageArtifactContract( - key="scaffold_seed_data", - description="Seed population immediately after scaffold projection and before donor integration.", - path_hint="stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet", - required=True, - resume_role="manual_replay", - format="parquet_dataframe", - hash_mode="file_sha256", - ), - ), - diagnostics=( - "scaffold source", - "pre-donor seed rows and columns", - "canonical identifier coverage", - "required seed column defaults", - ), - validations=( - USStageValidationContract( - key="seed_schema", - description="Check canonical identifiers and required seed columns.", - ), - ), - resume_mode="manual_replay", - resume_notes="The pre-donor seed frame is saved for diagnostics and manual replay; automatic donor-stage resume is not implemented yet.", - ), - USPipelineStageContract( - id="05_donor_integration_synthesis", - step="05", - title="Donor integration, synthesis, and support enforcement", - purpose="Integrate donor variables and produce the candidate population that will be calibrated.", - consumes=( - "scaffold-derived seed frame", - "donor frames", - "synthesis variable plan", - "target support requirements", - ), - produces=( - "donor-integrated seed frame", - "synthetic/candidate frame", - "synthesis metadata", - ), - inputs=( - _artifact_resource( - "scaffold_seed_data", - 
"Scaffold-projected seed population from Stage 4.", - stage_id="04_seed_scaffold", - ), - _runtime_resource( - "donor_frames", - "Loaded donor source frames used for variable integration.", - ), - _config_resource( - "synthesis_backend", - "Configured synthesis backend.", - ), - _config_resource( - "n_synthetic", - "Requested synthetic population size.", - required=False, - ), - _config_resource( - "random_seed", - "Random seed used by donor integration and synthesis.", - ), - _config_resource( - "synthesizer_condition_vars", - "Configured synthesis conditioning variables.", - required=False, - ), - _config_resource( - "synthesizer_target_vars", - "Configured synthesis target variables.", - required=False, - ), - _config_resource( - "synthesizer_epochs", - "Configured synthesizer training epochs.", - required=False, - ), - _config_resource( - "synthesizer_batch_size", - "Configured synthesizer batch size.", - required=False, - ), - _config_resource( - "synthesizer_learning_rate", - "Configured synthesizer learning rate.", - required=False, - ), - _config_resource( - "synthesizer_n_layers", - "Configured synthesizer network depth.", - required=False, - ), - _config_resource( - "synthesizer_hidden_dim", - "Configured synthesizer hidden dimension.", - required=False, - ), - _config_resource( - "donor_imputer_backend", - "Configured donor imputer backend.", - required=False, - ), - _config_resource( - "donor_imputer_condition_selection", - "Configured donor imputer condition selection strategy.", - required=False, - ), - _config_resource( - "donor_imputer_max_condition_vars", - "Configured donor imputer condition-variable cap.", - required=False, - ), - _config_resource( - "donor_imputer_excluded_variables", - "Variables excluded from donor imputation.", - required=False, - ), - _config_resource( - "donor_imputer_authoritative_override_variables", - "Variables treated as authoritative donor overrides.", - required=False, - ), - _config_resource( - 
"bootstrap_strata_columns", - "Bootstrap strata columns used by seed/bootstrap synthesis.", - required=False, - ), - ), - outputs=( - _artifact_resource( - "seed_data", - "Seed population after donor integration and semantic guards.", - stage_id="05_donor_integration_synthesis", - ), - _artifact_resource( - "synthetic_data", - "Candidate population before final calibration.", - stage_id="05_donor_integration_synthesis", - ), - _manifest_resource( - "synthesis_metadata", - "Synthesis metadata recorded in the saved manifest.", - manifest_key="synthesis", - ), - ), - artifacts=( - USStageArtifactContract( - key="seed_data", - description="Seed population after donor integration and semantic guards.", - path_hint="seed_data.parquet", - required=True, - resume_role="diagnostic", - format="parquet_dataframe", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="synthetic_data", - description="Candidate population before final calibration.", - path_hint="synthetic_data.parquet", - required=True, - resume_role="manual_replay", - format="parquet_dataframe", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="synthesizer", - description="Optional fitted synthesis model.", - path_hint="synthesizer.pt", - resume_role="diagnostic", - format="model_file", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="source_weight_diagnostics", - description="Diagnostic summary of source-level contribution weights.", - path_hint="source_weight_diagnostics.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - ), - diagnostics=( - "donor-integrated variables", - "conditioning diagnostics", - "authoritative override variables", - "synthesis backend", - "condition variables", - "target variables", - "support enforcement changes", - ), - validations=( - USStageValidationContract( - key="candidate_support", - description="Check that candidate rows support requested marginal target cells.", - ), - ), - 
resume_mode="manual_replay", - resume_notes="Existing policy-stage replay can reload synthetic_data.parquet and rerun downstream PE work.", - ), - USPipelineStageContract( - id="06_policyengine_entities", - step="06", - title="PolicyEngine entity construction and microsimulation materialization", - purpose="Convert candidate rows into PE entity tables and materialize PE-facing inputs.", - consumes=("synthetic/candidate frame", "PE input mapping rules"), - produces=("PolicyEngine entity table bundle", "materialized PE variables"), - inputs=( - _artifact_resource( - "synthetic_data", - "Candidate population from Stage 5.", - stage_id="05_donor_integration_synthesis", - ), - _runtime_resource( - "policyengine_mapping_rules", - "Rules mapping Microplex candidate rows into PolicyEngine entities.", - ), - ), - outputs=( - _artifact_resource( - "pre_calibration_policyengine_entity_tables", - "Reloadable pre-calibration PolicyEngine entity-table checkpoint.", - stage_id="06_policyengine_entities", - ), - _stage_output_resource( - "materialized_policyengine_inputs", - "PolicyEngine-facing variables materialized for calibration/export.", - stage_id="06_policyengine_entities", - ), - ), - artifacts=( - USStageArtifactContract( - key="pre_calibration_policyengine_entity_tables", - description="Reloadable pre-calibration PE entity-table bundle saved as parquet files plus metadata.", - path_hint="stage_artifacts/06_policyengine_entities/metadata.json", - required=True, - resume_role="manual_resume", - format="policyengine_entity_bundle", - hash_mode="directory_sha256", - ), - ), - diagnostics=( - "entity row counts", - "ID/link integrity", - "missing or filled PE inputs", - "direct override variables", - ), - validations=( - USStageValidationContract( - key="entity_integrity", - description="Check ID uniqueness and cross-entity links.", - ), - ), - resume_mode="manual_resume", - resume_notes="The entity-table bundle can be loaded for manual downstream calibration/export 
workflows.", - ), - USPipelineStageContract( - id="07_calibration", - step="07", - title="Target resolution, selection, and calibration", - purpose="Resolve target constraints, solve weights, and summarize fit quality.", - consumes=( - "PE entity table bundle", - "target provider/query", - "calibration config", - ), - produces=("calibrated tables", "calibration summary", "target ledger"), - inputs=( - _artifact_resource( - "pre_calibration_policyengine_entity_tables", - "Pre-calibration PolicyEngine entity-table checkpoint from Stage 6.", - stage_id="06_policyengine_entities", - ), - _external_resource( - "target_provider", - "Target provider or target database queried for calibration.", - ), - _config_resource( - "calibration_backend", - "Configured calibration backend.", - ), - _config_resource( - "calibration_tol", - "Configured calibration tolerance.", - required=False, - ), - _config_resource( - "calibration_max_iter", - "Configured maximum calibration iterations or epochs.", - required=False, - ), - _config_resource( - "target_sparsity", - "Configured sparse-target selection pressure.", - required=False, - ), - _config_resource( - "policyengine_quantity_targets", - "Configured PolicyEngine quantity targets.", - required=False, - ), - _config_resource( - "policyengine_targets_db", - "PolicyEngine target database used for calibration.", - required=False, - ), - _config_resource( - "policyengine_calibration_target_variables", - "Configured calibration target variables.", - required=False, - ), - _config_resource( - "policyengine_calibration_target_domains", - "Configured calibration target domains.", - required=False, - ), - _config_resource( - "policyengine_calibration_geo_levels", - "Configured calibration geography levels.", - required=False, - ), - ), - outputs=( - _artifact_resource( - "calibrated_data", - "Calibrated output frame.", - stage_id="07_calibration", - ), - _artifact_resource( - "targets", - "Target payload used by the build.", - 
stage_id="07_calibration", - ), - _artifact_resource( - "calibration_summary", - "Stage-local calibration summary.", - stage_id="07_calibration", - ), - _artifact_resource( - "policyengine_entity_tables", - "Calibrated PolicyEngine entity-table bundle used for dataset export.", - stage_id="07_calibration", - ), - _stage_output_resource( - "target_ledger", - "Structured target-resolution and calibration target ledger.", - stage_id="07_calibration", - ), - ), - artifacts=( - USStageArtifactContract( - key="calibrated_data", - description="Calibrated person-level output frame.", - path_hint="calibrated_data.parquet", - required=True, - resume_role="manual_replay", - format="parquet_dataframe", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="targets", - description="Saved target payload used by the build.", - path_hint="targets.json", - required=True, - resume_role="manual_replay", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="calibration_summary", - description="Stage-local calibration summary JSON.", - path_hint="stage_artifacts/07_calibration/calibration_summary.json", - required=True, - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="policyengine_entity_tables", - description="Calibrated PE entity-table bundle used for dataset export.", - path_hint="stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json", - required=True, - resume_role="post_artifact_evidence", - format="policyengine_entity_bundle", - hash_mode="directory_sha256", - ), - ), - diagnostics=( - "supported and unsupported targets", - "feasibility filter", - "calibration stages", - "target ledger", - "oracle loss", - "weight diagnostics", - ), - validations=( - USStageValidationContract( - key="calibration_fit", - description="Check convergence, selected target errors, and weight diagnostics.", - ), - ), - resume_mode="manual_replay", - resume_notes="Saved calibrated 
outputs can be reused for export/assembly; full conditional calibration is future work.", - ), - USPipelineStageContract( - id="08_dataset_assembly", - step="08", - title="Dataset assembly and publication", - purpose="Assemble the calibrated output into the distributable PE dataset artifact.", - consumes=( - "calibrated entity tables", - "export variable maps", - "period config", - ), - produces=( - "PolicyEngine H5 dataset", - "artifact manifest", - "data-flow snapshot", - ), - inputs=( - _artifact_resource( - "calibrated_data", - "Calibrated output frame from Stage 7.", - stage_id="07_calibration", - ), - _artifact_resource( - "policyengine_entity_tables", - "Calibrated PolicyEngine entity-table checkpoint from Stage 7.", - stage_id="07_calibration", - ), - _config_resource( - "policyengine_dataset_year", - "PolicyEngine dataset period used during H5 export.", - required=False, - ), - ), - outputs=( - _artifact_resource( - "policyengine_dataset", - "PolicyEngine-readable H5 dataset.", - stage_id="08_dataset_assembly", - ), - _artifact_resource( - "stage_manifest", - "Canonical saved-run stage manifest.", - stage_id="08_dataset_assembly", - ), - _artifact_resource( - "data_flow_snapshot", - "Site-facing saved-run pipeline snapshot.", - stage_id="08_dataset_assembly", - ), - _artifact_resource( - "artifact_inventory", - "Stage-owned artifact inventory.", - stage_id="08_dataset_assembly", - ), - _artifact_resource( - "conditional_readiness", - "Conditional-readiness report.", - stage_id="08_dataset_assembly", - ), - ), - artifacts=( - USStageArtifactContract( - key="policyengine_dataset", - description="PolicyEngine-readable H5 dataset.", - path_hint="policyengine_us.h5", - required=True, - resume_role="post_artifact_evidence", - format="h5_dataset", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="capital_gains_lots", - description="Optional synthetic capital-gains lot sidecar database.", - path_hint="capital_gains_lots.sqlite", - 
resume_role="diagnostic", - format="sqlite", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="stage_manifest", - description="Canonical stage manifest for the saved run.", - path_hint="stage_manifest.json", - required=True, - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="data_flow_snapshot", - description="Site-facing saved-run pipeline snapshot.", - path_hint="data_flow_snapshot.json", - required=True, - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="artifact_inventory", - description="Stage-owned artifact inventory with existence, role, and hash metadata.", - path_hint="stage_artifacts/artifact_inventory.json", - required=True, - resume_role="diagnostic", - format="json", - hash_mode="none", - ), - USStageArtifactContract( - key="conditional_readiness", - description="Conditional-readiness report for manual reuse decisions.", - path_hint="stage_artifacts/conditional_readiness.json", - required=True, - resume_role="diagnostic", - format="json", - hash_mode="none", - ), - ), - diagnostics=( - "exported variable maps", - "excluded variables", - "H5 loadability", - "row counts and weight totals", - ), - validations=( - USStageValidationContract( - key="dataset_loadability", - description="Check that the assembled H5 can be opened and contains expected arrays.", - ), - ), - resume_mode="post_artifact_evidence", - resume_notes="The assembled dataset is the input for validation and benchmarking evidence backfills.", - ), - USPipelineStageContract( - id="09_validation_benchmarking", - step="09", - title="Validation and benchmarking", - purpose="Evaluate the assembled dataset and attach benchmark evidence.", - consumes=( - "PolicyEngine H5 dataset", - "baseline dataset", - "target provider/query", - ), - produces=( - "harness evidence", - "native scores", - "audits", - "run registry/index evidence", - ), - inputs=( - _artifact_resource( - "policyengine_dataset", - "PolicyEngine-readable 
H5 dataset from Stage 8.", - stage_id="08_dataset_assembly", - ), - _external_resource( - "baseline_dataset", - "Baseline dataset used by validation or comparison harnesses.", - required=False, - ), - _external_resource( - "target_provider", - "Target provider or target database used for benchmark evidence.", - required=False, - ), - _config_resource( - "policyengine_dataset_year", - "PolicyEngine dataset period used during validation.", - required=False, - ), - ), - outputs=( - _artifact_resource( - "validation_evidence", - "Stage-local evidence manifest for validation sidecars.", - stage_id="09_validation_benchmarking", - ), - _stage_output_resource( - "benchmark_summary", - "Saved summary of validation and benchmark evidence attached to the run.", - stage_id="09_validation_benchmarking", - ), - _artifact_resource( - "policyengine_harness", - "PolicyEngine harness comparison payload.", - stage_id="09_validation_benchmarking", - required=False, - ), - _artifact_resource( - "policyengine_native_scores", - "PE-US-data native score comparison payload.", - stage_id="09_validation_benchmarking", - required=False, - ), - _artifact_resource( - "policyengine_native_audit", - "PE-US-data native score audit payload.", - stage_id="09_validation_benchmarking", - required=False, - ), - _artifact_resource( - "policyengine_native_target_diagnostics", - "Full PE-US-data native per-target diagnostics payload.", - stage_id="09_validation_benchmarking", - required=False, - ), - ), - artifacts=( - USStageArtifactContract( - key="policyengine_harness", - description="PolicyEngine harness comparison payload.", - path_hint="policyengine_harness.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="policyengine_native_scores", - description="PE-US-data native score comparison payload.", - path_hint="policyengine_native_scores.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - 
USStageArtifactContract( - key="policyengine_native_audit", - description="PE-US-data native score audit payload.", - path_hint="pe_us_data_rebuild_native_audit.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="policyengine_native_target_diagnostics", - description="Full PE-US-data native per-target diagnostics payload.", - path_hint="pe_native_target_diagnostics.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="imputation_ablation", - description="Imputation ablation benchmark payload.", - path_hint="imputation_ablation.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="child_tax_unit_agi_drift", - description="Child tax-unit AGI drift diagnostic payload.", - path_hint="child_tax_unit_agi_drift.json", - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - USStageArtifactContract( - key="validation_evidence", - description="Stage-local evidence manifest for validation sidecars.", - path_hint="stage_artifacts/09_validation_benchmarking/evidence_manifest.json", - required=True, - resume_role="diagnostic", - format="json", - hash_mode="file_sha256", - ), - ), - diagnostics=( - "harness deltas", - "native score deltas", - "target win rates", - "audit verdicts", - "ablation summaries", - ), - validations=( - USStageValidationContract( - key="benchmark_completeness", - description="Check that configured benchmark evidence was produced.", - ), - ), - resume_mode="post_artifact_evidence", - resume_notes="Benchmark evidence can be rerun or backfilled against the Stage 8 dataset artifact.", - ), - ) - - -def get_us_pipeline_stage_contract(stage_id: str) -> USPipelineStageContract: - """Return one canonical US pipeline stage contract by ID.""" - - for contract in default_us_pipeline_stage_contracts(): - if contract.id == stage_id: - return contract - 
raise KeyError(f"Unknown US pipeline stage contract: {stage_id}") - - -def get_us_stage_artifact_contract( - stage_id: str, - artifact_key: str, -) -> USStageArtifactContract: - """Return one artifact contract from a canonical stage.""" - - contract = get_us_pipeline_stage_contract(stage_id) - for artifact in contract.artifacts: - if artifact.key == artifact_key: - return artifact - raise KeyError(f"Unknown US stage artifact contract: {stage_id}.{artifact_key}") - - -def resolve_us_stage_artifact_contract_path( - artifact_dir: str | Path, - stage_id: str, - artifact_key: str, -) -> Path: - """Resolve a stage artifact's canonical path from its contract path hint.""" - - artifact = get_us_stage_artifact_contract(stage_id, artifact_key) - if artifact.path_hint is None: - raise KeyError(f"US stage artifact has no path hint: {stage_id}.{artifact_key}") - return Path(artifact_dir) / artifact.path_hint - - -def config_keys_for_us_pipeline_stage(stage_id: str) -> tuple[str, ...]: - """Return config keys that affect one canonical stage's reuse checks.""" - - contract = get_us_pipeline_stage_contract(stage_id) - return tuple( - dict.fromkeys( - resource.config_key - for resource in contract.inputs - if resource.kind == "config" and resource.config_key is not None - ) - ) - - -def serialize_us_pipeline_stage_contracts() -> dict[str, object]: - """Serialize the canonical US stage contract registry.""" - - contracts = default_us_pipeline_stage_contracts() - return { - "schemaVersion": 1, - "contractVersion": US_STAGE_CONTRACT_VERSION, - "pipeline": "us_microplex", - "stages": [contract.to_dict() for contract in contracts], - } - - -__all__ = [ - "StageArtifactFormat", - "StageArtifactHashMode", - "StageArtifactResumeRole", - "StageResourceKind", - "StageResumeMode", - "US_CANONICAL_STAGE_IDS", - "US_LEGACY_STAGE_ID_ALIASES", - "US_STAGE_CONTRACT_VERSION", - "USPipelineStageContract", - "USStageArtifactContract", - "USStageResourceContract", - "USStageValidationContract", - 
"canonicalize_us_pipeline_stage_id", - "config_keys_for_us_pipeline_stage", - "default_us_pipeline_stage_contracts", - "get_us_stage_artifact_contract", - "get_us_pipeline_stage_contract", - "resolve_us_stage_artifact_contract_path", - "serialize_us_pipeline_stage_contracts", -] diff --git a/src/microplex_us/pipelines/stage_data_flow.py b/src/microplex_us/pipelines/stage_data_flow.py deleted file mode 100644 index fa910980..00000000 --- a/src/microplex_us/pipelines/stage_data_flow.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Data-flow snapshot adapters for saved US stage manifests.""" - -from __future__ import annotations - -from typing import Any, cast - -from microplex_us.pipelines.stage_contracts import StageResumeMode -from microplex_us.pipelines.stage_manifest_types import ( - USDataFlowStageSummary, - USStageManifest, - USStageMetric, - USStageStatus, -) - - -def stage_summary_for_data_flow_snapshot( - stage_manifest: USStageManifest | dict[str, Any], -) -> list[USDataFlowStageSummary]: - """Return site-facing stage summaries from a canonical stage manifest.""" - - summaries: list[USDataFlowStageSummary] = [] - for stage in stage_manifest.get("stages", ()): - if not isinstance(stage, dict): - continue - resume = stage.get("resume", {}) - summaries.append( - { - "id": str(stage.get("id", "")), - "step": str(stage.get("step", "")), - "title": str(stage.get("title", "")), - "summary": str(stage.get("purpose", "")), - "status": cast(USStageStatus, stage.get("status", "missing")), - "metrics": cast(list[USStageMetric], list(stage.get("metrics", ()))), - "outputs": _stage_output_paths_for_data_flow(stage), - "resumeMode": cast( - StageResumeMode, - resume.get("mode", "none") if isinstance(resume, dict) else "none", - ), - } - ) - return summaries - - -def _stage_output_paths_for_data_flow(stage: dict[str, Any]) -> list[str]: - """Return artifact paths that a saved run actually referenced or produced.""" - - outputs: list[str] = [] - for artifact in stage.get("artifacts", 
()): - if not isinstance(artifact, dict): - continue - path = artifact.get("path") - if not path: - continue - if bool(artifact.get("exists")) or bool(artifact.get("referenced")): - outputs.append(str(path)) - return outputs - - -__all__ = ["stage_summary_for_data_flow_snapshot"] diff --git a/src/microplex_us/pipelines/stage_manifest.py b/src/microplex_us/pipelines/stage_manifest.py deleted file mode 100644 index 14aa7a03..00000000 --- a/src/microplex_us/pipelines/stage_manifest.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Compatibility facade for US saved-run stage manifest helpers.""" - -from __future__ import annotations - -from microplex_us.pipelines.stage_data_flow import stage_summary_for_data_flow_snapshot -from microplex_us.pipelines.stage_manifest_builder import ( - build_us_stage_manifest, - resolve_us_stage_artifact_path, -) -from microplex_us.pipelines.stage_manifest_io import ( - load_us_stage_manifest, - write_us_stage_manifest, -) -from microplex_us.pipelines.stage_manifest_types import ( - SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS, - US_POLICYENGINE_ENTITY_STAGE_ID, - US_STAGE_ARTIFACT_ROOT, - US_STAGE_MANIFEST_SCHEMA_VERSION, - US_VALIDATION_STAGE_ID, - USDataFlowStageSummary, - USStageArtifactRecord, - USStageFailureRecord, - USStageLifecycleStatus, - USStageManifest, - USStageMetric, - USStageMetricValue, - USStageRecord, - USStageResourceRecord, - USStageResumeRecord, - USStageRuntimeEventRecord, - USStageStatus, - USStageValidationRecord, - USStageValidationStatus, - USValidationEvidenceManifest, - USValidationEvidenceRecord, -) -from microplex_us.pipelines.stage_policyengine_artifacts import ( - load_us_policyengine_entity_stage_artifact, - write_us_policyengine_entity_stage_artifact, -) -from microplex_us.pipelines.stage_validation_evidence import ( - build_us_validation_evidence_manifest, - write_us_validation_evidence_manifest, -) - -__all__ = [ - "SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS", - "USDataFlowStageSummary", - 
"US_POLICYENGINE_ENTITY_STAGE_ID", - "US_STAGE_ARTIFACT_ROOT", - "US_STAGE_MANIFEST_SCHEMA_VERSION", - "USStageArtifactRecord", - "USStageFailureRecord", - "USStageLifecycleStatus", - "USStageManifest", - "USStageMetric", - "USStageMetricValue", - "USStageRecord", - "USStageResourceRecord", - "USStageResumeRecord", - "USStageRuntimeEventRecord", - "USStageStatus", - "USStageValidationRecord", - "USStageValidationStatus", - "US_VALIDATION_STAGE_ID", - "USValidationEvidenceManifest", - "USValidationEvidenceRecord", - "build_us_stage_manifest", - "build_us_validation_evidence_manifest", - "load_us_policyengine_entity_stage_artifact", - "load_us_stage_manifest", - "resolve_us_stage_artifact_path", - "stage_summary_for_data_flow_snapshot", - "write_us_policyengine_entity_stage_artifact", - "write_us_stage_manifest", - "write_us_validation_evidence_manifest", -] diff --git a/src/microplex_us/pipelines/stage_manifest_builder.py b/src/microplex_us/pipelines/stage_manifest_builder.py deleted file mode 100644 index eed79a85..00000000 --- a/src/microplex_us/pipelines/stage_manifest_builder.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Build aggregate saved-run stage manifests for US pipeline artifacts.""" - -from __future__ import annotations - -import json -from collections.abc import Iterable -from pathlib import Path -from typing import Any, cast - -from microplex_us.pipelines.stage_contracts import ( - US_STAGE_CONTRACT_VERSION, - USPipelineStageContract, - USStageArtifactContract, - USStageResourceContract, - default_us_pipeline_stage_contracts, -) -from microplex_us.pipelines.stage_manifest_types import ( - US_STAGE_MANIFEST_SCHEMA_VERSION, - USStageArtifactRecord, - USStageLifecycleStatus, - USStageManifest, - USStageRecord, - USStageResourceRecord, - USStageValidationRecord, -) -from microplex_us.pipelines.stage_metrics import stage_metrics -from microplex_us.pipelines.stage_status import stage_status - - -def build_us_stage_manifest( - artifact_dir: str | Path, - *, - 
manifest_payload: dict[str, Any], - assume_existing_artifact_keys: Iterable[str] = (), -) -> USStageManifest: - """Build the canonical stage manifest from a saved artifact manifest.""" - - artifact_root = Path(artifact_dir) - manifest = dict(manifest_payload) - artifact_map = dict(manifest.get("artifacts", {})) - assumed_existing = set(assume_existing_artifact_keys) - stage_output_manifests = _load_stage_output_manifests( - artifact_root, - manifest, - ) - stages = [ - _stage_record( - contract, - artifact_root=artifact_root, - manifest=manifest, - stage_output_manifest=stage_output_manifests.get(contract.id), - assume_existing_artifact_keys=assumed_existing, - ) - for contract in default_us_pipeline_stage_contracts() - ] - return { - "schemaVersion": US_STAGE_MANIFEST_SCHEMA_VERSION, - "contractVersion": US_STAGE_CONTRACT_VERSION, - "generatedAt": _optional_str(manifest.get("created_at")), - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": str(artifact_map.get("manifest", "manifest.json")), - "stages": stages, - } - - -def resolve_us_stage_artifact_path( - artifact_dir: str | Path, - stage_manifest: dict[str, Any], - stage_id: str, - artifact_key: str, -) -> Path: - """Resolve one artifact path from a stage manifest.""" - - for stage in stage_manifest.get("stages", ()): - if not isinstance(stage, dict) or stage.get("id") != stage_id: - continue - for artifact in stage.get("artifacts", ()): - if ( - isinstance(artifact, dict) - and artifact.get("key") == artifact_key - and artifact.get("path") - ): - path = Path(str(artifact["path"])) - if not path.is_absolute(): - path = Path(artifact_dir) / path - return path - raise KeyError(f"Stage artifact not found: {stage_id}.{artifact_key}") - - -def _stage_record( - contract: USPipelineStageContract, - *, - artifact_root: Path, - manifest: dict[str, Any], - stage_output_manifest: dict[str, Any] | None, - assume_existing_artifact_keys: set[str], -) -> USStageRecord: - artifacts = [ - _artifact_record( - 
artifact, - artifact_root=artifact_root, - manifest=manifest, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - for artifact in contract.artifacts - ] - status = stage_status( - contract.id, - artifact_root=artifact_root, - manifest=manifest, - artifacts=artifacts, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - return { - "id": contract.id, - "step": contract.step, - "title": contract.title, - "purpose": contract.purpose, - "status": status, - "lifecycleStatus": _stage_lifecycle_status( - stage_output_manifest, - saved_status=status, - ), - "outputManifest": _stage_output_manifest_ref(manifest, contract.id), - "startedAt": _runtime_optional_str(stage_output_manifest, "startedAt"), - "updatedAt": _runtime_optional_str(stage_output_manifest, "updatedAt"), - "completedAt": _runtime_optional_str(stage_output_manifest, "completedAt"), - "failedAt": _runtime_optional_str(stage_output_manifest, "failedAt"), - "deferredReason": _runtime_optional_str( - stage_output_manifest, - "deferredReason", - ), - "failure": _runtime_mapping_or_none(stage_output_manifest, "failure"), - "events": _runtime_events(stage_output_manifest), - "consumes": list(contract.consumes), - "produces": list(contract.produces), - "inputs": _resource_records(contract.inputs), - "outputs": _resource_records(contract.outputs), - "artifacts": artifacts, - "diagnostics": list(contract.diagnostics), - "validations": cast( - list[USStageValidationRecord], - [validation.to_dict() for validation in contract.validations], - ), - "resume": { - "mode": contract.resume_mode, - "notes": contract.resume_notes, - }, - "metrics": stage_metrics(contract.id, manifest=manifest), - } - - -def _load_stage_output_manifests( - artifact_root: Path, - manifest: dict[str, Any], -) -> dict[str, dict[str, Any]]: - stage_manifest_paths = manifest.get("stage_output_manifests") - if not isinstance(stage_manifest_paths, dict): - return {} - payloads: dict[str, dict[str, Any]] = {} - for 
stage_id, value in stage_manifest_paths.items(): - if not isinstance(stage_id, str) or value is None: - continue - path = Path(str(value)) - if not path.is_absolute(): - path = artifact_root / path - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - continue - if isinstance(payload, dict): - payloads[stage_id] = payload - return payloads - - -def _stage_output_manifest_ref( - manifest: dict[str, Any], - stage_id: str, -) -> str | None: - stage_manifest_paths = manifest.get("stage_output_manifests") - if not isinstance(stage_manifest_paths, dict): - return None - value = stage_manifest_paths.get(stage_id) - return str(value) if value is not None else None - - -def _stage_lifecycle_status( - stage_output_manifest: dict[str, Any] | None, - *, - saved_status: str, -) -> USStageLifecycleStatus: - if stage_output_manifest is not None: - value = stage_output_manifest.get("lifecycleStatus") - if value in {"pending", "running", "complete", "failed", "deferred"}: - return cast(USStageLifecycleStatus, value) - if stage_output_manifest.get("complete") is True: - return "complete" - if stage_output_manifest.get("complete") is False: - return "pending" - if saved_status == "ready": - return "complete" - if saved_status == "deferred": - return "deferred" - return "pending" - - -def _runtime_optional_str( - stage_output_manifest: dict[str, Any] | None, - key: str, -) -> str | None: - if stage_output_manifest is None: - return None - value = stage_output_manifest.get(key) - return str(value) if value is not None else None - - -def _runtime_mapping_or_none( - stage_output_manifest: dict[str, Any] | None, - key: str, -) -> dict[str, Any] | None: - if stage_output_manifest is None: - return None - value = stage_output_manifest.get(key) - return dict(value) if isinstance(value, dict) else None - - -def _runtime_events( - stage_output_manifest: dict[str, Any] | None, -) -> list[dict[str, Any]]: - if stage_output_manifest is None: - return [] - 
events = stage_output_manifest.get("events") - if not isinstance(events, list): - return [] - return [dict(event) for event in events if isinstance(event, dict)] - - -def _artifact_record( - artifact: USStageArtifactContract, - *, - artifact_root: Path, - manifest: dict[str, Any], - assume_existing_artifact_keys: set[str], -) -> USStageArtifactRecord: - artifacts = dict(manifest.get("artifacts", {})) - manifest_path = artifacts.get(artifact.key) - path = str(manifest_path) if manifest_path else artifact.path_hint - exists = False - if path: - resolved = Path(str(path)) - if not resolved.is_absolute(): - resolved = artifact_root / resolved - exists = resolved.exists() or artifact.key in assume_existing_artifact_keys - return { - **artifact.to_dict(), - "path": path, - "exists": exists, - "referenced": manifest_path is not None, - } - - -def _resource_records( - resources: tuple[USStageResourceContract, ...], -) -> list[USStageResourceRecord]: - return cast( - list[USStageResourceRecord], - [resource.to_dict() for resource in resources], - ) - - -def _optional_str(value: Any) -> str | None: - if value is None: - return None - return str(value) - - -__all__ = [ - "build_us_stage_manifest", - "resolve_us_stage_artifact_path", -] diff --git a/src/microplex_us/pipelines/stage_manifest_io.py b/src/microplex_us/pipelines/stage_manifest_io.py deleted file mode 100644 index 3ee870a6..00000000 --- a/src/microplex_us/pipelines/stage_manifest_io.py +++ /dev/null @@ -1,67 +0,0 @@ -"""I/O helpers for saved-run US stage manifests.""" - -from __future__ import annotations - -import json -from collections.abc import Iterable, Mapping -from pathlib import Path -from typing import Any, cast - -from microplex_us.pipelines.stage_manifest_builder import build_us_stage_manifest -from microplex_us.pipelines.stage_manifest_types import ( - SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS, - USStageManifest, -) - - -def write_us_stage_manifest( - artifact_dir: str | Path, - output_path: str | 
Path, - *, - manifest_payload: dict[str, Any], - assume_existing_artifact_keys: Iterable[str] = (), -) -> Path: - """Write the canonical stage manifest for a saved US artifact bundle.""" - - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - write_json_atomically( - destination, - build_us_stage_manifest( - artifact_dir, - manifest_payload=manifest_payload, - assume_existing_artifact_keys=( - *tuple(assume_existing_artifact_keys), - "stage_manifest", - ), - ), - ) - return destination - - -def load_us_stage_manifest(path: str | Path) -> USStageManifest: - """Load a saved stage manifest and validate its schema version.""" - - manifest_path = Path(path) - payload = json.loads(manifest_path.read_text()) - if payload.get("schemaVersion") not in SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS: - raise RuntimeError( - f"Unsupported US stage manifest schema: {payload.get('schemaVersion')!r}" - ) - return cast(USStageManifest, payload) - - -def write_json_atomically(path: Path, payload: Mapping[str, Any]) -> None: - """Write JSON atomically through a sibling temporary file.""" - - path.parent.mkdir(parents=True, exist_ok=True) - temporary = path.with_suffix(path.suffix + ".tmp") - temporary.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temporary.replace(path) - - -__all__ = [ - "load_us_stage_manifest", - "write_json_atomically", - "write_us_stage_manifest", -] diff --git a/src/microplex_us/pipelines/stage_manifest_types.py b/src/microplex_us/pipelines/stage_manifest_types.py deleted file mode 100644 index 12897a98..00000000 --- a/src/microplex_us/pipelines/stage_manifest_types.py +++ /dev/null @@ -1,198 +0,0 @@ -"""Shared saved-run stage manifest schemas for US pipeline artifacts.""" - -from __future__ import annotations - -from typing import Any, Literal, TypedDict - -from microplex_us.pipelines.stage_contracts import ( - StageArtifactFormat, - StageArtifactHashMode, - StageResumeMode, -) - 
-US_STAGE_MANIFEST_SCHEMA_VERSION = 3 -SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS = frozenset({1, 2, 3}) -US_STAGE_ARTIFACT_ROOT = "stage_artifacts" -US_POLICYENGINE_ENTITY_STAGE_ID = "06_policyengine_entities" -US_VALIDATION_STAGE_ID = "09_validation_benchmarking" - - -USStageMetricValue = str | int | float | bool | None - -USStageStatus = Literal[ - "ready", - "metadata_only", - "deferred", - "incomplete", - "missing", -] - -USStageValidationStatus = Literal["planned", "manual", "implemented"] -USStageLifecycleStatus = Literal[ - "pending", - "running", - "complete", - "failed", - "deferred", -] - - -class USStageMetric(TypedDict): - """One compact metric shown for a saved stage.""" - - label: str - value: USStageMetricValue - - -class USStageArtifactRecord(TypedDict): - """Saved-run view of one stage artifact contract.""" - - key: str - description: str - path_hint: str | None - required: bool - resume_role: str | None - format: StageArtifactFormat - hash_mode: StageArtifactHashMode - path: str | None - exists: bool - referenced: bool - - -class USStageResumeRecord(TypedDict): - """Saved-run resume metadata for one stage.""" - - mode: StageResumeMode - notes: str - - -class USStageValidationRecord(TypedDict): - """Saved-run view of one planned or implemented validation.""" - - key: str - description: str - status: USStageValidationStatus - - -class USStageFailureRecord(TypedDict, total=False): - """Runtime failure details for one stage.""" - - errorType: str - message: str - traceback: str | None - - -class USStageRuntimeEventRecord(TypedDict, total=False): - """Compact runtime event included in a stage output manifest.""" - - event: str - timestamp: str - details: dict[str, Any] - - -class USStageResourceRecord(TypedDict): - """Saved-run view of one structured stage input or output.""" - - key: str - description: str - kind: str - required: bool - stage_id: str | None - artifact_key: str | None - config_key: str | None - manifest_key: str | None - - -class 
USStageRecord(TypedDict): - """One stage entry in a US stage manifest.""" - - id: str - step: str - title: str - purpose: str - status: USStageStatus - lifecycleStatus: USStageLifecycleStatus - outputManifest: str | None - startedAt: str | None - updatedAt: str | None - completedAt: str | None - failedAt: str | None - deferredReason: str | None - failure: USStageFailureRecord | None - events: list[USStageRuntimeEventRecord] - consumes: list[str] - produces: list[str] - inputs: list[USStageResourceRecord] - outputs: list[USStageResourceRecord] - artifacts: list[USStageArtifactRecord] - diagnostics: list[str] - validations: list[USStageValidationRecord] - resume: USStageResumeRecord - metrics: list[USStageMetric] - - -class USStageManifest(TypedDict): - """Canonical saved-run stage manifest.""" - - schemaVersion: int - contractVersion: str - generatedAt: str | None - pipeline: str - artifactRoot: str - manifest: str - stages: list[USStageRecord] - - -class USDataFlowStageSummary(TypedDict): - """Stage summary embedded in the site-facing data-flow snapshot.""" - - id: str - step: str - title: str - summary: str - status: USStageStatus - metrics: list[USStageMetric] - outputs: list[str] - resumeMode: StageResumeMode - - -class USValidationEvidenceRecord(TypedDict): - """One validation or benchmarking evidence sidecar.""" - - key: str - path: str - exists: bool - - -class USValidationEvidenceManifest(TypedDict): - """Stage 9 evidence index.""" - - formatVersion: int - stageId: str - evidence: list[USValidationEvidenceRecord] - summaries: dict[str, Any] - - -__all__ = [ - "SUPPORTED_US_STAGE_MANIFEST_SCHEMA_VERSIONS", - "USDataFlowStageSummary", - "US_POLICYENGINE_ENTITY_STAGE_ID", - "US_STAGE_ARTIFACT_ROOT", - "US_STAGE_MANIFEST_SCHEMA_VERSION", - "US_VALIDATION_STAGE_ID", - "USStageArtifactRecord", - "USStageFailureRecord", - "USStageLifecycleStatus", - "USStageManifest", - "USStageMetric", - "USStageMetricValue", - "USStageRecord", - "USStageResourceRecord", - 
"USStageResumeRecord", - "USStageRuntimeEventRecord", - "USStageStatus", - "USStageValidationRecord", - "USStageValidationStatus", - "USValidationEvidenceManifest", - "USValidationEvidenceRecord", -] diff --git a/src/microplex_us/pipelines/stage_metrics.py b/src/microplex_us/pipelines/stage_metrics.py deleted file mode 100644 index 6668faff..00000000 --- a/src/microplex_us/pipelines/stage_metrics.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Display metrics for saved US pipeline stage manifests.""" - -from __future__ import annotations - -from typing import Any - -from microplex_us.pipelines.stage_manifest_types import USStageMetric - - -def stage_metrics(stage_id: str, *, manifest: dict[str, Any]) -> list[USStageMetric]: - """Return compact display metrics for one saved stage.""" - - synthesis = dict(manifest.get("synthesis", {})) - calibration = dict(manifest.get("calibration", {})) - artifacts = dict(manifest.get("artifacts", {})) - harness = dict(manifest.get("policyengine_harness", {})) - native_scores = dict(manifest.get("policyengine_native_scores", {})) - rows = dict(manifest.get("rows", {})) - config = dict(manifest.get("config", {})) - if stage_id == "01_run_profile": - return [ - { - "label": "Target period", - "value": config.get("policyengine_target_period"), - }, - {"label": "Backend", "value": config.get("calibration_backend")}, - ] - if stage_id == "02_source_loading": - return [ - {"label": "Sources", "value": len(synthesis.get("source_names", ()))}, - ] - if stage_id == "03_source_planning": - return [{"label": "Scaffold", "value": synthesis.get("scaffold_source")}] - if stage_id == "04_seed_scaffold": - return [ - {"label": "Seed rows", "value": rows.get("seed")}, - {"label": "Scaffold", "value": synthesis.get("scaffold_source")}, - ] - if stage_id == "05_donor_integration_synthesis": - return [ - {"label": "Seed rows", "value": rows.get("seed")}, - { - "label": "Integrated vars", - "value": len(synthesis.get("donor_integrated_variables", ())), - }, - 
{"label": "Backend", "value": synthesis.get("backend")}, - {"label": "Synthetic rows", "value": rows.get("synthetic")}, - ] - if stage_id == "06_policyengine_entities": - return [ - { - "label": "Entity bundle", - "value": artifacts.get("pre_calibration_policyengine_entity_tables"), - } - ] - if stage_id == "07_calibration": - return [ - {"label": "Backend", "value": calibration.get("backend")}, - {"label": "Supported", "value": calibration.get("n_supported_targets")}, - {"label": "Converged", "value": calibration.get("converged")}, - ] - if stage_id == "08_dataset_assembly": - return [{"label": "Dataset", "value": artifacts.get("policyengine_dataset")}] - if stage_id == "09_validation_benchmarking": - imputation_ablation = dict(manifest.get("imputation_ablation", {})) - return [ - { - "label": "Capped full oracle loss", - "value": calibration.get("full_oracle_capped_mean_abs_relative_error"), - }, - { - "label": "Full oracle loss", - "value": calibration.get("full_oracle_mean_abs_relative_error"), - }, - { - "label": "Harness delta", - "value": harness.get("mean_abs_relative_error_delta"), - }, - { - "label": "Native delta", - "value": native_scores.get("enhanced_cps_native_loss_delta"), - }, - {"label": "Win rate", "value": harness.get("target_win_rate")}, - { - "label": "Imputation MAE", - "value": imputation_ablation.get("production_mean_weighted_mae"), - }, - { - "label": "Imputation F1", - "value": imputation_ablation.get("production_mean_support_f1"), - }, - ] - return [] - - -__all__ = ["stage_metrics"] diff --git a/src/microplex_us/pipelines/stage_policyengine_artifacts.py b/src/microplex_us/pipelines/stage_policyengine_artifacts.py deleted file mode 100644 index ac0a3fac..00000000 --- a/src/microplex_us/pipelines/stage_policyengine_artifacts.py +++ /dev/null @@ -1,69 +0,0 @@ -"""PolicyEngine entity stage artifact I/O for US saved runs.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from 
microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest_io import write_json_atomically -from microplex_us.pipelines.stage_manifest_types import US_POLICYENGINE_ENTITY_STAGE_ID -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - USPipelineCheckpointStage, - load_us_pipeline_checkpoint, - save_us_pipeline_checkpoint, -) - - -def write_us_policyengine_entity_stage_artifact( - bundle: PolicyEngineUSEntityTableBundle, - artifact_root: str | Path, - *, - stage_id: str = US_POLICYENGINE_ENTITY_STAGE_ID, - artifact_key: str = "pre_calibration_policyengine_entity_tables", - checkpoint_stage: USPipelineCheckpointStage = "post_microsim", -) -> Path: - """Persist a PE entity-table checkpoint under a saved-run root.""" - - metadata_path = resolve_us_stage_artifact_contract_path( - artifact_root, - stage_id, - artifact_key, - ) - stage_dir = save_us_pipeline_checkpoint( - bundle, - metadata_path.parent, - stage=checkpoint_stage, - ) - metadata_path = stage_dir / metadata_path.name - metadata = json.loads(metadata_path.read_text()) - metadata["stageId"] = stage_id - metadata["artifactKey"] = artifact_key - write_json_atomically(metadata_path, metadata) - return metadata_path - - -def load_us_policyengine_entity_stage_artifact( - path: str | Path, - *, - expected_stage: USPipelineCheckpointStage | None = "post_microsim", -) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, Any]]: - """Load a PE entity-table bundle artifact.""" - - input_path = Path(path) - checkpoint_dir = input_path if input_path.is_dir() else input_path.parent - bundle, metadata = load_us_pipeline_checkpoint( - checkpoint_dir, - expected_stage=expected_stage, - ) - return bundle, metadata - - -__all__ = [ - "load_us_policyengine_entity_stage_artifact", - "write_us_policyengine_entity_stage_artifact", -] diff --git a/src/microplex_us/pipelines/stage_readiness.py 
b/src/microplex_us/pipelines/stage_readiness.py deleted file mode 100644 index f24f5ea9..00000000 --- a/src/microplex_us/pipelines/stage_readiness.py +++ /dev/null @@ -1,457 +0,0 @@ -"""Conditional-readiness reports for US Microplex saved runs.""" - -from __future__ import annotations - -import hashlib -import json -from collections.abc import Mapping -from pathlib import Path -from typing import Any, Literal, TypedDict, cast - -from microplex_us.pipelines.stage_artifacts import ( - USStageArtifactInventory, - USStageArtifactInventoryRecord, - build_us_stage_artifact_inventory, - load_us_stage_artifact_inventory, -) -from microplex_us.pipelines.stage_contracts import ( - US_STAGE_CONTRACT_VERSION, - config_keys_for_us_pipeline_stage, -) -from microplex_us.pipelines.stage_manifest import ( - USStageManifest, - USStageStatus, - build_us_stage_manifest, -) - -US_CONDITIONAL_READINESS_SCHEMA_VERSION = 1 -US_CONFIG_REUSE_IGNORED_KEYS = frozenset( - { - "pipeline_checkpoint_save_post_imputation_path", - "pipeline_checkpoint_save_post_microsim_path", - } -) - -USStageReadiness = Literal[ - "manual_replay", - "manual_resume", - "post_artifact_evidence", - "diagnostic_only", - "metadata_only", - "must_rerun", - "not_applicable", -] - -USStageCompatibility = Literal[ - "match", - "mismatch", - "missing_saved_config", - "not_evaluated", -] - - -class USConditionalReadinessStageRecord(TypedDict): - """Conditional-readiness view of one canonical stage.""" - - stageId: str - stageStep: str - stageTitle: str - status: USStageStatus - readiness: USStageReadiness - reason: str - compatibility: USStageCompatibility - reuseKey: str | None - savedConfigHash: str | None - requestedConfigHash: str | None - availableArtifacts: list[str] - missingArtifacts: list[str] - diagnosticArtifacts: list[str] - reloadableArtifacts: list[str] - - -class USConditionalReadinessReport(TypedDict): - """Saved-run conditional-readiness report.""" - - schemaVersion: int - contractVersion: str - 
generatedAt: str | None - pipeline: str - artifactRoot: str - manifest: str - artifactInventory: str | None - savedConfigHash: str | None - requestedConfigHash: str | None - stages: list[USConditionalReadinessStageRecord] - - -def build_us_stage_reuse_key( - stage_id: str, - manifest_payload: Mapping[str, Any], - artifact_inventory: USStageArtifactInventory | Mapping[str, Any], -) -> str | None: - """Return a deterministic reuse key for one stage, if any evidence exists.""" - - stage_artifacts = [ - artifact - for artifact in artifact_inventory.get("artifacts", ()) - if isinstance(artifact, dict) and artifact.get("stageId") == stage_id - ] - if not stage_artifacts: - return None - evidence = [ - { - "key": str(artifact.get("key")), - "path": artifact.get("path"), - "classification": artifact.get("classification"), - "hashStatus": artifact.get("hashStatus"), - "contentHash": artifact.get("contentHash"), - "sizeBytes": artifact.get("sizeBytes"), - "fileCount": artifact.get("fileCount"), - } - for artifact in stage_artifacts - if artifact.get("exists") or artifact.get("referenced") - ] - if not evidence: - return None - payload = { - "stageId": stage_id, - "configHash": _stage_config_hash(stage_id, manifest_payload.get("config")), - "artifacts": sorted(evidence, key=lambda item: item["key"]), - } - return _hash_json(payload) - - -def build_us_conditional_readiness_report( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - artifact_inventory: USStageArtifactInventory | dict[str, Any] | None = None, - requested_config: Mapping[str, Any] | None = None, -) -> USConditionalReadinessReport: - """Build a report describing which stage outputs could be reused manually.""" - - artifact_root = Path(artifact_dir) - manifest = ( - dict(manifest_payload) - if manifest_payload is not None - else json.loads((artifact_root / "manifest.json").read_text()) - ) - stages = ( - 
dict(stage_manifest) - if stage_manifest is not None - else build_us_stage_manifest(artifact_root, manifest_payload=manifest) - ) - inventory = ( - dict(artifact_inventory) - if artifact_inventory is not None - else _load_or_build_inventory(artifact_root, manifest_payload=manifest) - ) - saved_config_hash = _config_hash(manifest.get("config")) - requested_config_hash = ( - _config_hash(requested_config) if requested_config is not None else None - ) - return { - "schemaVersion": US_CONDITIONAL_READINESS_SCHEMA_VERSION, - "contractVersion": US_STAGE_CONTRACT_VERSION, - "generatedAt": _optional_str(manifest.get("created_at")), - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": str( - dict(manifest.get("artifacts", {})).get("manifest", "manifest.json") - ), - "artifactInventory": _optional_str( - dict(manifest.get("artifacts", {})).get("artifact_inventory") - ), - "savedConfigHash": saved_config_hash, - "requestedConfigHash": requested_config_hash, - "stages": [ - _readiness_stage_record( - stage, - manifest=manifest, - inventory=inventory, - requested_config=requested_config, - ) - for stage in stages.get("stages", ()) - if isinstance(stage, dict) - ], - } - - -def write_us_conditional_readiness_report( - artifact_dir: str | Path, - output_path: str | Path, - *, - manifest_payload: dict[str, Any] | None = None, - stage_manifest: USStageManifest | dict[str, Any] | None = None, - artifact_inventory: USStageArtifactInventory | dict[str, Any] | None = None, - requested_config: Mapping[str, Any] | None = None, -) -> Path: - """Write a conditional-readiness report sidecar for one saved run.""" - - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - _write_json_atomically( - destination, - build_us_conditional_readiness_report( - artifact_dir, - manifest_payload=manifest_payload, - stage_manifest=stage_manifest, - artifact_inventory=artifact_inventory, - requested_config=requested_config, - ), - ) - return destination - - 
-def load_us_conditional_readiness_report( - path: str | Path, -) -> USConditionalReadinessReport: - """Load a saved conditional-readiness report.""" - - report_path = Path(path) - payload = json.loads(report_path.read_text()) - if payload.get("schemaVersion") != US_CONDITIONAL_READINESS_SCHEMA_VERSION: - raise RuntimeError( - "Unsupported US conditional-readiness report schema: " - f"{payload.get('schemaVersion')!r}" - ) - return cast(USConditionalReadinessReport, payload) - - -def _readiness_stage_record( - stage: Mapping[str, Any], - *, - manifest: Mapping[str, Any], - inventory: Mapping[str, Any], - requested_config: Mapping[str, Any] | None, -) -> USConditionalReadinessStageRecord: - stage_id = str(stage.get("id", "")) - saved_stage_config_hash = _stage_config_hash(stage_id, manifest.get("config")) - requested_stage_config_hash = ( - _stage_config_hash(stage_id, requested_config) - if requested_config is not None - else None - ) - compatibility = _config_compatibility( - saved_stage_config_hash, - requested_stage_config_hash, - requested_config_supplied=requested_config is not None, - ) - artifacts = _inventory_artifacts_for_stage(inventory, stage_id) - available = [ - _artifact_label(artifact) - for artifact in artifacts - if bool(artifact.get("exists")) - ] - missing = [ - _artifact_label(artifact) - for artifact in artifacts - if artifact.get("classification") in {"missing_required", "missing_optional"} - ] - diagnostic = [ - _artifact_label(artifact) - for artifact in artifacts - if artifact.get("classification") == "diagnostic_only" - ] - reloadable = [ - _artifact_label(artifact) - for artifact in artifacts - if artifact.get("classification") - in {"manual_replay", "manual_resume", "post_artifact_evidence"} - ] - readiness, reason = _stage_readiness( - stage, - artifacts, - compatibility=compatibility, - stage8_dataset_available=_stage8_dataset_available(inventory), - ) - return { - "stageId": stage_id, - "stageStep": str(stage.get("step", "")), - 
"stageTitle": str(stage.get("title", "")), - "status": cast(USStageStatus, stage.get("status", "missing")), - "readiness": readiness, - "reason": reason, - "compatibility": compatibility, - "reuseKey": build_us_stage_reuse_key(stage_id, manifest, inventory), - "savedConfigHash": saved_stage_config_hash, - "requestedConfigHash": requested_stage_config_hash, - "availableArtifacts": available, - "missingArtifacts": missing, - "diagnosticArtifacts": diagnostic, - "reloadableArtifacts": reloadable, - } - - -def _stage_readiness( - stage: Mapping[str, Any], - artifacts: list[USStageArtifactInventoryRecord], - *, - compatibility: USStageCompatibility, - stage8_dataset_available: bool, -) -> tuple[USStageReadiness, str]: - stage_id = str(stage.get("id", "")) - status = stage.get("status") - if stage_id == "09_validation_benchmarking" and status == "deferred": - if stage8_dataset_available: - return ( - "post_artifact_evidence", - "Stage 8 dataset is available for validation or benchmark evidence.", - ) - return ( - "must_rerun", - "Validation is deferred and no Stage 8 dataset is available.", - ) - if compatibility == "mismatch": - return ( - "must_rerun", - "Requested configuration does not match this stage's saved run inputs.", - ) - classifications = { - str(artifact.get("classification")) - for artifact in artifacts - if bool(artifact.get("exists")) - } - for readiness in ("manual_resume", "manual_replay", "post_artifact_evidence"): - if readiness in classifications: - return cast(USStageReadiness, readiness), ( - f"Stage has existing {readiness.replace('_', ' ')} artifacts." - ) - if "diagnostic_only" in classifications: - return ( - "diagnostic_only", - "Stage has diagnostic artifacts but no replay boundary.", - ) - if status in {"missing", "incomplete"}: - return "must_rerun", f"Stage status is {status}." - if status == "metadata_only": - return "metadata_only", "Stage has metadata but no reloadable artifact." 
- return "not_applicable", "No reusable artifact boundary is available." - - -def _inventory_artifacts_for_stage( - inventory: Mapping[str, Any], - stage_id: str, -) -> list[USStageArtifactInventoryRecord]: - return [ - cast(USStageArtifactInventoryRecord, artifact) - for artifact in inventory.get("artifacts", ()) - if isinstance(artifact, dict) and artifact.get("stageId") == stage_id - ] - - -def _stage8_dataset_available(inventory: Mapping[str, Any]) -> bool: - return any( - isinstance(artifact, dict) - and artifact.get("stageId") == "08_dataset_assembly" - and artifact.get("key") == "policyengine_dataset" - and bool(artifact.get("exists")) - for artifact in inventory.get("artifacts", ()) - ) - - -def _load_or_build_inventory( - artifact_root: Path, - *, - manifest_payload: dict[str, Any], -) -> USStageArtifactInventory: - inventory_name = dict(manifest_payload.get("artifacts", {})).get( - "artifact_inventory" - ) - if isinstance(inventory_name, str): - inventory_path = Path(inventory_name) - if not inventory_path.is_absolute(): - inventory_path = artifact_root / inventory_path - if inventory_path.exists(): - return load_us_stage_artifact_inventory(inventory_path) - return build_us_stage_artifact_inventory( - artifact_root, - manifest_payload=manifest_payload, - ) - - -def _config_compatibility( - saved_config_hash: str | None, - requested_config_hash: str | None, - *, - requested_config_supplied: bool, -) -> USStageCompatibility: - if not requested_config_supplied: - return "not_evaluated" - if saved_config_hash is None: - return "missing_saved_config" - return "match" if saved_config_hash == requested_config_hash else "mismatch" - - -def _config_hash(config: Any) -> str | None: - if not isinstance(config, Mapping): - return None - return _hash_json(_canonical_config(config)) - - -def _stage_config_hash(stage_id: str, config: Any) -> str | None: - keys = config_keys_for_us_pipeline_stage(stage_id) - if not keys: - return _hash_json({}) - if not 
isinstance(config, Mapping): - return None - scoped = {key: config.get(key) for key in keys if key in config} - return _hash_json(_canonical_config(scoped)) - - -def _canonical_config(config: Mapping[str, Any]) -> dict[str, Any]: - return { - str(key): _normalize_config_value(value) - for key, value in sorted(config.items()) - if key not in US_CONFIG_REUSE_IGNORED_KEYS - } - - -def _normalize_config_value(value: Any) -> Any: - if isinstance(value, Mapping): - return { - str(key): _normalize_config_value(item) - for key, item in sorted(value.items()) - } - if isinstance(value, (list, tuple)): - return [_normalize_config_value(item) for item in value] - if isinstance(value, Path): - return str(value) - return value - - -def _hash_json(payload: Any) -> str: - return hashlib.sha256( - json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8") - ).hexdigest() - - -def _artifact_label(artifact: Mapping[str, Any]) -> str: - return f"{artifact.get('stageId')}.{artifact.get('key')}" - - -def _optional_str(value: Any) -> str | None: - if value is None: - return None - return str(value) - - -def _write_json_atomically(path: Path, payload: Mapping[str, Any]) -> None: - temporary = path.with_suffix(path.suffix + ".tmp") - temporary.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temporary.replace(path) - - -__all__ = [ - "US_CONDITIONAL_READINESS_SCHEMA_VERSION", - "US_CONFIG_REUSE_IGNORED_KEYS", - "USConditionalReadinessReport", - "USConditionalReadinessStageRecord", - "USStageCompatibility", - "USStageReadiness", - "build_us_conditional_readiness_report", - "build_us_stage_reuse_key", - "load_us_conditional_readiness_report", - "write_us_conditional_readiness_report", -] diff --git a/src/microplex_us/pipelines/stage_resume.py b/src/microplex_us/pipelines/stage_resume.py deleted file mode 100644 index a0324b90..00000000 --- a/src/microplex_us/pipelines/stage_resume.py +++ /dev/null @@ -1,430 +0,0 @@ -"""Preflight checks for resuming canonical US 
pipeline stages.""" - -from __future__ import annotations - -import json -from collections.abc import Mapping -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - US_STAGE_CONTRACT_VERSION, - canonicalize_us_pipeline_stage_id, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, -) -from microplex_us.pipelines.stage_run import ( - resolve_us_manifest_or_contract_artifact_path, -) - -_POLICYENGINE_ENTITY_BUNDLE_TABLES = ( - "households", - "persons", - "tax_units", - "spm_units", - "families", - "marital_units", -) - - -@dataclass(frozen=True) -class USStageResumeArtifactRequirement: - """One durable artifact required before a stage resume can start.""" - - stage_id: str - artifact_key: str - reason: str - - @property - def label(self) -> str: - return f"{self.stage_id}.{self.artifact_key}" - - -@dataclass(frozen=True) -class USStageResumeMissingRequirement: - """One missing input detected by the resume preflight.""" - - label: str - reason: str - path: Path | None = None - - def format(self) -> str: - path_text = f" ({self.path})" if self.path is not None else "" - return f"{self.label}: {self.reason}{path_text}" - - -@dataclass(frozen=True) -class USStageResumePreflightResult: - """Result of checking whether a saved run can resume at one stage.""" - - artifact_root: Path - resume_from_stage: str - missing: tuple[USStageResumeMissingRequirement, ...] - - @property - def ok(self) -> bool: - return not self.missing - - def raise_for_missing(self) -> None: - if self.ok: - return - raise ValueError(format_us_stage_resume_preflight_error(self)) - - -def preflight_us_stage_resume( - artifact_root: str | Path, - resume_from_stage: str, - *, - extra_required_artifacts: tuple[USStageResumeArtifactRequirement, ...] 
= (), -) -> USStageResumePreflightResult: - """Validate durable inputs before resuming a saved US stage run.""" - - root = Path(artifact_root).expanduser() - stage_id = canonicalize_us_pipeline_stage_id(resume_from_stage) - if stage_id not in US_CANONICAL_STAGE_IDS: - raise ValueError(f"Unknown US pipeline stage: {resume_from_stage}") - - missing: list[USStageResumeMissingRequirement] = [] - if not root.exists(): - missing.append( - USStageResumeMissingRequirement( - label="artifact_root", - reason="saved run directory does not exist", - path=root, - ) - ) - return USStageResumePreflightResult(root, stage_id, tuple(missing)) - if not root.is_dir(): - missing.append( - USStageResumeMissingRequirement( - label="artifact_root", - reason="saved run path is not a directory", - path=root, - ) - ) - return USStageResumePreflightResult(root, stage_id, tuple(missing)) - - stage_index = US_CANONICAL_STAGE_IDS.index(stage_id) - manifest = _load_json_if_available(root / "manifest.json") - if manifest is None and stage_index > 0: - missing.append( - USStageResumeMissingRequirement( - label="01_run_profile.manifest", - reason="top-level artifact manifest is missing or unreadable", - path=root / "manifest.json", - ) - ) - - if stage_index > 0: - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - missing.extend(_missing_completed_stage_requirements(root, previous_stage_id)) - for resource in get_us_pipeline_stage_contract(stage_id).inputs: - if ( - not resource.required - or resource.stage_id is None - or resource.kind not in {"artifact", "manifest", "stage_output"} - ): - continue - missing.extend( - _missing_stage_output_requirement( - root, - stage_id=resource.stage_id, - output_key=resource.key, - consumer_stage_id=stage_id, - ) - ) - - if manifest is not None: - for requirement in extra_required_artifacts: - path = resolve_us_manifest_or_contract_artifact_path( - root, - manifest, - requirement.artifact_key, - stage_id=requirement.stage_id, - ) - if not 
path.exists(): - missing.append( - USStageResumeMissingRequirement( - label=requirement.label, - reason=requirement.reason, - path=path, - ) - ) - else: - missing.extend( - _missing_artifact_format_requirements( - stage_id=requirement.stage_id, - artifact_key=requirement.artifact_key, - path=path, - label=requirement.label, - ) - ) - - return USStageResumePreflightResult(root, stage_id, _dedupe_missing(missing)) - - -def format_us_stage_resume_preflight_error( - result: USStageResumePreflightResult, -) -> str: - """Return a clear error message for a failed resume preflight.""" - - details = "\n".join(f"- {item.format()}" for item in result.missing) - return ( - "US pipeline resume preflight failed for " - f"{result.resume_from_stage} at {result.artifact_root}. " - "The rerun was not started because required durable inputs are missing:\n" - f"{details}" - ) - - -def _missing_completed_stage_requirements( - artifact_root: Path, - stage_id: str, -) -> tuple[USStageResumeMissingRequirement, ...]: - path = _stage_manifest_path(artifact_root, stage_id) - payload = _load_json_if_available(path) - if payload is None: - return ( - USStageResumeMissingRequirement( - label=f"{stage_id}.stage_manifest", - reason="stage output manifest is missing or unreadable", - path=path, - ), - ) - missing: list[USStageResumeMissingRequirement] = [] - if payload.get("contractVersion") != US_STAGE_CONTRACT_VERSION: - missing.append( - USStageResumeMissingRequirement( - label=f"{stage_id}.contractVersion", - reason=( - "stage output manifest uses stale contract version " - f"{payload.get('contractVersion')!r}; expected " - f"{US_STAGE_CONTRACT_VERSION!r}" - ), - path=path, - ) - ) - if payload.get("lifecycleStatus") != "complete" or not payload.get("complete"): - missing.append( - USStageResumeMissingRequirement( - label=f"{stage_id}.lifecycleStatus", - reason="stage is not marked complete", - path=path, - ) - ) - for output_key in tuple(payload.get("requiredOutputs") or ()): - 
missing.extend( - _missing_stage_output_requirement( - artifact_root, - stage_id=stage_id, - output_key=str(output_key), - consumer_stage_id=None, - ) - ) - return tuple(missing) - - -def _dedupe_missing( - missing: list[USStageResumeMissingRequirement], -) -> tuple[USStageResumeMissingRequirement, ...]: - deduped: list[USStageResumeMissingRequirement] = [] - seen: set[tuple[str, Path | None]] = set() - for item in missing: - key = (item.label, item.path) - if key in seen: - continue - seen.add(key) - deduped.append(item) - return tuple(deduped) - - -def _missing_stage_output_requirement( - artifact_root: Path, - *, - stage_id: str, - output_key: str, - consumer_stage_id: str | None, -) -> tuple[USStageResumeMissingRequirement, ...]: - path = _stage_manifest_path(artifact_root, stage_id) - payload = _load_json_if_available(path) - if payload is None: - return ( - USStageResumeMissingRequirement( - label=f"{stage_id}.{output_key}", - reason="source stage manifest is missing or unreadable", - path=path, - ), - ) - outputs = payload.get("outputs") - if not isinstance(outputs, Mapping) or output_key not in outputs: - consumer = f" required by {consumer_stage_id}" if consumer_stage_id else "" - return ( - USStageResumeMissingRequirement( - label=f"{stage_id}.{output_key}", - reason=f"required output is not recorded{consumer}", - path=path, - ), - ) - value = outputs[output_key] - missing = _missing_serialized_output_requirements( - artifact_root, - stage_id=stage_id, - output_key=output_key, - value=value, - ) - if not missing: - return () - consumer = f" required by {consumer_stage_id}" if consumer_stage_id else "" - return tuple( - USStageResumeMissingRequirement( - label=item.label, - reason=f"{item.reason}{consumer}", - path=item.path or path, - ) - for item in missing - ) - - -def _missing_serialized_output_requirements( - artifact_root: Path, - *, - stage_id: str, - output_key: str, - value: Any, -) -> tuple[USStageResumeMissingRequirement, ...]: - label = 
f"{stage_id}.{output_key}" - if value is None: - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - ), - ) - if isinstance(value, Mapping): - path = _serialized_output_path(artifact_root, value) - if path is not None: - if not path.exists(): - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - path=path, - ), - ) - return _missing_artifact_format_requirements( - stage_id=stage_id, - artifact_key=output_key, - path=path, - label=label, - ) - exists = value.get("exists") - if exists is not None: - if bool(exists): - return () - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - ), - ) - if value: - return () - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - ), - ) - if isinstance(value, (list, tuple, set, frozenset)): - if value: - return () - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - ), - ) - if isinstance(value, str): - if value: - return () - return ( - USStageResumeMissingRequirement( - label=label, - reason="required output is unavailable", - ), - ) - return () - - -def _missing_artifact_format_requirements( - *, - stage_id: str, - artifact_key: str, - path: Path, - label: str, -) -> tuple[USStageResumeMissingRequirement, ...]: - try: - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - except KeyError: - return () - if contract.format != "policyengine_entity_bundle": - return () - metadata_path = path / "metadata.json" if path.is_dir() else path - metadata = _load_json_if_available(metadata_path) - if metadata is None: - return ( - USStageResumeMissingRequirement( - label=label, - reason="PolicyEngine entity bundle metadata is missing or unreadable", - path=metadata_path, - ), - ) - missing: list[USStageResumeMissingRequirement] = [] - for table_name in 
_POLICYENGINE_ENTITY_BUNDLE_TABLES: - if metadata.get(table_name) is None: - continue - table_path = metadata_path.parent / f"{table_name}.parquet" - if not table_path.exists(): - missing.append( - USStageResumeMissingRequirement( - label=label, - reason=f"PolicyEngine entity bundle is missing {table_name}.parquet", - path=table_path, - ) - ) - return tuple(missing) - - -def _serialized_output_path(artifact_root: Path, value: Any) -> Path | None: - if not isinstance(value, Mapping): - return None - path_value = value.get("path") - if not path_value: - return None - path = Path(str(path_value)) - if not path.is_absolute(): - path = artifact_root / path - return path - - -def _stage_manifest_path(artifact_root: Path, stage_id: str) -> Path: - return artifact_root / "stage_artifacts" / "manifests" / f"{stage_id}.json" - - -def _load_json_if_available(path: Path) -> dict[str, Any] | None: - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return None - return payload if isinstance(payload, dict) else None - - -__all__ = [ - "USStageResumeArtifactRequirement", - "USStageResumeMissingRequirement", - "USStageResumePreflightResult", - "format_us_stage_resume_preflight_error", - "preflight_us_stage_resume", -] diff --git a/src/microplex_us/pipelines/stage_run.py b/src/microplex_us/pipelines/stage_run.py deleted file mode 100644 index bcf1ec39..00000000 --- a/src/microplex_us/pipelines/stage_run.py +++ /dev/null @@ -1,1549 +0,0 @@ -"""Shared stage-run writer for US Microplex saved-run manifests.""" - -from __future__ import annotations - -import json -from collections.abc import Mapping -from dataclasses import asdict, dataclass, field, fields, is_dataclass -from enum import Enum -from pathlib import Path -from typing import Any, Literal - -from microplex_us.pipelines.data_flow_snapshot import ( - write_us_microplex_data_flow_snapshot, -) -from microplex_us.pipelines.stage_artifacts import ( - build_us_stage_artifact_inventory, - 
write_us_stage_artifact_inventory, -) -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - US_STAGE_CONTRACT_VERSION, - StageArtifactFormat, - StageArtifactResumeRole, - StageResourceKind, - USStageResourceContract, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest import ( - build_us_validation_evidence_manifest, - write_us_stage_manifest, - write_us_validation_evidence_manifest, -) -from microplex_us.pipelines.stage_manifest_types import ( - USStageFailureRecord, - USStageLifecycleStatus, - USStageRuntimeEventRecord, -) -from microplex_us.pipelines.stage_readiness import ( - write_us_conditional_readiness_report, -) - -US_STAGE_OUTPUT_MANIFEST_SCHEMA_VERSION = 2 - -USArtifactCategory = Literal[ - "required_output", - "diagnostic", - "auxiliary", - "derived", -] - - -@dataclass(frozen=True) -class USArtifactRef: - """Reference to one artifact owned by a stage output manifest.""" - - key: str - path: str | Path - format: StageArtifactFormat = "unknown" - required: bool = False - category: USArtifactCategory = "required_output" - resume_role: StageArtifactResumeRole | None = None - assume_exists: bool = False - exists: bool | None = None - - def resolved_path(self, artifact_root: str | Path) -> Path: - path = Path(self.path) - if not path.is_absolute(): - path = Path(artifact_root) / path - return path - - def exists_under(self, artifact_root: str | Path) -> bool: - if self.assume_exists: - return True - if self.exists is not None: - return self.exists - return self.resolved_path(artifact_root).exists() - - def relative_path(self, artifact_root: str | Path) -> str: - path = self.resolved_path(artifact_root) - try: - return str(path.relative_to(Path(artifact_root))) - except ValueError: - return str(path) - - def to_dict(self, artifact_root: str | Path | None = None) -> dict[str, Any]: - payload = asdict(self) - payload["path"] = ( 
- self.relative_path(artifact_root) - if artifact_root is not None - else str(self.path) - ) - if artifact_root is not None: - payload["exists"] = self.exists_under(artifact_root) - return payload - - -@dataclass(frozen=True) -class USAuxiliaryArtifact: - """Optional artifact declared by a stage contract.""" - - key: str - path: str | Path - format: StageArtifactFormat = "unknown" - description: str = "" - assume_exists: bool = False - - def as_artifact_ref(self) -> USArtifactRef: - return USArtifactRef( - key=self.key, - path=self.path, - format=self.format, - category="auxiliary", - assume_exists=self.assume_exists, - ) - - -@dataclass(frozen=True) -class USDiagnosticOutput: - """Diagnostic output exposed by a stage manifest.""" - - key: str - description: str = "" - path: str | Path | None = None - summary: Mapping[str, Any] = field(default_factory=dict) - - def to_dict(self, artifact_root: str | Path | None = None) -> dict[str, Any]: - path = None - if self.path is not None: - resolved = Path(self.path) - if artifact_root is not None and not resolved.is_absolute(): - resolved = Path(artifact_root) / resolved - if artifact_root is not None: - try: - path = str(resolved.relative_to(Path(artifact_root))) - except ValueError: - path = str(resolved) - else: - path = str(self.path) - return { - "key": self.key, - "description": self.description, - "path": path, - "summary": dict(self.summary), - } - - -@dataclass(frozen=True) -class USStageInputOverride: - """Explicit override for a stage input that is not provided by the prior stage.""" - - stage_id: str - key: str - path: str | Path - reason: str | None = None - - def to_dict(self, artifact_root: str | Path | None = None) -> dict[str, Any]: - path = Path(self.path) - path_text = str(path) - if artifact_root is not None and not path.is_absolute(): - path_text = str(path) - return { - "stageId": self.stage_id, - "key": self.key, - "path": path_text, - "reason": self.reason, - } - - -@dataclass(frozen=True) -class 
USStageInputValidationSettings: - """Stage-specific settings for typed input boundary validation.""" - - stage_id: str - require_previous_stage_manifest: bool = True - enforce_required_stage_inputs: bool = True - enforce_only_when_stage_complete: bool = True - enforced_resource_kinds: tuple[StageResourceKind, ...] = ( - "artifact", - "manifest", - "stage_output", - ) - - -@dataclass(frozen=True) -class USStageOutputManifest: - """Base type for one typed stage output manifest.""" - - schema_version: int = US_STAGE_OUTPUT_MANIFEST_SCHEMA_VERSION - contract_version: str = US_STAGE_CONTRACT_VERSION - input_stage_manifest: str | Path | None = None - diagnostics: Mapping[str, USDiagnosticOutput] = field(default_factory=dict) - auxiliary_artifacts: Mapping[str, USAuxiliaryArtifact] = field(default_factory=dict) - metadata: Mapping[str, Any] = field(default_factory=dict) - complete: bool = True - lifecycle_status: USStageLifecycleStatus | None = None - started_at: str | None = None - updated_at: str | None = None - completed_at: str | None = None - failed_at: str | None = None - deferred_reason: str | None = None - failure: USStageFailureRecord | None = None - events: tuple[USStageRuntimeEventRecord, ...] 
= () - stage_id: str = field(default="", init=False) - - def required_output_keys(self) -> tuple[str, ...]: - """Return required output keys from the canonical stage contract.""" - - contract = get_us_pipeline_stage_contract(self.stage_id) - return tuple(resource.key for resource in contract.outputs if resource.required) - - def artifact_refs(self) -> dict[str, USArtifactRef]: - """Return artifact references carried by this stage output manifest.""" - - refs: dict[str, USArtifactRef] = {} - for item in fields(self): - value = getattr(self, item.name) - if isinstance(value, USArtifactRef): - refs[value.key] = value - for artifact in self.auxiliary_artifacts.values(): - refs[artifact.key] = artifact.as_artifact_ref() - return refs - - def missing_required_outputs(self, artifact_root: str | Path) -> tuple[str, ...]: - """Return required output keys not provided or not present on disk.""" - - missing: list[str] = [] - for key in self.required_output_keys(): - value = getattr(self, key, None) - if _required_output_is_missing(value, artifact_root): - missing.append(key) - return tuple(missing) - - def to_dict( - self, - artifact_root: str | Path | None = None, - *, - input_stage_manifest: str | None = None, - input_overrides: tuple[USStageInputOverride, ...] 
= (), - ) -> dict[str, Any]: - """Serialize this typed output manifest.""" - - diagnostics = { - key: diagnostic.to_dict(artifact_root) - for key, diagnostic in self.diagnostics.items() - } - auxiliary = { - key: artifact.as_artifact_ref().to_dict(artifact_root) - for key, artifact in self.auxiliary_artifacts.items() - } - output_fields = { - item.name: _serialize_value(getattr(self, item.name), artifact_root) - for item in fields(self) - if item.name - not in { - "schema_version", - "contract_version", - "input_stage_manifest", - "diagnostics", - "auxiliary_artifacts", - "metadata", - "complete", - "lifecycle_status", - "started_at", - "updated_at", - "completed_at", - "failed_at", - "deferred_reason", - "failure", - "events", - "stage_id", - } - } - return { - "schemaVersion": self.schema_version, - "contractVersion": self.contract_version, - "stageId": self.stage_id, - "complete": self.complete, - "lifecycleStatus": self.resolved_lifecycle_status(), - "startedAt": self.started_at, - "updatedAt": self.updated_at, - "completedAt": self.completed_at, - "failedAt": self.failed_at, - "deferredReason": self.deferred_reason, - "failure": self.failure, - "events": [_serialize_value(event, artifact_root) for event in self.events], - "inputStageManifest": input_stage_manifest - or _optional_str(self.input_stage_manifest), - "inputOverrides": [ - override.to_dict(artifact_root) for override in input_overrides - ], - "requiredOutputs": list(self.required_output_keys()), - "missingRequiredOutputs": ( - list(self.missing_required_outputs(artifact_root)) - if artifact_root is not None - else [] - ), - "outputs": output_fields, - "diagnostics": diagnostics, - "auxiliaryArtifacts": auxiliary, - "metadata": dict(self.metadata), - } - - def resolved_lifecycle_status(self) -> USStageLifecycleStatus: - """Return explicit lifecycle state or the legacy completion default.""" - - if self.lifecycle_status is not None: - return self.lifecycle_status - return "complete" if self.complete 
else "pending" - - -@dataclass(frozen=True) -class USRunProfileOutputs(USStageOutputManifest): - stage_id: str = field(default="01_run_profile", init=False) - manifest: USArtifactRef | None = None - resolved_config: Mapping[str, Any] = field(default_factory=dict) - provider_query_plan: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USSourceLoadingOutputs(USStageOutputManifest): - stage_id: str = field(default="02_source_loading", init=False) - observation_frame_summary: Mapping[str, Any] = field(default_factory=dict) - source_descriptors: tuple[str, ...] = () - source_relationships: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USSourcePlanningOutputs(USStageOutputManifest): - stage_id: str = field(default="03_source_planning", init=False) - source_plan: USArtifactRef | None = None - scaffold_selection: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USSeedScaffoldOutputs(USStageOutputManifest): - stage_id: str = field(default="04_seed_scaffold", init=False) - scaffold_seed_data: USArtifactRef | None = None - seed_schema_metadata: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USDonorSynthesisOutputs(USStageOutputManifest): - stage_id: str = field(default="05_donor_integration_synthesis", init=False) - seed_data: USArtifactRef | None = None - synthetic_data: USArtifactRef | None = None - synthesis_metadata: Mapping[str, Any] = field(default_factory=dict) - source_weight_diagnostics: USArtifactRef | None = None - - -@dataclass(frozen=True) -class USPolicyEngineEntityOutputs(USStageOutputManifest): - stage_id: str = field(default="06_policyengine_entities", init=False) - pre_calibration_policyengine_entity_tables: USArtifactRef | None = None - materialized_policyengine_inputs: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USCalibrationOutputs(USStageOutputManifest): - stage_id: str = 
field(default="07_calibration", init=False) - calibrated_data: USArtifactRef | None = None - targets: USArtifactRef | None = None - calibration_summary: USArtifactRef | None = None - policyengine_entity_tables: USArtifactRef | None = None - target_ledger: Mapping[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class USDatasetAssemblyOutputs(USStageOutputManifest): - stage_id: str = field(default="08_dataset_assembly", init=False) - policyengine_dataset: USArtifactRef | None = None - stage_manifest: USArtifactRef | None = None - data_flow_snapshot: USArtifactRef | None = None - artifact_inventory: USArtifactRef | None = None - conditional_readiness: USArtifactRef | None = None - - -@dataclass(frozen=True) -class USValidationBenchmarkingOutputs(USStageOutputManifest): - stage_id: str = field(default="09_validation_benchmarking", init=False) - validation_evidence: USArtifactRef | None = None - benchmark_summary: Mapping[str, Any] = field(default_factory=dict) - policyengine_harness: USArtifactRef | None = None - policyengine_native_scores: USArtifactRef | None = None - policyengine_native_audit: USArtifactRef | None = None - policyengine_native_target_diagnostics: USArtifactRef | None = None - imputation_ablation: USArtifactRef | None = None - child_tax_unit_agi_drift: USArtifactRef | None = None - - -US_STAGE_OUTPUT_MANIFEST_TYPES: dict[str, type[USStageOutputManifest]] = { - "01_run_profile": USRunProfileOutputs, - "02_source_loading": USSourceLoadingOutputs, - "03_source_planning": USSourcePlanningOutputs, - "04_seed_scaffold": USSeedScaffoldOutputs, - "05_donor_integration_synthesis": USDonorSynthesisOutputs, - "06_policyengine_entities": USPolicyEngineEntityOutputs, - "07_calibration": USCalibrationOutputs, - "08_dataset_assembly": USDatasetAssemblyOutputs, - "09_validation_benchmarking": USValidationBenchmarkingOutputs, -} - - -class USStageRunWriter: - """Validate and write typed US stage output manifests as one run.""" - - def __init__( - 
self, - artifact_root: str | Path, - *, - manifest_payload: Mapping[str, Any] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] = (), - ) -> None: - self.artifact_root = Path(artifact_root) - self.manifest_payload: dict[str, Any] = dict(manifest_payload or {}) - self.allow_stage_input_overrides = allow_stage_input_overrides - self.stage_input_overrides = tuple(stage_input_overrides) - if self.stage_input_overrides and not self.allow_stage_input_overrides: - raise ValueError( - "Stage input overrides require allow_stage_input_overrides=True" - ) - for override in self.stage_input_overrides: - _validate_us_stage_input_override(override) - self._recorded: dict[str, USStageOutputManifest] = {} - self._input_validator = USStageInputValidator( - self.artifact_root, - self._recorded, - allow_stage_input_overrides=self.allow_stage_input_overrides, - stage_input_overrides=self.stage_input_overrides, - ) - - @property - def recorded_stages(self) -> tuple[USStageOutputManifest, ...]: - """Return recorded stages in canonical order.""" - - return tuple( - self._recorded[stage_id] - for stage_id in US_CANONICAL_STAGE_IDS - if stage_id in self._recorded - ) - - def update(self, outputs: USStageOutputManifest) -> None: - """Record one whole typed stage output manifest.""" - - self.record_stage(outputs) - - def record_stage(self, outputs: USStageOutputManifest) -> None: - """Validate and record one whole typed stage output manifest.""" - - self.validate_stage(outputs) - self.validate_transition(outputs) - self._recorded[outputs.stage_id] = outputs - - def validate_stage(self, outputs: USStageOutputManifest) -> None: - """Validate one typed stage output manifest against its contract.""" - - expected_type = US_STAGE_OUTPUT_MANIFEST_TYPES.get(outputs.stage_id) - if expected_type is None: - raise KeyError(f"Unknown US stage output manifest: {outputs.stage_id}") - if not isinstance(outputs, expected_type): - raise 
TypeError( - f"{outputs.stage_id} must use {expected_type.__name__}, " - f"got {type(outputs).__name__}" - ) - get_us_pipeline_stage_contract(outputs.stage_id) - if not outputs.diagnostics: - raise ValueError(f"{outputs.stage_id} does not expose diagnostics") - missing = outputs.missing_required_outputs(self.artifact_root) - if outputs.complete and missing: - raise ValueError( - f"{outputs.stage_id} is marked complete but is missing required " - f"outputs: {', '.join(missing)}" - ) - contract_artifact_keys = { - artifact.key - for artifact in get_us_pipeline_stage_contract(outputs.stage_id).artifacts - } - for artifact in outputs.auxiliary_artifacts.values(): - if artifact.key not in contract_artifact_keys: - raise KeyError( - f"{outputs.stage_id} auxiliary artifact {artifact.key!r} " - "is not declared by the stage contract" - ) - for artifact in outputs.artifact_refs().values(): - if artifact.key not in contract_artifact_keys: - raise KeyError( - f"{outputs.stage_id} artifact {artifact.key!r} is not declared " - "by the stage contract" - ) - - def validate_transition(self, outputs: USStageOutputManifest) -> None: - """Validate that a stage consumes the previous stage output manifest.""" - - self._input_validator.validate(outputs) - - def write_manifest_files(self) -> dict[str, Any]: - """Write per-stage manifests and derived aggregate run manifests.""" - - self.artifact_root.mkdir(parents=True, exist_ok=True) - manifest = self._materialize_manifest_payload() - stage_manifest_path = resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "stage_manifest", - ) - data_flow_snapshot_path = resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ) - artifact_inventory_path = resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "artifact_inventory", - ) - conditional_readiness_path = resolve_us_stage_artifact_contract_path( - 
self.artifact_root, - "08_dataset_assembly", - "conditional_readiness", - ) - manifest_path = resolve_us_stage_artifact_contract_path( - self.artifact_root, - "01_run_profile", - "manifest", - ) - validation_evidence_name = dict(manifest.get("artifacts", {})).get( - "validation_evidence" - ) - - _write_json_atomically(manifest_path, manifest) - if validation_evidence_name: - validation_evidence_path = self._resolve_path(validation_evidence_name) - write_us_validation_evidence_manifest( - self.artifact_root, - validation_evidence_path, - manifest_payload=manifest, - ) - write_us_microplex_data_flow_snapshot( - self.artifact_root, - data_flow_snapshot_path, - manifest_payload=manifest, - assume_existing_stage_artifact_keys=( - "stage_manifest", - "artifact_inventory", - "conditional_readiness", - ), - ) - write_us_stage_manifest( - self.artifact_root, - stage_manifest_path, - manifest_payload=manifest, - assume_existing_artifact_keys=( - "artifact_inventory", - "conditional_readiness", - ), - ) - readiness_inventory = build_us_stage_artifact_inventory( - self.artifact_root, - manifest_payload=manifest, - assume_existing_artifact_keys=( - "artifact_inventory", - "conditional_readiness", - ), - ) - write_us_conditional_readiness_report( - self.artifact_root, - conditional_readiness_path, - manifest_payload=manifest, - artifact_inventory=readiness_inventory, - ) - write_us_stage_artifact_inventory( - self.artifact_root, - artifact_inventory_path, - manifest_payload=manifest, - assume_existing_artifact_keys=("artifact_inventory",), - ) - return manifest - - def _materialize_manifest_payload(self) -> dict[str, Any]: - manifest = dict(self.manifest_payload) - artifacts = dict(manifest.get("artifacts", {})) - stage_manifest_paths: dict[str, str] = {} - - for stage_id in US_CANONICAL_STAGE_IDS: - outputs = self._recorded.get(stage_id) - if outputs is None: - continue - stage_manifest_path = self._stage_output_manifest_path(stage_id) - 
stage_manifest_path.parent.mkdir(parents=True, exist_ok=True) - stage_manifest_paths[stage_id] = str( - stage_manifest_path.relative_to(self.artifact_root) - ) - for artifact in outputs.artifact_refs().values(): - artifacts[artifact.key] = artifact.relative_path(self.artifact_root) - - self._ensure_aggregate_artifact_paths(artifacts) - manifest["artifacts"] = artifacts - manifest["stage_output_manifests"] = stage_manifest_paths - manifest.setdefault("diagnostics", {}) - for stage_id, outputs in self._recorded.items(): - manifest["diagnostics"].setdefault( - stage_id, - { - key: diagnostic.to_dict(self.artifact_root) - for key, diagnostic in outputs.diagnostics.items() - }, - ) - for stage_id, outputs in self._recorded.items(): - stage_manifest_path = self._stage_output_manifest_path(stage_id) - _write_json_atomically( - stage_manifest_path, - outputs.to_dict( - self.artifact_root, - input_stage_manifest=self._previous_stage_manifest_ref(stage_id), - input_overrides=self._overrides_for_stage(stage_id), - ), - ) - self.manifest_payload = manifest - return manifest - - def _ensure_aggregate_artifact_paths(self, artifacts: dict[str, Any]) -> None: - artifacts.setdefault( - "stage_manifest", - resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "stage_manifest", - ).name, - ) - artifacts.setdefault( - "data_flow_snapshot", - resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "data_flow_snapshot", - ).name, - ) - artifacts.setdefault( - "artifact_inventory", - str( - resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "artifact_inventory", - ).relative_to(self.artifact_root) - ), - ) - artifacts.setdefault( - "conditional_readiness", - str( - resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "conditional_readiness", - ).relative_to(self.artifact_root) - ), - ) - - def _stage_output_manifest_path(self, stage_id: str) -> 
Path: - return self.artifact_root / "stage_artifacts" / "manifests" / f"{stage_id}.json" - - def _previous_stage_manifest_ref(self, stage_id: str) -> str | None: - stage_index = US_CANONICAL_STAGE_IDS.index(stage_id) - if stage_index == 0: - return None - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - if previous_stage_id not in self._recorded: - return None - return str( - self._stage_output_manifest_path(previous_stage_id).relative_to( - self.artifact_root - ) - ) - - def _overrides_for_stage(self, stage_id: str) -> tuple[USStageInputOverride, ...]: - return tuple( - override - for override in self.stage_input_overrides - if override.stage_id == stage_id - ) - - def _resolve_path(self, value: Any) -> Path: - path = Path(str(value)) - if not path.is_absolute(): - path = self.artifact_root / path - return path - - -class USStageInputValidator: - """Validate stage input seams against typed stage manifests and overrides.""" - - def __init__( - self, - artifact_root: str | Path, - recorded_stages: Mapping[str, USStageOutputManifest], - *, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), - settings_by_stage: Mapping[str, USStageInputValidationSettings] | None = None, - ) -> None: - self.artifact_root = Path(artifact_root) - self.recorded_stages = recorded_stages - self.allow_stage_input_overrides = allow_stage_input_overrides - self.stage_input_overrides = tuple(stage_input_overrides) - self.settings_by_stage = dict( - settings_by_stage or default_us_stage_input_validation_settings() - ) - - def validate(self, outputs: USStageOutputManifest) -> None: - """Validate one stage's required input boundary.""" - - stage_index = US_CANONICAL_STAGE_IDS.index(outputs.stage_id) - if stage_index == 0: - return - settings = self.settings_by_stage[outputs.stage_id] - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - required_stage_inputs = tuple( - resource - for resource in get_us_pipeline_stage_contract(outputs.stage_id).inputs - if self._enforces_resource(resource, settings) - ) - missing_inputs = tuple( - self._resource_label(resource) - for resource in required_stage_inputs - if not self._resource_is_satisfied(resource, outputs) - ) - previous_inputs = tuple( - resource - for resource in required_stage_inputs - if resource.stage_id == previous_stage_id - ) - previous_stage_available = self._stage_manifest_available( - previous_stage_id, - outputs, - ) - previous_stage_overridden = bool(previous_inputs) and all( - self._override_satisfies(outputs.stage_id, resource) - for resource in previous_inputs - ) - if ( - settings.require_previous_stage_manifest - and not previous_stage_available - and not previous_stage_overridden - ): - detail = ( - f"; missing required inputs: {', '.join(missing_inputs)}" - if missing_inputs - else "" - ) - raise ValueError( - f"{outputs.stage_id} requires {previous_stage_id} output manifest " - "or explicit overrides for all required inputs from that stage" - f"{detail}" - ) - if ( - settings.enforce_required_stage_inputs - and missing_inputs - and (not settings.enforce_only_when_stage_complete or 
outputs.complete) - ): - raise ValueError( - f"{outputs.stage_id} is missing required stage input(s): " - f"{', '.join(missing_inputs)}" - ) - - def _enforces_resource( - self, - resource: USStageResourceContract, - settings: USStageInputValidationSettings, - ) -> bool: - return ( - resource.required - and resource.stage_id is not None - and resource.kind in settings.enforced_resource_kinds - ) - - def _resource_is_satisfied( - self, - resource: USStageResourceContract, - outputs: USStageOutputManifest, - ) -> bool: - if self._override_satisfies(outputs.stage_id, resource): - return True - source_stage_id = resource.stage_id - if source_stage_id is None: - return False - recorded_outputs = self.recorded_stages.get(source_stage_id) - if recorded_outputs is not None: - return not _required_output_is_missing( - getattr(recorded_outputs, resource.key, None), - self.artifact_root, - ) - serialized_stage = self._serialized_input_stage_manifest(outputs) - if ( - serialized_stage is not None - and serialized_stage.get("stageId") == source_stage_id - ): - return _serialized_output_key_is_available( - serialized_stage, - resource.key, - ) - return False - - def _stage_manifest_available( - self, - stage_id: str, - outputs: USStageOutputManifest, - ) -> bool: - if stage_id in self.recorded_stages: - return True - serialized_stage = self._serialized_input_stage_manifest(outputs) - return ( - serialized_stage is not None and serialized_stage.get("stageId") == stage_id - ) - - def _serialized_input_stage_manifest( - self, - outputs: USStageOutputManifest, - ) -> Mapping[str, Any] | None: - if outputs.input_stage_manifest is None: - return None - stage_index = US_CANONICAL_STAGE_IDS.index(outputs.stage_id) - if stage_index == 0: - return None - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - path = Path(outputs.input_stage_manifest) - if not path.is_absolute(): - path = self.artifact_root / path - expected_path = ( - self.artifact_root - / "stage_artifacts" - / 
"manifests" - / f"{previous_stage_id}.json" - ) - if path != expected_path or not path.exists(): - return None - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return None - return payload if isinstance(payload, Mapping) else None - - def _override_satisfies( - self, - stage_id: str, - resource: USStageResourceContract, - ) -> bool: - if not self.allow_stage_input_overrides: - return False - return any( - override.stage_id == stage_id and override.key == resource.key - for override in self.stage_input_overrides - ) - - @staticmethod - def _resource_label(resource: USStageResourceContract) -> str: - return f"{resource.stage_id}.{resource.key}" - - -def build_us_stage_output_manifests_from_artifact_manifest( - artifact_root: str | Path, - manifest_payload: Mapping[str, Any], -) -> tuple[USStageOutputManifest, ...]: - """Build typed stage output manifests from an existing artifact manifest.""" - - root = Path(artifact_root) - manifest = dict(manifest_payload) - synthesis = dict(manifest.get("synthesis", {})) - rows = dict(manifest.get("rows", {})) - config = dict(manifest.get("config", {})) - artifacts = dict(manifest.get("artifacts", {})) - source_names = tuple( - str(source) - for source in synthesis.get("source_names", ()) - if isinstance(source, str) - ) - benchmark_summary, has_benchmark_evidence = _benchmark_summary(root, manifest) - has_benchmark = bool(benchmark_summary) and has_benchmark_evidence - has_dataset = _artifact_exists(root, artifacts, "policyengine_dataset") - return ( - USRunProfileOutputs( - manifest=_artifact_ref( - root, - {"manifest": artifacts.get("manifest", "manifest.json")}, - "manifest", - "01_run_profile", - assume_exists=True, - ), - resolved_config=config, - provider_query_plan={"source_names": list(source_names)}, - diagnostics=_diagnostics("01_run_profile", manifest), - complete=bool(config), - ), - USSourceLoadingOutputs( - observation_frame_summary={"source_count": len(source_names)}, - 
source_descriptors=source_names, - source_relationships={"status": "summarized"}, - diagnostics=_diagnostics("02_source_loading", manifest), - complete=bool(source_names), - ), - USSourcePlanningOutputs( - source_plan=_artifact_ref( - root, artifacts, "source_plan", "03_source_planning" - ), - scaffold_selection={"scaffold_source": synthesis.get("scaffold_source")}, - diagnostics=_diagnostics("03_source_planning", manifest), - complete=_artifact_exists(root, artifacts, "source_plan"), - ), - USSeedScaffoldOutputs( - scaffold_seed_data=_artifact_ref( - root, - artifacts, - "scaffold_seed_data", - "04_seed_scaffold", - ), - seed_schema_metadata={"seed_rows": rows.get("seed")}, - diagnostics=_diagnostics("04_seed_scaffold", manifest), - complete=_artifact_exists(root, artifacts, "scaffold_seed_data"), - ), - USDonorSynthesisOutputs( - seed_data=_artifact_ref( - root, - artifacts, - "seed_data", - "05_donor_integration_synthesis", - ), - synthetic_data=_artifact_ref( - root, - artifacts, - "synthetic_data", - "05_donor_integration_synthesis", - ), - synthesis_metadata=synthesis, - source_weight_diagnostics=_artifact_ref( - root, - artifacts, - "source_weight_diagnostics", - "05_donor_integration_synthesis", - category="diagnostic", - ), - diagnostics=_diagnostics("05_donor_integration_synthesis", manifest), - complete=all( - _artifact_exists(root, artifacts, key) - for key in ("seed_data", "synthetic_data") - ), - ), - USPolicyEngineEntityOutputs( - pre_calibration_policyengine_entity_tables=_artifact_ref( - root, - artifacts, - "pre_calibration_policyengine_entity_tables", - "06_policyengine_entities", - ), - materialized_policyengine_inputs=_policyengine_entity_metadata_summary( - root, - artifacts, - artifact_key="pre_calibration_policyengine_entity_tables", - ), - diagnostics=_diagnostics("06_policyengine_entities", manifest), - complete=_artifact_exists( - root, - artifacts, - "pre_calibration_policyengine_entity_tables", - ), - ), - USCalibrationOutputs( - 
calibrated_data=_artifact_ref( - root, artifacts, "calibrated_data", "07_calibration" - ), - targets=_artifact_ref(root, artifacts, "targets", "07_calibration"), - calibration_summary=_artifact_ref( - root, - artifacts, - "calibration_summary", - "07_calibration", - category="diagnostic", - ), - policyengine_entity_tables=_artifact_ref( - root, - artifacts, - "policyengine_entity_tables", - "07_calibration", - ), - target_ledger={"target_count": manifest.get("targets", {})}, - diagnostics=_diagnostics("07_calibration", manifest), - complete=all( - _artifact_exists(root, artifacts, key) - for key in ( - "pre_calibration_policyengine_entity_tables", - "calibrated_data", - "targets", - "calibration_summary", - "policyengine_entity_tables", - ) - ), - ), - USDatasetAssemblyOutputs( - policyengine_dataset=_artifact_ref( - root, - artifacts, - "policyengine_dataset", - "08_dataset_assembly", - ), - stage_manifest=_derived_artifact_ref( - root, "stage_manifest", "08_dataset_assembly" - ), - data_flow_snapshot=_derived_artifact_ref( - root, - "data_flow_snapshot", - "08_dataset_assembly", - ), - artifact_inventory=_derived_artifact_ref( - root, - "artifact_inventory", - "08_dataset_assembly", - ), - conditional_readiness=_derived_artifact_ref( - root, - "conditional_readiness", - "08_dataset_assembly", - ), - diagnostics=_diagnostics("08_dataset_assembly", manifest), - complete=bool(has_dataset), - ), - USValidationBenchmarkingOutputs( - validation_evidence=( - _derived_artifact_ref( - root, - "validation_evidence", - "09_validation_benchmarking", - ) - if has_dataset or has_benchmark - else None - ), - benchmark_summary=benchmark_summary, - policyengine_harness=_artifact_ref( - root, - artifacts, - "policyengine_harness", - "09_validation_benchmarking", - category="diagnostic", - ), - policyengine_native_scores=_artifact_ref( - root, - artifacts, - "policyengine_native_scores", - "09_validation_benchmarking", - category="diagnostic", - ), - 
policyengine_native_audit=_artifact_ref( - root, - artifacts, - "policyengine_native_audit", - "09_validation_benchmarking", - category="diagnostic", - ), - policyengine_native_target_diagnostics=_artifact_ref( - root, - artifacts, - "policyengine_native_target_diagnostics", - "09_validation_benchmarking", - category="diagnostic", - ), - imputation_ablation=_artifact_ref( - root, - artifacts, - "imputation_ablation", - "09_validation_benchmarking", - category="diagnostic", - ), - child_tax_unit_agi_drift=_artifact_ref( - root, - artifacts, - "child_tax_unit_agi_drift", - "09_validation_benchmarking", - category="diagnostic", - ), - diagnostics=_diagnostics( - "09_validation_benchmarking", - manifest, - stage_summary=benchmark_summary, - ), - complete=bool(has_benchmark), - lifecycle_status=( - "complete" if has_benchmark else "deferred" if has_dataset else None - ), - deferred_reason=( - None - if has_benchmark - else "Stage 8 dataset exists, but validation or benchmark evidence is not attached." - if has_dataset - else None - ), - ), - ) - - -def write_us_stage_run_manifests_from_artifact_manifest( - artifact_root: str | Path, - manifest_payload: Mapping[str, Any], - *, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), -) -> dict[str, Any]: - """Write typed stage manifests and aggregate outputs from an artifact manifest.""" - - writer = USStageRunWriter( - artifact_root, - manifest_payload=manifest_payload, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - for outputs in build_us_stage_output_manifests_from_artifact_manifest( - artifact_root, - manifest_payload, - ): - writer.record_stage(outputs) - return writer.write_manifest_files() - - -def resolve_us_manifest_or_contract_artifact_path( - artifact_root: str | Path, - manifest_payload: Mapping[str, Any], - artifact_key: str, - *, - stage_id: str, -) -> Path: - """Resolve an artifact from the manifest first, then the stage contract.""" - - artifacts = dict(manifest_payload.get("artifacts", {})) - declared = artifacts.get(artifact_key) - if declared is not None: - path = Path(str(declared)) - if not path.is_absolute(): - path = Path(artifact_root) / path - return path - return resolve_us_stage_artifact_contract_path( - artifact_root, stage_id, artifact_key - ) - - -def parse_us_stage_input_override(value: str) -> USStageInputOverride: - """Parse STAGE_ID.KEY=PATH into a stage input override.""" - - if "=" not in value: - raise ValueError("Stage input overrides must use STAGE_ID.KEY=PATH syntax") - left, path = value.split("=", 1) - if "." 
not in left: - raise ValueError("Stage input overrides must use STAGE_ID.KEY=PATH syntax") - stage_id, key = left.split(".", 1) - if not stage_id or not key or not path: - raise ValueError("Stage input overrides must use STAGE_ID.KEY=PATH syntax") - if stage_id not in US_CANONICAL_STAGE_IDS: - raise ValueError(f"Unknown US pipeline stage: {stage_id}") - override = USStageInputOverride(stage_id=stage_id, key=key, path=path) - _validate_us_stage_input_override(override) - return override - - -def default_us_stage_input_validation_settings() -> dict[ - str, USStageInputValidationSettings -]: - """Return stage-specific settings for typed input boundary validation.""" - - return { - stage_id: USStageInputValidationSettings( - stage_id=stage_id, - require_previous_stage_manifest=stage_id != "01_run_profile", - ) - for stage_id in US_CANONICAL_STAGE_IDS - } - - -def _validate_us_stage_input_override(override: USStageInputOverride) -> None: - if override.stage_id not in US_CANONICAL_STAGE_IDS: - raise ValueError(f"Unknown US pipeline stage: {override.stage_id}") - contract = get_us_pipeline_stage_contract(override.stage_id) - input_keys = {resource.key for resource in contract.inputs} - if override.key not in input_keys: - valid_keys = ", ".join(sorted(input_keys)) or "none" - raise ValueError( - f"Unknown input override key {override.stage_id}.{override.key}; " - f"valid keys: {valid_keys}" - ) - - -def _artifact_ref( - artifact_root: Path, - artifacts: Mapping[str, Any], - artifact_key: str, - stage_id: str, - *, - category: USArtifactCategory = "required_output", - assume_exists: bool = False, -) -> USArtifactRef | None: - declared = artifacts.get(artifact_key) - if declared is None: - return None - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - return USArtifactRef( - key=artifact_key, - path=str(declared), - format=contract.format, - required=contract.required, - category=category, - resume_role=contract.resume_role, - assume_exists=assume_exists, 
- exists=_artifact_path_exists(artifact_root, declared), - ) - - -def _derived_artifact_ref( - artifact_root: Path, - artifact_key: str, - stage_id: str, -) -> USArtifactRef: - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - path = resolve_us_stage_artifact_contract_path( - artifact_root, stage_id, artifact_key - ) - return USArtifactRef( - key=artifact_key, - path=str(path.relative_to(artifact_root)), - format=contract.format, - required=contract.required, - category="derived", - resume_role=contract.resume_role, - assume_exists=True, - ) - - -def _artifact_exists( - artifact_root: Path, - artifacts: Mapping[str, Any], - artifact_key: str, -) -> bool: - declared = artifacts.get(artifact_key) - return declared is not None and _artifact_path_exists(artifact_root, declared) - - -def _artifact_path_exists(artifact_root: Path, value: Any) -> bool: - path = Path(str(value)) - if not path.is_absolute(): - path = artifact_root / path - return path.exists() - - -def _path_for_manifest(path: Path, artifact_root: Path) -> str: - try: - return str(path.relative_to(artifact_root)) - except ValueError: - return str(path) - - -def _policyengine_entity_metadata_summary( - artifact_root: Path, - artifacts: Mapping[str, Any], - *, - artifact_key: str = "policyengine_entity_tables", -) -> dict[str, Any]: - declared = artifacts.get(artifact_key) - if declared is None: - return {} - path = Path(str(declared)) - if not path.is_absolute(): - path = artifact_root / path - summary: dict[str, Any] = { - "metadata_path": _path_for_manifest(path, artifact_root), - } - if not path.exists() or not path.is_file(): - return summary - try: - metadata = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return summary - if not isinstance(metadata, Mapping): - return summary - stage = metadata.get("stage") - if stage is not None: - summary["stage"] = stage - tables: dict[str, dict[str, Any]] = {} - for key in ( - "households", - "persons", - "tax_units", - 
"spm_units", - "families", - "marital_units", - ): - table_metadata = metadata.get(key) - if not isinstance(table_metadata, Mapping): - continue - columns = table_metadata.get("columns", ()) - column_names = ( - [str(column) for column in columns] - if isinstance(columns, (list, tuple)) - else [] - ) - tables[key] = { - "rows": table_metadata.get("rows"), - "columns": column_names, - } - if tables: - summary["tables"] = tables - return summary - - -def _diagnostics( - stage_id: str, - manifest: Mapping[str, Any], - *, - stage_summary: Mapping[str, Any] | None = None, -) -> dict[str, USDiagnosticOutput]: - diagnostics = dict(manifest.get("diagnostics", {})) - stage_diagnostics = diagnostics.get(stage_id) - summary = ( - dict(stage_diagnostics) - if isinstance(stage_diagnostics, Mapping) - else dict(stage_summary) - if stage_summary is not None - else _default_stage_diagnostic_summary(stage_id, manifest) - ) - return { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description=f"Saved-run diagnostic summary for {stage_id}.", - summary=summary, - ) - } - - -def _default_stage_diagnostic_summary( - stage_id: str, - manifest: Mapping[str, Any], -) -> dict[str, Any]: - rows = dict(manifest.get("rows", {})) - synthesis = dict(manifest.get("synthesis", {})) - calibration = dict(manifest.get("calibration", {})) - artifacts = dict(manifest.get("artifacts", {})) - if stage_id == "01_run_profile": - return {"has_config": isinstance(manifest.get("config"), Mapping)} - if stage_id == "02_source_loading": - return {"source_names": list(synthesis.get("source_names", ()))} - if stage_id == "03_source_planning": - return {"scaffold_source": synthesis.get("scaffold_source")} - if stage_id == "04_seed_scaffold": - return {"seed_rows": rows.get("seed")} - if stage_id == "05_donor_integration_synthesis": - return { - "seed_rows": rows.get("seed"), - "synthetic_rows": rows.get("synthetic"), - "backend": synthesis.get("backend"), - } - if stage_id == 
"06_policyengine_entities": - return { - "entity_tables": artifacts.get("pre_calibration_policyengine_entity_tables") - } - if stage_id == "07_calibration": - return { - "calibrated_rows": rows.get("calibrated"), - "backend": calibration.get("backend"), - "converged": calibration.get("converged"), - } - if stage_id == "08_dataset_assembly": - return {"dataset": artifacts.get("policyengine_dataset")} - if stage_id == "09_validation_benchmarking": - return _manifest_benchmark_summary(manifest) - return {} - - -def _benchmark_summary( - artifact_root: Path, - manifest: Mapping[str, Any], -) -> tuple[dict[str, Any], bool]: - try: - evidence = build_us_validation_evidence_manifest( - artifact_root, - manifest_payload=dict(manifest), - ) - except (OSError, ValueError, TypeError): - summary = _manifest_benchmark_summary_for_existing_artifacts( - artifact_root, - manifest, - ) - return summary, bool(summary) - summary = _validation_evidence_summary_for_existing_evidence(evidence) - if summary: - return summary, True - summary = _manifest_benchmark_summary_for_existing_artifacts( - artifact_root, - manifest, - ) - return summary, bool(summary) - - -def _manifest_benchmark_summary(manifest: Mapping[str, Any]) -> dict[str, Any]: - summary: dict[str, Any] = {} - for key in ( - "policyengine_harness", - "policyengine_native_scores", - "policyengine_native_audit", - "policyengine_native_target_diagnostics", - "imputation_ablation", - ): - value = manifest.get(key) - if isinstance(value, Mapping): - summary[key] = dict(value) - return summary - - -def _validation_evidence_summary_for_existing_evidence( - evidence: Mapping[str, Any], -) -> dict[str, Any]: - records = evidence.get("evidence") - if not isinstance(records, list): - return {} - existing_keys = { - str(record["key"]) - for record in records - if isinstance(record, Mapping) - and record.get("key") - and record.get("exists") is True - } - summaries = evidence.get("summaries") - if not isinstance(summaries, Mapping): - 
return {} - return { - str(key): item for key, item in summaries.items() if str(key) in existing_keys - } - - -def _manifest_benchmark_summary_for_existing_artifacts( - artifact_root: Path, - manifest: Mapping[str, Any], -) -> dict[str, Any]: - artifacts = dict(manifest.get("artifacts", {})) - return { - key: value - for key, value in _manifest_benchmark_summary(manifest).items() - if _artifact_exists(artifact_root, artifacts, key) - } - - -def _serialize_value(value: Any, artifact_root: str | Path | None) -> Any: - if isinstance(value, USArtifactRef): - return value.to_dict(artifact_root) - if isinstance(value, USAuxiliaryArtifact): - return value.as_artifact_ref().to_dict(artifact_root) - if isinstance(value, USDiagnosticOutput): - return value.to_dict(artifact_root) - if isinstance(value, Path): - return str(value) - if isinstance(value, Enum): - return value.value - if isinstance(value, Mapping): - return { - str(key): _serialize_value(item, artifact_root) - for key, item in value.items() - } - if isinstance(value, tuple): - return [_serialize_value(item, artifact_root) for item in value] - if isinstance(value, list): - return [_serialize_value(item, artifact_root) for item in value] - if is_dataclass(value): - return { - str(key): _serialize_value(item, artifact_root) - for key, item in asdict(value).items() - } - return value - - -def _required_output_is_missing(value: Any, artifact_root: str | Path) -> bool: - if value is None: - return True - if isinstance(value, USArtifactRef): - return not value.exists_under(artifact_root) - if isinstance(value, Mapping): - return not bool(value) - if isinstance(value, (tuple, list, set, frozenset)): - return not bool(value) - if isinstance(value, str): - return not value - return False - - -def _serialized_output_key_is_available( - stage_manifest: Mapping[str, Any], - key: str, -) -> bool: - outputs = stage_manifest.get("outputs") - if not isinstance(outputs, Mapping) or key not in outputs: - return False - value = 
outputs[key] - if value is None: - return False - if isinstance(value, Mapping): - exists = value.get("exists") - if exists is not None: - return bool(exists) - return bool(value) - if isinstance(value, (tuple, list, set, frozenset)): - return bool(value) - if isinstance(value, str): - return bool(value) - return True - - -def _optional_str(value: Any) -> str | None: - if value is None: - return None - return str(value) - - -def _write_json_atomically(path: Path, payload: Mapping[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - temporary = path.with_suffix(path.suffix + ".tmp") - temporary.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temporary.replace(path) - - -__all__ = [ - "USAuxiliaryArtifact", - "USArtifactCategory", - "USArtifactRef", - "USCalibrationOutputs", - "USDatasetAssemblyOutputs", - "USDiagnosticOutput", - "USDonorSynthesisOutputs", - "USPolicyEngineEntityOutputs", - "USRunProfileOutputs", - "USSeedScaffoldOutputs", - "USSourceLoadingOutputs", - "USSourcePlanningOutputs", - "USStageInputOverride", - "USStageInputValidationSettings", - "USStageInputValidator", - "USStageFailureRecord", - "USStageLifecycleStatus", - "USStageOutputManifest", - "USStageRuntimeEventRecord", - "USStageRunWriter", - "USValidationBenchmarkingOutputs", - "build_us_stage_output_manifests_from_artifact_manifest", - "default_us_stage_input_validation_settings", - "parse_us_stage_input_override", - "resolve_us_manifest_or_contract_artifact_path", - "write_us_stage_run_manifests_from_artifact_manifest", -] diff --git a/src/microplex_us/pipelines/stage_runtime.py b/src/microplex_us/pipelines/stage_runtime.py deleted file mode 100644 index dcba5946..00000000 --- a/src/microplex_us/pipelines/stage_runtime.py +++ /dev/null @@ -1,737 +0,0 @@ -"""Live runtime writer for canonical US pipeline stage manifests.""" - -from __future__ import annotations - -import json -import traceback -from collections.abc import Mapping -from dataclasses import fields, 
replace -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Literal - -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - US_STAGE_CONTRACT_VERSION, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest import write_us_stage_manifest -from microplex_us.pipelines.stage_manifest_types import ( - USStageFailureRecord, - USStageLifecycleStatus, - USStageRuntimeEventRecord, -) -from microplex_us.pipelines.stage_run import ( - USArtifactRef, - USDiagnosticOutput, - USStageInputOverride, - USStageOutputManifest, - USStageRunWriter, - _serialize_value, - build_us_stage_output_manifests_from_artifact_manifest, -) - -RuntimeUpdateSection = Literal["outputs", "diagnostics", "metadata"] - - -class USStageRuntimeWriter: - """Write stage manifests incrementally during a canonical US build.""" - - def __init__( - self, - artifact_root: str | Path, - *, - manifest_payload: Mapping[str, Any] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), - ) -> None: - self.artifact_root = Path(artifact_root) - self.manifest_payload: dict[str, Any] = dict(manifest_payload or {}) - self.allow_stage_input_overrides = allow_stage_input_overrides - self.stage_input_overrides = tuple(stage_input_overrides) - self._run_writer = USStageRunWriter( - self.artifact_root, - manifest_payload=self.manifest_payload, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - - @property - def recorded_stages(self) -> tuple[USStageOutputManifest, ...]: - """Return completed typed stage manifests recorded by this writer.""" - - return self._run_writer.recorded_stages - - def start_stage( - self, - stage_id: str, - *, - metadata: Mapping[str, Any] | None = None, - ) -> dict[str, Any]: - """Mark one stage as running after validating its previous stage seam.""" - - self._validate_stage_id(stage_id) - self._validate_start_transition(stage_id) - now = _now() - payload = self._stage_payload(stage_id) - payload["complete"] = False - payload["lifecycleStatus"] = "running" - payload["startedAt"] = payload.get("startedAt") or now - payload["updatedAt"] = now - payload["completedAt"] = None - payload["failedAt"] = None - payload["deferredReason"] = None - payload["failure"] = None - payload["inputOverrides"] = self._serialized_overrides_for_stage(stage_id) - payload["metadata"] = { - **dict(payload.get("metadata", {})), - **dict(metadata or {}), - } - payload["events"] = [ - *list(payload.get("events", [])), - _event("stage_started", now, dict(metadata or {})), - ] - self._write_stage_payload(stage_id, payload) - self._refresh_aggregate() - return payload - - def update( - self, - stage_id: str, - key: str, - value: Any, - *, - section: RuntimeUpdateSection = "outputs", - path: str | Path | None = None, - ) -> dict[str, Any]: - """Update one manifest entry, optionally writing a JSON artifact first.""" - - self._validate_stage_id(stage_id) - if section == "outputs": - 
self._validate_output_key(stage_id, key) - payload = self._stage_payload(stage_id) - written_value = value - if path is not None: - written_value = self._write_update_artifact(stage_id, key, value, path) - bucket = dict(payload.get(section, {})) - bucket[key] = _runtime_serialize(written_value, self.artifact_root) - payload[section] = bucket - now = _now() - payload["updatedAt"] = now - payload["events"] = [ - *list(payload.get("events", [])), - _event("stage_updated", now, {"section": section, "key": key}), - ] - self._write_stage_payload(stage_id, payload) - self._refresh_aggregate() - return payload - - def record_output( - self, - stage_id: str, - key: str, - value: Any, - *, - path: str | Path | None = None, - ) -> dict[str, Any]: - """Record one stage output entry.""" - - return self.update(stage_id, key, value, section="outputs", path=path) - - def record_diagnostic( - self, - stage_id: str, - diagnostic: USDiagnosticOutput, - ) -> dict[str, Any]: - """Record one diagnostic output for a running stage.""" - - return self.update( - stage_id, - diagnostic.key, - diagnostic, - section="diagnostics", - ) - - def complete_stage(self, outputs: USStageOutputManifest) -> dict[str, Any]: - """Validate, record, and write a complete typed stage output manifest.""" - - self._validate_stage_id(outputs.stage_id) - now = _now() - existing = self._stage_payload(outputs.stage_id) - stage_started_at = _optional_str(existing.get("startedAt")) or now - existing_events = tuple( - dict(event) - for event in existing.get("events", ()) - if isinstance(event, dict) - ) - input_stage_manifest = outputs.input_stage_manifest - if input_stage_manifest is None: - input_stage_manifest = self._previous_stage_manifest_ref(outputs.stage_id) - lifecycle_outputs = replace( - outputs, - input_stage_manifest=input_stage_manifest, - lifecycle_status="complete", - started_at=stage_started_at, - updated_at=now, - completed_at=now, - failed_at=None, - deferred_reason=None, - failure=None, - events=( 
- *existing_events, - *tuple(outputs.events), - _event("stage_completed", now), - ), - ) - self._run_writer.manifest_payload = self.manifest_payload - self._run_writer.record_stage(lifecycle_outputs) - payload = lifecycle_outputs.to_dict( - self.artifact_root, - input_stage_manifest=input_stage_manifest, - input_overrides=self._input_overrides_for_stage(outputs.stage_id), - ) - self._write_stage_payload(outputs.stage_id, payload) - if outputs.stage_id == "08_dataset_assembly": - self.manifest_payload = self._run_writer.write_manifest_files() - else: - self._refresh_aggregate() - return payload - - def fail_stage( - self, - stage_id: str, - error: BaseException, - *, - metadata: Mapping[str, Any] | None = None, - ) -> dict[str, Any]: - """Mark one stage as failed and persist the failure details.""" - - self._validate_stage_id(stage_id) - now = _now() - payload = self._stage_payload(stage_id) - failure: USStageFailureRecord = { - "errorType": type(error).__name__, - "message": str(error), - "traceback": "".join( - traceback.format_exception(type(error), error, error.__traceback__) - ), - } - payload["complete"] = False - payload["lifecycleStatus"] = "failed" - payload["updatedAt"] = now - payload["failedAt"] = now - payload["failure"] = failure - payload["metadata"] = { - **dict(payload.get("metadata", {})), - **dict(metadata or {}), - } - payload["events"] = [ - *list(payload.get("events", [])), - _event("stage_failed", now, {"errorType": type(error).__name__}), - ] - self._write_stage_payload(stage_id, payload) - self._refresh_aggregate() - return payload - - def defer_stage( - self, - stage_id: str, - reason: str, - *, - metadata: Mapping[str, Any] | None = None, - ) -> dict[str, Any]: - """Mark one stage as intentionally deferred.""" - - self._validate_stage_id(stage_id) - now = _now() - payload = self._stage_payload(stage_id) - payload["complete"] = False - payload["lifecycleStatus"] = "deferred" - payload["updatedAt"] = now - payload["deferredReason"] = reason 
- payload["metadata"] = { - **dict(payload.get("metadata", {})), - **dict(metadata or {}), - } - payload["events"] = [ - *list(payload.get("events", [])), - _event("stage_deferred", now, {"reason": reason}), - ] - self._write_stage_payload(stage_id, payload) - self._refresh_aggregate() - return payload - - def finalize_from_artifact_manifest( - self, - manifest_payload: Mapping[str, Any], - ) -> dict[str, Any]: - """Finalize typed manifests from a completed saved artifact manifest.""" - - self.manifest_payload = dict(manifest_payload) - self._run_writer = USStageRunWriter( - self.artifact_root, - manifest_payload=self.manifest_payload, - allow_stage_input_overrides=self.allow_stage_input_overrides, - stage_input_overrides=self.stage_input_overrides, - ) - for outputs in build_us_stage_output_manifests_from_artifact_manifest( - self.artifact_root, - self.manifest_payload, - ): - existing = self._stage_payload(outputs.stage_id) - outputs = _rehydrate_outputs_from_stage_payload(outputs, existing) - if ( - _terminal_lifecycle(existing) == "failed" - and not outputs.complete - and not outputs.missing_required_outputs(self.artifact_root) - ): - outputs = replace( - outputs, - complete=True, - lifecycle_status="complete", - deferred_reason=None, - failure=None, - ) - now = _now() - existing_events = tuple( - dict(event) - for event in existing.get("events", ()) - if isinstance(event, dict) - ) - existing_lifecycle = _terminal_lifecycle(existing) - preserve_existing_lifecycle = existing_lifecycle in { - "complete", - "deferred", - } or (existing_lifecycle == "failed" and not outputs.complete) - if preserve_existing_lifecycle: - lifecycle_status = existing_lifecycle - complete = bool(existing.get("complete")) - started_at = _optional_str(existing.get("startedAt")) - updated_at = _optional_str(existing.get("updatedAt")) - completed_at = _optional_str(existing.get("completedAt")) - failed_at = _optional_str(existing.get("failedAt")) - deferred_reason = 
_optional_str(existing.get("deferredReason")) - failure = existing.get("failure") - events = existing_events - else: - lifecycle_status = _final_lifecycle_status(outputs) - complete = outputs.complete - started_at = _optional_str(existing.get("startedAt")) or now - updated_at = now - completed_at = now if lifecycle_status == "complete" else None - failed_at = None - deferred_reason = ( - outputs.deferred_reason if lifecycle_status == "deferred" else None - ) - failure = None - events = ( - *existing_events, - *tuple(outputs.events), - _event(f"stage_{lifecycle_status}", now), - ) - lifecycle_outputs = replace( - outputs, - complete=complete, - input_stage_manifest=outputs.input_stage_manifest - or self._previous_stage_manifest_ref(outputs.stage_id), - lifecycle_status=lifecycle_status, - started_at=started_at, - updated_at=updated_at, - completed_at=completed_at, - failed_at=failed_at, - deferred_reason=deferred_reason, - failure=failure, - events=events, - ) - self._run_writer.record_stage(lifecycle_outputs) - self.manifest_payload = self._run_writer.write_manifest_files() - return self.manifest_payload - - def _stage_payload(self, stage_id: str) -> dict[str, Any]: - path = self._stage_output_manifest_path(stage_id) - if path.exists(): - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - payload = {} - if isinstance(payload, dict): - return _ensure_stage_payload_defaults(stage_id, payload) - return _empty_stage_payload(stage_id) - - def _write_stage_payload(self, stage_id: str, payload: Mapping[str, Any]) -> None: - path = self._stage_output_manifest_path(stage_id) - _write_json_atomically(path, payload) - self._register_stage_output_manifest(stage_id, path) - - def _register_stage_output_manifest(self, stage_id: str, path: Path) -> None: - stage_paths = dict(self.manifest_payload.get("stage_output_manifests", {})) - stage_paths[stage_id] = str(path.relative_to(self.artifact_root)) - 
self.manifest_payload["stage_output_manifests"] = stage_paths - - def _refresh_aggregate(self) -> None: - stage_manifest_path = resolve_us_stage_artifact_contract_path( - self.artifact_root, - "08_dataset_assembly", - "stage_manifest", - ) - artifacts = dict(self.manifest_payload.get("artifacts", {})) - artifacts.setdefault("stage_manifest", stage_manifest_path.name) - artifacts.setdefault("manifest", "manifest.json") - self.manifest_payload["artifacts"] = artifacts - _write_json_atomically( - self.artifact_root / "manifest.json", self.manifest_payload - ) - write_us_stage_manifest( - self.artifact_root, - stage_manifest_path, - manifest_payload=self.manifest_payload, - ) - - def _stage_output_manifest_path(self, stage_id: str) -> Path: - return self.artifact_root / "stage_artifacts" / "manifests" / f"{stage_id}.json" - - def _previous_stage_manifest_ref(self, stage_id: str) -> str | None: - stage_index = US_CANONICAL_STAGE_IDS.index(stage_id) - if stage_index == 0: - return None - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - path = self._stage_output_manifest_path(previous_stage_id) - return str(path.relative_to(self.artifact_root)) if path.exists() else None - - def _validate_start_transition(self, stage_id: str) -> None: - stage_index = US_CANONICAL_STAGE_IDS.index(stage_id) - if stage_index == 0: - return - previous_stage_id = US_CANONICAL_STAGE_IDS[stage_index - 1] - previous_payload = self._stage_payload(previous_stage_id) - if previous_payload.get("lifecycleStatus") == "complete": - self._validate_completed_stage(previous_stage_id, previous_payload) - self._validate_required_start_inputs(stage_id) - return - contract = get_us_pipeline_stage_contract(stage_id) - required_previous_inputs = tuple( - resource - for resource in contract.inputs - if resource.required and resource.stage_id == previous_stage_id - ) - if required_previous_inputs and all( - self._override_satisfies(stage_id, resource.key) - for resource in required_previous_inputs - 
): - self._validate_required_start_inputs(stage_id) - return - raise ValueError( - f"{stage_id} requires {previous_stage_id} to be complete before start, " - "unless explicit stage input overrides are enabled" - ) - - def _validate_required_start_inputs(self, stage_id: str) -> None: - contract = get_us_pipeline_stage_contract(stage_id) - missing_inputs: list[str] = [] - for resource in contract.inputs: - if ( - not resource.required - or resource.stage_id is None - or resource.kind not in {"artifact", "manifest", "stage_output"} - or self._override_satisfies(stage_id, resource.key) - ): - continue - payload = self._stage_payload(resource.stage_id) - if payload.get("lifecycleStatus") != "complete": - missing_inputs.append(f"{resource.stage_id}.{resource.key}") - continue - self._validate_completed_stage(resource.stage_id, payload) - outputs = payload.get("outputs") - if not isinstance(outputs, Mapping) or not _serialized_output_is_available( - outputs.get(resource.key) - ): - missing_inputs.append(f"{resource.stage_id}.{resource.key}") - if missing_inputs: - raise ValueError( - f"{stage_id} is missing required stage input(s) before start: " - f"{', '.join(missing_inputs)}" - ) - - def _validate_completed_stage( - self, - stage_id: str, - payload: Mapping[str, Any], - ) -> None: - if payload.get("contractVersion") != US_STAGE_CONTRACT_VERSION: - raise ValueError( - f"{stage_id} uses stale contract version " - f"{payload.get('contractVersion')!r}; expected " - f"{US_STAGE_CONTRACT_VERSION!r}" - ) - missing = tuple(payload.get("missingRequiredOutputs") or ()) - if missing: - raise ValueError( - f"{stage_id} is complete but missing required outputs: " - f"{', '.join(str(item) for item in missing)}" - ) - outputs = payload.get("outputs") - if not isinstance(outputs, Mapping): - raise ValueError(f"{stage_id} has no serialized outputs") - required_outputs = tuple(payload.get("requiredOutputs") or ()) - for key in required_outputs: - if not 
_serialized_output_is_available(outputs.get(str(key))): - raise ValueError( - f"{stage_id} is complete but required output {key!r} is unavailable" - ) - - def _override_satisfies(self, stage_id: str, key: str) -> bool: - if not self.allow_stage_input_overrides: - return False - return any( - override.stage_id == stage_id and override.key == key - for override in self.stage_input_overrides - ) - - def _serialized_overrides_for_stage(self, stage_id: str) -> list[dict[str, Any]]: - return [ - override.to_dict(self.artifact_root) - for override in self._input_overrides_for_stage(stage_id) - ] - - def _input_overrides_for_stage( - self, - stage_id: str, - ) -> tuple[USStageInputOverride, ...]: - return tuple( - override - for override in self.stage_input_overrides - if override.stage_id == stage_id - ) - - def _validate_output_key(self, stage_id: str, key: str) -> None: - contract = get_us_pipeline_stage_contract(stage_id) - valid_keys = {resource.key for resource in contract.outputs} - valid_keys.update(artifact.key for artifact in contract.artifacts) - if key not in valid_keys: - valid = ", ".join(sorted(valid_keys)) or "none" - raise KeyError(f"Unknown output key {stage_id}.{key}; valid keys: {valid}") - - def _write_update_artifact( - self, - stage_id: str, - key: str, - value: Any, - path: str | Path, - ) -> USArtifactRef: - resolved_path = Path(path) - if not resolved_path.is_absolute(): - resolved_path = self.artifact_root / resolved_path - _write_json_atomically( - resolved_path, _runtime_serialize(value, self.artifact_root) - ) - artifact_contract = get_us_stage_artifact_contract(stage_id, key) - return USArtifactRef( - key=key, - path=resolved_path, - format=artifact_contract.format, - required=artifact_contract.required, - resume_role=artifact_contract.resume_role, - exists=True, - ) - - @staticmethod - def _validate_stage_id(stage_id: str) -> None: - if stage_id not in US_CANONICAL_STAGE_IDS: - raise KeyError(f"Unknown US pipeline stage: {stage_id}") - - 
-def _empty_stage_payload(stage_id: str) -> dict[str, Any]: - contract = get_us_pipeline_stage_contract(stage_id) - return { - "schemaVersion": 2, - "contractVersion": US_STAGE_CONTRACT_VERSION, - "stageId": stage_id, - "complete": False, - "lifecycleStatus": "pending", - "startedAt": None, - "updatedAt": None, - "completedAt": None, - "failedAt": None, - "deferredReason": None, - "failure": None, - "inputStageManifest": None, - "inputOverrides": [], - "requiredOutputs": [ - resource.key for resource in contract.outputs if resource.required - ], - "missingRequiredOutputs": [ - resource.key for resource in contract.outputs if resource.required - ], - "outputs": {}, - "diagnostics": { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description=f"Runtime diagnostic summary for {stage_id}.", - ).to_dict(), - }, - "auxiliaryArtifacts": {}, - "metadata": {}, - "events": [], - } - - -def _ensure_stage_payload_defaults( - stage_id: str, - payload: dict[str, Any], -) -> dict[str, Any]: - defaults = _empty_stage_payload(stage_id) - merged = {**defaults, **payload} - for key in ("outputs", "diagnostics", "auxiliaryArtifacts", "metadata"): - if not isinstance(merged.get(key), dict): - merged[key] = {} - if not isinstance(merged.get("events"), list): - merged["events"] = [] - return merged - - -def _final_lifecycle_status( - outputs: USStageOutputManifest, -) -> USStageLifecycleStatus: - if outputs.resolved_lifecycle_status() == "deferred": - return "deferred" - return "complete" if outputs.complete else "pending" - - -def _rehydrate_outputs_from_stage_payload( - outputs: USStageOutputManifest, - payload: Mapping[str, Any], -) -> USStageOutputManifest: - serialized_outputs = payload.get("outputs") - if not isinstance(serialized_outputs, Mapping): - return outputs - - hydrated: dict[str, Any] = {} - for item in fields(outputs): - name = item.name - if name in { - "schema_version", - "contract_version", - "input_stage_manifest", - "diagnostics", - 
"auxiliary_artifacts", - "metadata", - "complete", - "lifecycle_status", - "started_at", - "updated_at", - "completed_at", - "failed_at", - "deferred_reason", - "failure", - "events", - "stage_id", - }: - continue - if name not in serialized_outputs: - continue - current = getattr(outputs, name) - if not _typed_output_is_missing(current): - continue - value = _deserialize_stage_output_field(serialized_outputs[name]) - if not _typed_output_is_missing(value): - hydrated[name] = value - if not hydrated: - return outputs - return replace(outputs, **hydrated) - - -def _deserialize_stage_output_field(value: Any) -> Any: - if isinstance(value, Mapping): - if "path" in value and "key" in value: - return USArtifactRef( - key=str(value["key"]), - path=str(value["path"]), - format=value.get("format", "unknown"), - required=bool(value.get("required", False)), - category=value.get("category", "required_output"), - resume_role=value.get("resume_role"), - assume_exists=bool(value.get("assume_exists", False)), - exists=( - value.get("exists") - if isinstance(value.get("exists"), bool) - else None - ), - ) - return value - - -def _typed_output_is_missing(value: Any) -> bool: - if value is None: - return True - if isinstance(value, Mapping): - return not bool(value) - if isinstance(value, (tuple, list, set, frozenset)): - return not bool(value) - if isinstance(value, str): - return not value - return False - - -def _terminal_lifecycle( - payload: Mapping[str, Any], -) -> USStageLifecycleStatus | None: - status = payload.get("lifecycleStatus") - if status in {"complete", "failed", "deferred"}: - return status - return None - - -def _runtime_serialize(value: Any, artifact_root: str | Path | None) -> Any: - if isinstance(value, USDiagnosticOutput): - return value.to_dict(artifact_root) - return _serialize_value(value, artifact_root) - - -def _serialized_output_is_available(value: Any) -> bool: - if value is None: - return False - if isinstance(value, Mapping): - exists = 
value.get("exists") - if exists is not None: - return bool(exists) - return bool(value) - if isinstance(value, (list, tuple, set, frozenset)): - return bool(value) - if isinstance(value, str): - return bool(value) - return True - - -def _event( - event: str, - timestamp: str, - details: Mapping[str, Any] | None = None, -) -> USStageRuntimeEventRecord: - return { - "event": event, - "timestamp": timestamp, - "details": dict(details or {}), - } - - -def _now() -> str: - return datetime.now(UTC).isoformat() - - -def _optional_str(value: Any) -> str | None: - return str(value) if value is not None else None - - -def _write_json_atomically(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - temporary = path.with_suffix(path.suffix + ".tmp") - temporary.write_text(json.dumps(payload, indent=2, sort_keys=True)) - temporary.replace(path) - - -__all__ = [ - "RuntimeUpdateSection", - "USStageRuntimeWriter", -] diff --git a/src/microplex_us/pipelines/stage_status.py b/src/microplex_us/pipelines/stage_status.py deleted file mode 100644 index 705af1ed..00000000 --- a/src/microplex_us/pipelines/stage_status.py +++ /dev/null @@ -1,275 +0,0 @@ -"""Saved-run status classification for US pipeline stage manifests.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_manifest_types import ( - USStageArtifactRecord, - USStageStatus, -) - - -def stage_status( - stage_id: str, - *, - artifact_root: Path, - manifest: dict[str, Any], - artifacts: list[USStageArtifactRecord], - assume_existing_artifact_keys: set[str], -) -> USStageStatus: - """Return the saved-run status for one canonical stage.""" - - artifact_map = dict(manifest.get("artifacts", {})) - synthesis = dict(manifest.get("synthesis", {})) - calibration = dict(manifest.get("calibration", {})) - rows = dict(manifest.get("rows", {})) - if stage_id == "01_run_profile": - if artifact_missing(artifacts, 
required_only=True): - return "incomplete" - if artifact_exists(artifacts, "manifest"): - return "ready" - return "metadata_only" if manifest.get("config") else "missing" - if stage_id == "02_source_loading": - return "metadata_only" if synthesis.get("source_names") else "missing" - if stage_id == "03_source_planning": - if artifact_missing(artifacts): - return "incomplete" - if artifact_exists(artifacts, "source_plan"): - return "ready" - return "metadata_only" if synthesis.get("scaffold_source") else "missing" - if stage_id == "04_seed_scaffold": - if artifact_missing(artifacts, required_only=True): - return "incomplete" - if required_artifacts_exist(artifacts): - return "ready" - return ( - "metadata_only" - if rows.get("seed") or synthesis.get("scaffold_source") - else "missing" - ) - if stage_id == "05_donor_integration_synthesis": - if artifact_missing(artifacts, required_only=True): - return "incomplete" - if required_artifacts_exist(artifacts): - return "ready" - return ( - "metadata_only" if rows.get("seed") or rows.get("synthetic") else "missing" - ) - if stage_id == "06_policyengine_entities": - if artifact_missing(artifacts): - return "incomplete" - if artifact_exists(artifacts, "pre_calibration_policyengine_entity_tables"): - return "ready" - if manifest_artifact_exists( - manifest, - artifact_root, - "policyengine_dataset", - assume_existing_artifact_keys=assume_existing_artifact_keys, - ): - return "metadata_only" - return "missing" - if stage_id == "07_calibration": - if artifact_missing(artifacts, required_only=True): - return "incomplete" - if calibration and required_artifacts_exist(artifacts): - return "ready" - return "metadata_only" if calibration and rows.get("calibrated") else "missing" - if stage_id == "08_dataset_assembly": - if artifact_missing(artifacts, required_only=True): - return "incomplete" - if manifest_artifact_exists( - manifest, - artifact_root, - "policyengine_dataset", - 
assume_existing_artifact_keys=assume_existing_artifact_keys, - ): - return "ready" - return "metadata_only" if artifact_map.get("stage_manifest") else "missing" - if stage_id == "09_validation_benchmarking": - evidence_keys = ( - "policyengine_harness", - "policyengine_native_scores", - "policyengine_native_audit", - "policyengine_native_target_diagnostics", - "imputation_ablation", - ) - evidence_index_keys = ("validation_evidence",) - if manifest_artifact_missing( - manifest, - artifact_root, - (*evidence_keys, *evidence_index_keys), - assume_existing_artifact_keys=assume_existing_artifact_keys, - ): - return "incomplete" - has_evidence = any( - manifest_artifact_exists( - manifest, - artifact_root, - key, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - for key in evidence_keys - ) - if not has_evidence: - has_evidence = validation_evidence_index_has_existing_evidence( - manifest, - artifact_root, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - if has_evidence: - if not manifest_artifact_exists( - manifest, - artifact_root, - "validation_evidence", - assume_existing_artifact_keys=assume_existing_artifact_keys, - ): - return "incomplete" - return "ready" - if manifest_artifact_exists( - manifest, - artifact_root, - "policyengine_dataset", - assume_existing_artifact_keys=assume_existing_artifact_keys, - ): - return "deferred" - return "missing" - if any(artifact.get("exists") for artifact in artifacts): - return "ready" - return "missing" - - -def required_artifacts_exist(artifacts: list[USStageArtifactRecord]) -> bool: - """Return whether all required artifacts exist.""" - - required = [artifact for artifact in artifacts if bool(artifact.get("required"))] - return bool(required) and all(bool(artifact.get("exists")) for artifact in required) - - -def artifact_exists(artifacts: list[USStageArtifactRecord], key: str) -> bool: - """Return whether a stage artifact record exists.""" - - return any( - artifact.get("key") == key 
and bool(artifact.get("exists")) - for artifact in artifacts - ) - - -def artifact_missing( - artifacts: list[USStageArtifactRecord], - *, - required_only: bool = False, -) -> bool: - """Return whether required or referenced stage artifacts are missing.""" - - return any( - not bool(artifact.get("exists")) - and ( - bool(artifact.get("required")) - or (not required_only and bool(artifact.get("referenced"))) - ) - for artifact in artifacts - ) - - -def manifest_artifact_exists( - manifest: dict[str, Any], - artifact_root: Path, - artifact_key: str, - *, - assume_existing_artifact_keys: set[str], -) -> bool: - """Return whether a top-level manifest artifact exists.""" - - path = manifest_artifact_path(manifest, artifact_root, artifact_key) - if path is None: - return False - if artifact_key in assume_existing_artifact_keys: - return True - return path.exists() - - -def manifest_artifact_missing( - manifest: dict[str, Any], - artifact_root: Path, - artifact_keys: tuple[str, ...], - *, - assume_existing_artifact_keys: set[str], -) -> bool: - """Return whether any referenced top-level manifest artifact is missing.""" - - artifacts = dict(manifest.get("artifacts", {})) - return any( - bool(artifacts.get(key)) - and not manifest_artifact_exists( - manifest, - artifact_root, - key, - assume_existing_artifact_keys=assume_existing_artifact_keys, - ) - for key in artifact_keys - ) - - -def validation_evidence_index_has_existing_evidence( - manifest: dict[str, Any], - artifact_root: Path, - *, - assume_existing_artifact_keys: set[str], -) -> bool: - """Return whether a validation evidence index points to existing evidence.""" - - path = manifest_artifact_path(manifest, artifact_root, "validation_evidence") - if path is None: - return False - if "validation_evidence" in assume_existing_artifact_keys and not path.exists(): - return False - if not path.exists(): - return False - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return False 
- evidence = payload.get("evidence") - if not isinstance(evidence, list): - return False - for record in evidence: - if not isinstance(record, dict) or not record.get("path"): - continue - evidence_path = Path(str(record["path"])) - if not evidence_path.is_absolute(): - evidence_path = artifact_root / evidence_path - if evidence_path.exists(): - return True - return False - - -def manifest_artifact_path( - manifest: dict[str, Any], - artifact_root: Path, - artifact_key: str, -) -> Path | None: - """Return the resolved path for a top-level manifest artifact.""" - - artifacts = dict(manifest.get("artifacts", {})) - filename = artifacts.get(artifact_key) - if not filename: - return None - path = Path(str(filename)) - if not path.is_absolute(): - path = artifact_root / path - return path - - -__all__ = [ - "artifact_exists", - "artifact_missing", - "manifest_artifact_exists", - "manifest_artifact_missing", - "manifest_artifact_path", - "required_artifacts_exist", - "stage_status", - "validation_evidence_index_has_existing_evidence", -] diff --git a/src/microplex_us/pipelines/stage_validation_evidence.py b/src/microplex_us/pipelines/stage_validation_evidence.py deleted file mode 100644 index 195f2875..00000000 --- a/src/microplex_us/pipelines/stage_validation_evidence.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Validation and benchmarking evidence manifests for US saved runs.""" - -from __future__ import annotations - -import json -from collections.abc import Mapping -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_manifest_io import write_json_atomically -from microplex_us.pipelines.stage_manifest_types import ( - US_VALIDATION_STAGE_ID, - USValidationEvidenceManifest, - USValidationEvidenceRecord, -) - - -def build_us_validation_evidence_manifest( - artifact_dir: str | Path, - *, - manifest_payload: dict[str, Any], -) -> USValidationEvidenceManifest: - """Build a compact Stage 9 evidence index from a saved artifact manifest.""" - - 
artifact_root = Path(artifact_dir) - artifacts = dict(manifest_payload.get("artifacts", {})) - existing = _load_existing_validation_evidence_manifest(artifact_root, artifacts) - evidence_keys = ( - "policyengine_harness", - "policyengine_native_scores", - "policyengine_native_audit", - "policyengine_native_target_diagnostics", - "imputation_ablation", - "child_tax_unit_agi_drift", - ) - evidence_by_key: dict[str, USValidationEvidenceRecord] = {} - if existing is not None: - for record in existing.get("evidence", ()): - if not isinstance(record, Mapping) or not record.get("key"): - continue - key = str(record["key"]) - evidence_by_key[key] = _validation_evidence_record( - artifact_root, - key, - record.get("path"), - ) - for key in evidence_keys: - filename = artifacts.get(key) - if not filename: - continue - evidence_by_key[key] = _validation_evidence_record( - artifact_root, - key, - filename, - ) - summaries: dict[str, Any] = {} - if existing is not None and isinstance(existing.get("summaries"), Mapping): - summaries.update(dict(existing["summaries"])) - summaries.update( - { - key: manifest_payload[key] - for key in ( - "policyengine_harness", - "policyengine_native_scores", - "policyengine_native_audit", - "imputation_ablation", - ) - if isinstance(manifest_payload.get(key), dict) - } - ) - return { - "formatVersion": 1, - "stageId": US_VALIDATION_STAGE_ID, - "evidence": list(evidence_by_key.values()), - "summaries": summaries, - } - - -def write_us_validation_evidence_manifest( - artifact_dir: str | Path, - output_path: str | Path, - *, - manifest_payload: dict[str, Any], -) -> Path: - """Write a Stage 9 evidence manifest for validation/benchmark sidecars.""" - - destination = Path(output_path) - destination.parent.mkdir(parents=True, exist_ok=True) - write_json_atomically( - destination, - build_us_validation_evidence_manifest( - artifact_dir, - manifest_payload=manifest_payload, - ), - ) - return destination - - -def 
_load_existing_validation_evidence_manifest( - artifact_root: Path, - artifacts: Mapping[str, Any], -) -> Mapping[str, Any] | None: - evidence_name = artifacts.get("validation_evidence") - if not evidence_name: - return None - path = Path(str(evidence_name)) - if not path.is_absolute(): - path = artifact_root / path - if not path.exists(): - return None - try: - payload = json.loads(path.read_text()) - except (OSError, json.JSONDecodeError): - return None - return payload if isinstance(payload, Mapping) else None - - -def _validation_evidence_record( - artifact_root: Path, - key: str, - path_value: Any, -) -> USValidationEvidenceRecord: - path_text = str(path_value) if path_value else "" - path = Path(path_text) - if path_text and not path.is_absolute(): - path = artifact_root / path - return { - "key": key, - "path": path_text, - "exists": bool(path_text) and path.exists(), - } - - -__all__ = [ - "build_us_validation_evidence_manifest", - "write_us_validation_evidence_manifest", -] diff --git a/src/microplex_us/pipelines/summarize_child_tax_unit_agi_drift.py b/src/microplex_us/pipelines/summarize_child_tax_unit_agi_drift.py deleted file mode 100644 index fcb98dba..00000000 --- a/src/microplex_us/pipelines/summarize_child_tax_unit_agi_drift.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Summarize child-linked AGI component drift across artifact stages.""" - -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any, Iterable - -import pandas as pd - -DEFAULT_VARIABLES = ( - "total_person_income", - "income", - "employment_income", - "wage_income", - "self_employment_income", - "gross_social_security", - "ssi", - "public_assistance", - "pension_income", - "taxable_interest_income", - "tax_exempt_interest_income", - "taxable_pension_income", - "dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "rental_income", - "partnership_s_corp_income", -) - -DEFAULT_STAGE_FILES = { - "seed": 
"seed_data.parquet", - "calibrated": "calibrated_data.parquet", - "synthetic": "synthetic_data.parquet", -} - - -def _resolve_artifact_dir(path: str | Path) -> Path: - candidate = Path(path).expanduser().resolve() - if candidate.is_dir(): - if (candidate / "manifest.json").exists(): - return candidate - for filename in DEFAULT_STAGE_FILES.values(): - if (candidate / filename).exists(): - return candidate - manifest = next(candidate.glob("**/manifest.json"), None) - if manifest is None: - raise FileNotFoundError(f"No manifest.json found under {candidate}") - return manifest.parent - if candidate.name == "manifest.json": - return candidate.parent - raise FileNotFoundError(f"Expected an artifact directory or manifest.json, got {candidate}") - - -def _summarize_variable(frame: pd.DataFrame, variable: str) -> dict[str, float]: - series = pd.to_numeric( - frame.get(variable, pd.Series(0.0, index=frame.index)), - errors="coerce", - ).fillna(0.0) - count = int(series.shape[0]) - if count == 0: - return {"count": 0, "sum": 0.0, "mean": 0.0, "nonzero_share": 0.0} - nonzero = (series != 0).sum() - return { - "count": count, - "sum": float(series.sum()), - "mean": float(series.mean()), - "nonzero_share": float(nonzero / count), - } - - -def _summarize_frame(frame: pd.DataFrame, variables: Iterable[str]) -> dict[str, Any]: - age = pd.to_numeric(frame.get("age", pd.Series([], dtype=float)), errors="coerce") - if "is_tax_unit_dependent" in frame.columns: - is_dependent = pd.to_numeric(frame["is_tax_unit_dependent"], errors="coerce") - else: - is_dependent = pd.to_numeric( - frame.get("is_dependent", pd.Series([], dtype=float)), errors="coerce" - ) - subsets = { - "all": frame.index, - "under_20": frame.index[age.fillna(-1) < 20], - "dependents_under_20": frame.index[ - (age.fillna(-1) < 20) & (is_dependent.fillna(0) > 0) - ], - "adults": frame.index[age.fillna(-1) >= 20], - } - result: dict[str, Any] = { - "row_count": int(frame.shape[0]), - "subsets": {}, - "tax_unit_subsets": 
{}, - } - for subset_name, index in subsets.items(): - subset = frame.loc[index] - result["subsets"][subset_name] = { - variable: _summarize_variable(subset, variable) - for variable in variables - } - if "tax_unit_id" in frame.columns: - tax_unit_ids = frame["tax_unit_id"].astype(str) - tax_unit_flags = pd.DataFrame( - { - "tax_unit_id": tax_unit_ids, - "has_child": age.fillna(-1).lt(20).groupby(tax_unit_ids).transform("max"), - } - ) - available_vars = [var for var in variables if var in frame.columns] - tax_unit_agg = frame.loc[:, ["tax_unit_id", *available_vars]].copy() - tax_unit_agg = tax_unit_agg.groupby("tax_unit_id").sum(numeric_only=True) - tax_unit_flags = ( - tax_unit_flags.drop_duplicates("tax_unit_id") - .set_index("tax_unit_id") - .reindex(tax_unit_agg.index) - ) - tax_unit_agg["has_child"] = tax_unit_flags["has_child"].fillna(0).astype(float) - tax_subsets = { - "all": tax_unit_agg.index, - "with_children": tax_unit_agg.index[tax_unit_agg["has_child"] > 0], - "without_children": tax_unit_agg.index[tax_unit_agg["has_child"] == 0], - } - result["tax_unit_row_count"] = int(tax_unit_agg.shape[0]) - for subset_name, index in tax_subsets.items(): - subset = tax_unit_agg.loc[index] - result["tax_unit_subsets"][subset_name] = { - variable: _summarize_variable(subset, variable) - for variable in variables - } - return result - - -def summarize_child_tax_unit_agi_drift( - artifact_path: str | Path, - *, - variables: Iterable[str] = DEFAULT_VARIABLES, - stage_files: dict[str, str] = DEFAULT_STAGE_FILES, -) -> dict[str, Any]: - """Summarize child-linked AGI component drift for one artifact bundle.""" - artifact_dir = _resolve_artifact_dir(artifact_path) - payload: dict[str, Any] = { - "artifact_path": str(artifact_dir), - "variables": list(variables), - "stages": {}, - } - for stage, filename in stage_files.items(): - file_path = artifact_dir / filename - if not file_path.exists(): - continue - frame = pd.read_parquet(file_path) - payload["stages"][stage] = 
_summarize_frame(frame, variables) - return payload - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Summarize child-linked AGI component drift for one artifact." - ) - parser.add_argument("artifact", help="Artifact directory or manifest.json path.") - parser.add_argument("--out", help="Optional JSON output path.") - parser.add_argument( - "--variables", - nargs="+", - default=list(DEFAULT_VARIABLES), - help="Variables to summarize.", - ) - args = parser.parse_args(argv) - - payload = summarize_child_tax_unit_agi_drift( - args.artifact, - variables=tuple(args.variables), - ) - output = json.dumps(payload, indent=2, sort_keys=True) - if args.out: - Path(args.out).expanduser().write_text(output) - else: - print(output) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/summarize_donor_conditioning.py b/src/microplex_us/pipelines/summarize_donor_conditioning.py deleted file mode 100644 index 045ae2cf..00000000 --- a/src/microplex_us/pipelines/summarize_donor_conditioning.py +++ /dev/null @@ -1,175 +0,0 @@ -"""Summarize donor-conditioning diagnostics recorded in artifact manifests.""" - -from __future__ import annotations - -import argparse -import json -from collections import Counter -from collections.abc import Iterable -from pathlib import Path -from typing import Any - - -def _resolve_artifact_dir(path: str | Path) -> Path: - candidate = Path(path).expanduser().resolve() - if candidate.is_dir(): - if (candidate / "manifest.json").exists(): - return candidate - manifest = next(candidate.glob("**/manifest.json"), None) - if manifest is None: - raise FileNotFoundError(f"No manifest.json found under {candidate}") - return manifest.parent - if candidate.name == "manifest.json": - return candidate.parent - raise FileNotFoundError(f"Expected an artifact directory or manifest.json, got {candidate}") - - -def summarize_donor_conditioning( - artifact_path: str | 
Path, - *, - focus_variables: Iterable[str] | None = None, -) -> dict[str, Any]: - """Summarize donor-conditioning diagnostics for one artifact bundle.""" - artifact_dir = _resolve_artifact_dir(artifact_path) - manifest = json.loads((artifact_dir / "manifest.json").read_text()) - diagnostics = list( - manifest.get("synthesis", {}).get("donor_conditioning_diagnostics", []) - ) - focus = set(focus_variables or ()) - if focus: - diagnostics = [ - entry - for entry in diagnostics - if focus - & { - *entry.get("model_variables", []), - *entry.get("restored_variables", []), - } - ] - - selected_counter: Counter[str] = Counter() - dropped_counter: Counter[str] = Counter() - raw_supplemental_reason_counter: Counter[str] = Counter() - supplemental_reason_counter: Counter[str] = Counter() - raw_challenger_reason_counter: Counter[str] = Counter() - challenger_reason_counter: Counter[str] = Counter() - block_summaries: list[dict[str, Any]] = [] - for entry in diagnostics: - selected = list(entry.get("selected_condition_vars", [])) - dropped = list(entry.get("dropped_shared_vars", [])) - raw_supplemental_status = list( - entry.get("raw_supplemental_shared_condition_var_status", []) - ) - supplemental_status = list( - entry.get("supplemental_shared_condition_var_status", []) - ) - raw_challenger_status = list( - entry.get("raw_challenger_shared_condition_var_status", []) - ) - challenger_status = list( - entry.get("challenger_shared_condition_var_status", []) - ) - selected_counter.update(selected) - dropped_counter.update(dropped) - raw_supplemental_reason_counter.update( - status.get("reason") - for status in raw_supplemental_status - if status.get("reason") is not None - ) - supplemental_reason_counter.update( - status.get("reason") - for status in supplemental_status - if status.get("reason") is not None - ) - raw_challenger_reason_counter.update( - status.get("reason") - for status in raw_challenger_status - if status.get("reason") is not None - ) - 
challenger_reason_counter.update( - status.get("reason") - for status in challenger_status - if status.get("reason") is not None - ) - block_summaries.append( - { - "donor_source": entry.get("donor_source"), - "model_variables": list(entry.get("model_variables", [])), - "restored_variables": list(entry.get("restored_variables", [])), - "condition_selection": entry.get("condition_selection"), - "used_condition_surface": bool( - entry.get("used_condition_surface", False) - ), - "raw_shared_vars": list(entry.get("raw_shared_vars", [])), - "shared_vars_after_model_exclusion": list( - entry.get("shared_vars_after_model_exclusion", []) - ), - "projection_applied": bool(entry.get("projection_applied", False)), - "entity_compatible_shared_vars": list( - entry.get("entity_compatible_shared_vars", []) - ), - "requested_supplemental_shared_condition_vars": list( - entry.get("requested_supplemental_shared_condition_vars", []) - ), - "requested_challenger_shared_condition_vars": list( - entry.get("requested_challenger_shared_condition_vars", []) - ), - "raw_supplemental_shared_condition_var_status": raw_supplemental_status, - "raw_challenger_shared_condition_var_status": raw_challenger_status, - "supplemental_shared_condition_var_status": supplemental_status, - "challenger_shared_condition_var_status": challenger_status, - "selected_condition_vars": selected, - "dropped_shared_vars": dropped, - } - ) - - return { - "artifact_path": str(artifact_dir), - "block_count": len(block_summaries), - "focus_variables": sorted(focus), - "selected_condition_var_frequency": dict(sorted(selected_counter.items())), - "dropped_shared_var_frequency": dict(sorted(dropped_counter.items())), - "raw_supplemental_shared_condition_reason_frequency": dict( - sorted(raw_supplemental_reason_counter.items()) - ), - "raw_challenger_shared_condition_reason_frequency": dict( - sorted(raw_challenger_reason_counter.items()) - ), - "supplemental_shared_condition_reason_frequency": dict( - 
sorted(supplemental_reason_counter.items()) - ), - "challenger_shared_condition_reason_frequency": dict( - sorted(challenger_reason_counter.items()) - ), - "blocks": block_summaries, - } - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - description="Summarize donor-conditioning diagnostics for one artifact." - ) - parser.add_argument("artifact", help="Artifact directory or manifest.json path.") - parser.add_argument("--out", help="Optional JSON output path.") - parser.add_argument( - "--variables", - nargs="+", - default=None, - help="Optional model/restored variables to focus on.", - ) - args = parser.parse_args(argv) - - payload = summarize_donor_conditioning( - args.artifact, - focus_variables=tuple(args.variables) if args.variables else None, - ) - output = json.dumps(payload, indent=2, sort_keys=True) - if args.out: - Path(args.out).expanduser().write_text(output) - else: - print(output) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/summarize_pe_native_family_drilldown.py b/src/microplex_us/pipelines/summarize_pe_native_family_drilldown.py deleted file mode 100644 index e398f9d4..00000000 --- a/src/microplex_us/pipelines/summarize_pe_native_family_drilldown.py +++ /dev/null @@ -1,308 +0,0 @@ -"""Summarize one PE-native regression family across saved native-audit sidecars.""" - -from __future__ import annotations - -import argparse -import json -from collections import Counter, defaultdict -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) - - -def classify_pe_native_target_family(target_name: str) -> str: - """Classify one PE target name into the broad-loss family buckets.""" - - parts = target_name.split("/") - if target_name.startswith("state/census/age/"): - return "state_age_distribution" - if 
target_name.startswith("state/census/population_by_state/"): - return "state_population" - if target_name.startswith("state/census/population_under_5_by_state/"): - return "state_population_under_5" - if target_name.startswith("nation/irs/aca_spending/"): - return "state_aca_spending" - if target_name.startswith("state/irs/aca_enrollment/"): - return "state_aca_enrollment" - if target_name.startswith("irs/medicaid_enrollment/"): - return "state_medicaid_enrollment" - if target_name.endswith("/snap-cost"): - return "state_snap_cost" - if target_name.endswith("/snap-hhs"): - return "state_snap_households" - if target_name.startswith("state/real_estate_taxes/"): - return "state_real_estate_taxes" - if len(parts) >= 3 and parts[0] == "state" and parts[2] == "adjusted_gross_income": - return "state_agi_distribution" - if target_name.startswith("nation/jct/"): - return "national_tax_expenditures" - if target_name.startswith("nation/net_worth/"): - return "national_net_worth" - if target_name.startswith("nation/ssa/"): - return "national_ssa" - if target_name.startswith("nation/census/population_by_age/"): - return "national_population_by_age" - if target_name == "nation/census/infants": - return "national_infants" - if target_name.startswith("nation/census/agi_in_spm_threshold_decile_"): - return "national_spm_threshold_agi" - if target_name.startswith("nation/census/count_in_spm_threshold_decile_"): - return "national_spm_threshold_count" - if target_name.startswith("nation/census/"): - return "national_census_other" - if target_name.startswith("nation/irs/"): - return "national_irs_other" - return "other" - - -def summarize_us_pe_native_family_drilldown( - artifact_roots: list[str | Path] | tuple[str | Path, ...], - *, - family: str, - top_k: int = 10, -) -> dict[str, Any]: - """Summarize one regression family across saved native-audit sidecars.""" - - normalized_roots = [Path(root) for root in artifact_roots] - matching_target_counts: Counter[str] = Counter() - 
matching_target_delta_sum: defaultdict[str, float] = defaultdict(float) - lead_target_counts: Counter[str] = Counter() - lead_target_delta_sum: defaultdict[str, float] = defaultdict(float) - filing_gap_rows: defaultdict[str, list[float]] = defaultdict(list) - mfs_gap_rows: defaultdict[str, list[float]] = defaultdict(list) - matching_audits: list[dict[str, Any]] = [] - lead_audits: list[dict[str, Any]] = [] - total_audits = 0 - - for artifact_root in normalized_roots: - root_key = artifact_root.name - for bundle_dir in _iter_native_audit_bundle_dirs(artifact_root): - total_audits += 1 - payload = json.loads( - resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_audit", - ).read_text() - ) - verdict_hints = dict(payload.get("verdictHints", {})) - support_summary = dict(payload.get("supportAuditSummary", {})) - matching_targets = [ - row - for row in list(payload.get("topTargetRegressions", ())) - if classify_pe_native_target_family(str(row.get("target_name", ""))) == family - ] - if not matching_targets: - continue - - largest_regressing_family = verdict_hints.get("largestRegressingFamily") - is_lead_audit = largest_regressing_family == family - audit_row = { - "artifactRoot": root_key, - "artifactPath": str(bundle_dir.relative_to(artifact_root)), - "largestRegressingFamily": largest_regressing_family, - "largestRegressingTarget": verdict_hints.get("largestRegressingTarget"), - "matchingTargets": [ - { - "target": row.get("target_name"), - "weightedTermDelta": float(row.get("weighted_term_delta", 0.0)), - } - for row in matching_targets[:top_k] - ], - } - matching_audits.append(audit_row) - - for row in matching_targets: - target_name = str(row.get("target_name")) - matching_target_counts[target_name] += 1 - matching_target_delta_sum[target_name] += float( - row.get("weighted_term_delta", 0.0) - ) - if is_lead_audit: - lead_target_counts[target_name] += 1 - lead_target_delta_sum[target_name] += float( - 
row.get("weighted_term_delta", 0.0) - ) - - if not is_lead_audit: - continue - - lead_audits.append( - { - **audit_row, - "topFilingStatusGaps": [ - { - "filingStatus": row.get("filing_status"), - "weightedCountDelta": float( - row.get("weighted_count_delta", 0.0) - ), - } - for row in list( - support_summary.get("topFilingStatusGaps", ()) - )[:top_k] - ], - "topMFSAgiGaps": [ - { - "agiBin": row.get("agi_bin"), - "weightedCountDelta": float( - row.get("weighted_count_delta", 0.0) - ), - } - for row in list(support_summary.get("topMFSAgiGaps", ()))[:top_k] - ], - } - ) - - for row in list(support_summary.get("topFilingStatusGaps", ())): - status = str(row.get("filing_status")) - filing_gap_rows[status].append(float(row.get("weighted_count_delta", 0.0))) - for row in list(support_summary.get("topMFSAgiGaps", ())): - agi_bin = str(row.get("agi_bin")) - mfs_gap_rows[agi_bin].append(float(row.get("weighted_count_delta", 0.0))) - - matching_audits.sort( - key=lambda row: ( - row["largestRegressingFamily"] != family, - row["artifactRoot"], - row["artifactPath"], - ) - ) - lead_audits.sort(key=lambda row: (row["artifactRoot"], row["artifactPath"])) - - return { - "artifactRoots": [str(root) for root in normalized_roots], - "family": family, - "totalAudits": total_audits, - "auditsWithMatchingTargets": len(matching_audits), - "auditsWhereFamilyLeads": len(lead_audits), - "matchingTargetCounts": _build_target_rows( - matching_target_counts, - matching_target_delta_sum, - )[:top_k], - "leadTargetCounts": _build_target_rows( - lead_target_counts, - lead_target_delta_sum, - )[:top_k], - "leadFilingStatusGapSummary": _build_gap_rows( - filing_gap_rows, - gap_key="filingStatus", - )[:top_k], - "leadMFSAgiGapSummary": _build_gap_rows( - mfs_gap_rows, - gap_key="agiBin", - )[:top_k], - "matchingAudits": matching_audits[:top_k], - "leadAudits": lead_audits[:top_k], - } - - -def _iter_native_audit_bundle_dirs(artifact_root: Path) -> tuple[Path, ...]: - audit_hint = 
get_us_stage_artifact_contract( - "09_validation_benchmarking", - "policyengine_native_audit", - ).path_hint - dataset_hint = get_us_stage_artifact_contract( - "08_dataset_assembly", - "policyengine_dataset", - ).path_hint - if audit_hint is None or dataset_hint is None: - return () - return tuple( - sorted( - path.parent - for path in artifact_root.rglob(audit_hint) - if (path.parent / dataset_hint).exists() - ) - ) - - -def _build_target_rows( - counts: Counter[str], - delta_sum: dict[str, float], -) -> list[dict[str, Any]]: - rows = [] - for target, count in counts.items(): - total_delta = float(delta_sum[target]) - rows.append( - { - "target": target, - "count": int(count), - "weightedTermDeltaSum": total_delta, - "weightedTermDeltaMean": total_delta / float(count), - } - ) - rows.sort( - key=lambda row: ( - -int(row["count"]), - -float(row["weightedTermDeltaSum"]), - str(row["target"]), - ) - ) - return rows - - -def _build_gap_rows( - values_by_key: dict[str, list[float]], - *, - gap_key: str, -) -> list[dict[str, Any]]: - rows = [] - for key, values in values_by_key.items(): - if not values: - continue - rows.append( - { - gap_key: key, - "count": len(values), - "positiveCount": sum(1 for value in values if value > 0.0), - "negativeCount": sum(1 for value in values if value < 0.0), - "weightedCountDeltaSum": float(sum(values)), - "meanAbsWeightedCountDelta": float( - sum(abs(value) for value in values) / float(len(values)) - ), - } - ) - rows.sort( - key=lambda row: ( - -float(row["meanAbsWeightedCountDelta"]), - str(row[gap_key]), - ) - ) - return rows - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for one-family native-audit drilldowns.""" - - parser = argparse.ArgumentParser( - description="Summarize one PE-native regression family across saved native audits.", - ) - parser.add_argument("family", help="Broad-loss family to summarize.") - parser.add_argument( - "artifact_roots", - nargs="+", - help="One or more artifact roots to 
scan.", - ) - parser.add_argument( - "--top-k", - type=int, - default=10, - help="Number of rows to keep for each ranked section.", - ) - args = parser.parse_args(argv) - - payload = summarize_us_pe_native_family_drilldown( - args.artifact_roots, - family=args.family, - top_k=args.top_k, - ) - print(json.dumps(payload, indent=2, sort_keys=True)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/summarize_pe_native_regressions.py b/src/microplex_us/pipelines/summarize_pe_native_regressions.py deleted file mode 100644 index 9ce81175..00000000 --- a/src/microplex_us/pipelines/summarize_pe_native_regressions.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Summarize recurring PE-native regression families across saved US artifacts.""" - -from __future__ import annotations - -import argparse -import json -from collections import Counter, defaultdict -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) - - -def _sorted_counter_items(counter: Counter[str]) -> list[tuple[str, int]]: - return sorted(counter.items(), key=lambda item: (-int(item[1]), item[0])) - - -def summarize_us_pe_native_regressions( - artifact_roots: list[str | Path] | tuple[str | Path, ...], - *, - top_k: int = 10, -) -> dict[str, Any]: - """Summarize recurring PE-native regression families from saved artifacts.""" - - rows: list[dict[str, Any]] = [] - largest_family_counts: Counter[str] = Counter() - top3_family_counts: Counter[str] = Counter() - family_rank_counts: dict[str, Counter[int]] = defaultdict(Counter) - family_counts_by_root: dict[str, Counter[str]] = defaultdict(Counter) - target_counts_from_audits: Counter[str] = Counter() - missing_critical_inputs_counts: Counter[str] = Counter() - - normalized_roots = [Path(root) for root in artifact_roots] - for artifact_root in normalized_roots: - root_key = 
artifact_root.name - for bundle_dir in _iter_scored_bundle_dirs(artifact_root): - scores_payload = json.loads( - resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_scores", - ).read_text() - ) - summary = dict(scores_payload.get("summary", {})) - positive_families = [ - row - for row in list(scores_payload.get("family_breakdown", ())) - if float(row.get("loss_contribution_delta", 0.0)) > 0.0 - ] - positive_families.sort( - key=lambda row: float(row.get("loss_contribution_delta", 0.0)), - reverse=True, - ) - - largest_family = positive_families[0] if positive_families else {} - top3_families = [row.get("family") for row in positive_families[:3]] - - audit_path = resolve_us_stage_artifact_contract_path( - bundle_dir, - "09_validation_benchmarking", - "policyengine_native_audit", - ) - audit_payload = json.loads(audit_path.read_text()) if audit_path.exists() else None - verdict_hints = dict((audit_payload or {}).get("verdictHints", {})) - support_summary = dict((audit_payload or {}).get("supportAuditSummary", {})) - - rows.append( - { - "artifactRoot": root_key, - "artifactPath": str(bundle_dir.relative_to(artifact_root)), - "lossDelta": float( - summary.get("enhanced_cps_native_loss_delta", 0.0) - ), - "candidateBeatsBaseline": bool( - summary.get("candidate_beats_baseline", False) - ), - "largestRegressingFamily": largest_family.get("family"), - "largestRegressingFamilyDelta": ( - float(largest_family.get("loss_contribution_delta", 0.0)) - if largest_family - else None - ), - "top3Families": top3_families, - "largestRegressingTarget": verdict_hints.get( - "largestRegressingTarget" - ), - "missingStoredCriticalInputs": list( - support_summary.get("missingStoredCriticalInputs", ()) - ), - "auditAvailable": audit_payload is not None, - } - ) - - if largest_family.get("family"): - largest_family_counts[str(largest_family["family"])] += 1 - - for rank, family in enumerate(top3_families, start=1): - if not family: 
- continue - family_str = str(family) - top3_family_counts[family_str] += 1 - family_rank_counts[family_str][rank] += 1 - family_counts_by_root[root_key][family_str] += 1 - - target_name = verdict_hints.get("largestRegressingTarget") - if target_name: - target_counts_from_audits[str(target_name)] += 1 - - for variable in support_summary.get("missingStoredCriticalInputs", ()): - missing_critical_inputs_counts[str(variable)] += 1 - - rows.sort(key=lambda row: (-float(row["lossDelta"]), row["artifactRoot"], row["artifactPath"])) - best_rows = sorted( - rows, - key=lambda row: (float(row["lossDelta"]), row["artifactRoot"], row["artifactPath"]), - ) - - return { - "artifactRoots": [str(root) for root in normalized_roots], - "totalScoredRuns": len(rows), - "totalAuditedRuns": sum(1 for row in rows if bool(row["auditAvailable"])), - "largestFamilyCounts": [ - {"family": family, "count": count} - for family, count in _sorted_counter_items(largest_family_counts) - ], - "top3FamilyCounts": [ - { - "family": family, - "top3Count": top3_family_counts[family], - "rank1Count": family_rank_counts[family][1], - "rank2Count": family_rank_counts[family][2], - "rank3Count": family_rank_counts[family][3], - } - for family, _count in _sorted_counter_items(top3_family_counts) - ], - "familyCountsByRoot": { - root: [ - {"family": family, "count": count} - for family, count in _sorted_counter_items(counter) - ] - for root, counter in sorted(family_counts_by_root.items()) - }, - "targetCountsFromAudits": [ - {"target": target, "count": count} - for target, count in _sorted_counter_items(target_counts_from_audits) - ], - "missingCriticalInputsCounts": [ - {"variable": variable, "count": count} - for variable, count in _sorted_counter_items(missing_critical_inputs_counts) - ], - "worstRuns": rows[:top_k], - "bestRuns": best_rows[:top_k], - } - - -def _iter_scored_bundle_dirs(artifact_root: Path) -> tuple[Path, ...]: - scores_hint = get_us_stage_artifact_contract( - 
"09_validation_benchmarking", - "policyengine_native_scores", - ).path_hint - dataset_hint = get_us_stage_artifact_contract( - "08_dataset_assembly", - "policyengine_dataset", - ).path_hint - if scores_hint is None or dataset_hint is None: - return () - return tuple( - sorted( - path.parent - for path in artifact_root.rglob(scores_hint) - if (path.parent / dataset_hint).exists() - ) - ) - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for PE-native regression summary over saved artifacts.""" - - parser = argparse.ArgumentParser( - description="Summarize recurring PE-native regression families for saved US artifacts.", - ) - parser.add_argument( - "artifact_roots", - nargs="+", - help="One or more artifact roots to scan.", - ) - parser.add_argument( - "--top-k", - type=int, - default=10, - help="Number of best/worst runs to include in the output.", - ) - args = parser.parse_args(argv) - - payload = summarize_us_pe_native_regressions(args.artifact_roots, top_k=args.top_k) - print(json.dumps(payload, indent=2, sort_keys=True)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/summarize_policyengine_oracle_regressions.py b/src/microplex_us/pipelines/summarize_policyengine_oracle_regressions.py deleted file mode 100644 index 9893be2c..00000000 --- a/src/microplex_us/pipelines/summarize_policyengine_oracle_regressions.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Summarize recurring calibration-oracle regression families across saved artifacts.""" - -from __future__ import annotations - -import argparse -import json -from collections import Counter, defaultdict -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import get_us_stage_artifact_contract - - -def _sorted_counter_items(counter: Counter[str]) -> list[tuple[str, int]]: - return sorted(counter.items(), key=lambda item: (-int(item[1]), item[0])) - - -def 
summarize_us_policyengine_oracle_regressions( - artifact_roots: list[str | Path] | tuple[str | Path, ...], - *, - loss_scope: str = "full_oracle", - top_k: int = 10, -) -> dict[str, Any]: - """Summarize recurring calibration-oracle regression families for saved artifacts.""" - - rows: list[dict[str, Any]] = [] - largest_family_counts: Counter[str] = Counter() - largest_geography_counts: Counter[str] = Counter() - top3_family_counts: Counter[str] = Counter() - top3_geography_counts: Counter[str] = Counter() - family_rank_counts: dict[str, Counter[int]] = defaultdict(Counter) - geography_rank_counts: dict[str, Counter[int]] = defaultdict(Counter) - family_counts_by_root: dict[str, Counter[str]] = defaultdict(Counter) - geography_counts_by_root: dict[str, Counter[str]] = defaultdict(Counter) - - normalized_roots = [Path(root) for root in artifact_roots] - for artifact_root in normalized_roots: - root_key = artifact_root.name - for bundle_dir in _iter_oracle_bundle_dirs(artifact_root): - manifest = json.loads((bundle_dir / "manifest.json").read_text()) - calibration = dict(manifest.get("calibration", {})) - oracle_loss = dict(calibration.get("oracle_loss", {})) - scope_summary = dict(oracle_loss.get(loss_scope, {})) - family_ranking = [ - row - for row in list(scope_summary.get("family_ranking", ())) - if float(row.get("capped_sum_abs_relative_error", 0.0)) > 0.0 - ] - geography_ranking = [ - row - for row in list(scope_summary.get("geography_ranking", ())) - if float(row.get("capped_sum_abs_relative_error", 0.0)) > 0.0 - ] - - largest_family = family_ranking[0] if family_ranking else {} - largest_geography = geography_ranking[0] if geography_ranking else {} - top3_families = [row.get("group") for row in family_ranking[:3] if row.get("group")] - top3_geographies = [ - row.get("group") for row in geography_ranking[:3] if row.get("group") - ] - - rows.append( - { - "artifactRoot": root_key, - "artifactPath": str(bundle_dir.relative_to(artifact_root)), - "lossScope": 
loss_scope, - "scopeCappedLoss": scope_summary.get( - "capped_mean_abs_relative_error" - ), - "scopeLoss": scope_summary.get("mean_abs_relative_error"), - "activeSolveCappedLoss": calibration.get( - "active_solve_capped_mean_abs_relative_error" - ), - "nConstraints": calibration.get("n_constraints"), - "nSupportedTargets": calibration.get("n_supported_targets"), - "nUnsupportedTargets": calibration.get("n_unsupported_targets"), - "nCalibrationStagesApplied": calibration.get( - "n_calibration_stages_applied" - ), - "largestFamily": largest_family.get("group"), - "largestFamilyCappedLossShare": largest_family.get( - "capped_loss_share" - ), - "largestGeography": largest_geography.get("group"), - "largestGeographyCappedLossShare": largest_geography.get( - "capped_loss_share" - ), - "top3Families": top3_families, - "top3Geographies": top3_geographies, - } - ) - - if largest_family.get("group"): - largest_family_counts[str(largest_family["group"])] += 1 - if largest_geography.get("group"): - largest_geography_counts[str(largest_geography["group"])] += 1 - - for rank, family in enumerate(top3_families, start=1): - family_str = str(family) - top3_family_counts[family_str] += 1 - family_rank_counts[family_str][rank] += 1 - family_counts_by_root[root_key][family_str] += 1 - - for rank, geography in enumerate(top3_geographies, start=1): - geography_str = str(geography) - top3_geography_counts[geography_str] += 1 - geography_rank_counts[geography_str][rank] += 1 - geography_counts_by_root[root_key][geography_str] += 1 - - rows.sort( - key=lambda row: ( - -float(row["scopeCappedLoss"] or 0.0), - row["artifactRoot"], - row["artifactPath"], - ) - ) - best_rows = sorted( - rows, - key=lambda row: ( - float(row["scopeCappedLoss"] or 0.0), - row["artifactRoot"], - row["artifactPath"], - ), - ) - - return { - "artifactRoots": [str(root) for root in normalized_roots], - "lossScope": loss_scope, - "totalScoredRuns": len(rows), - "largestFamilyCounts": [ - {"group": group, "count": 
count} - for group, count in _sorted_counter_items(largest_family_counts) - ], - "largestGeographyCounts": [ - {"group": group, "count": count} - for group, count in _sorted_counter_items(largest_geography_counts) - ], - "top3FamilyCounts": [ - { - "group": group, - "top3Count": top3_family_counts[group], - "rank1Count": family_rank_counts[group][1], - "rank2Count": family_rank_counts[group][2], - "rank3Count": family_rank_counts[group][3], - } - for group, _count in _sorted_counter_items(top3_family_counts) - ], - "top3GeographyCounts": [ - { - "group": group, - "top3Count": top3_geography_counts[group], - "rank1Count": geography_rank_counts[group][1], - "rank2Count": geography_rank_counts[group][2], - "rank3Count": geography_rank_counts[group][3], - } - for group, _count in _sorted_counter_items(top3_geography_counts) - ], - "familyCountsByRoot": { - root: [ - {"group": group, "count": count} - for group, count in _sorted_counter_items(counter) - ] - for root, counter in sorted(family_counts_by_root.items()) - }, - "geographyCountsByRoot": { - root: [ - {"group": group, "count": count} - for group, count in _sorted_counter_items(counter) - ] - for root, counter in sorted(geography_counts_by_root.items()) - }, - "worstRuns": rows[:top_k], - "bestRuns": best_rows[:top_k], - } - - -def _iter_oracle_bundle_dirs(artifact_root: Path) -> tuple[Path, ...]: - dataset_hint = get_us_stage_artifact_contract( - "08_dataset_assembly", - "policyengine_dataset", - ).path_hint - if dataset_hint is None: - return () - return tuple( - sorted( - path.parent - for path in artifact_root.rglob("manifest.json") - if (path.parent / dataset_hint).exists() - ) - ) - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for oracle-regression summaries over saved artifacts.""" - - parser = argparse.ArgumentParser( - description=( - "Summarize recurring calibration-oracle regression families for saved US artifacts." 
- ), - ) - parser.add_argument( - "artifact_roots", - nargs="+", - help="One or more artifact roots to scan.", - ) - parser.add_argument( - "--loss-scope", - default="full_oracle", - help="Which calibration oracle-loss scope to summarize.", - ) - parser.add_argument( - "--top-k", - type=int, - default=10, - help="Number of best/worst runs to include in the output.", - ) - args = parser.parse_args(argv) - - payload = summarize_us_policyengine_oracle_regressions( - args.artifact_roots, - loss_scope=args.loss_scope, - top_k=args.top_k, - ) - print(json.dumps(payload, indent=2, sort_keys=True)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/summarize_policyengine_oracle_target_drilldown.py b/src/microplex_us/pipelines/summarize_policyengine_oracle_target_drilldown.py deleted file mode 100644 index e554329b..00000000 --- a/src/microplex_us/pipelines/summarize_policyengine_oracle_target_drilldown.py +++ /dev/null @@ -1,390 +0,0 @@ -"""Summarize the worst calibration-oracle target cells for one saved artifact.""" - -from __future__ import annotations - -import argparse -import json -from collections import Counter -from pathlib import Path -from typing import Any - -from microplex_us.pipelines.stage_contracts import ( - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexPipeline, - _policyengine_target_ledger_entry, - _policyengine_target_loss_family_key, - _policyengine_target_loss_geography_key, -) -from microplex_us.policyengine import ( - PolicyEngineUSDBTargetProvider, - evaluate_policyengine_us_target_set, - load_policyengine_us_entity_tables, -) - - -def summarize_us_policyengine_oracle_target_drilldown( - artifact_dir: str | Path, - *, - family: str | None = None, - geography: str | None = None, - stage: str | None = None, - top_k: int | None = 25, -) -> dict[str, Any]: - """Evaluate one saved artifact against its oracle targets 
and list the worst cells.""" - - bundle_dir = Path(artifact_dir) - manifest_path = bundle_dir / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - config_payload = dict(manifest.get("config", {})) - config = USMicroplexBuildConfig(**config_payload) - if config.policyengine_targets_db is None: - raise ValueError("Artifact config does not define policyengine_targets_db") - - dataset_name = dict(manifest.get("artifacts", {})).get("policyengine_dataset") - dataset_path = ( - _resolve_manifest_artifact_path(bundle_dir, str(dataset_name)) - if dataset_name is not None - else resolve_us_stage_artifact_contract_path( - bundle_dir, - "08_dataset_assembly", - "policyengine_dataset", - ) - ) - if not dataset_path.exists(): - raise FileNotFoundError(f"PolicyEngine dataset not found: {dataset_path}") - - target_db_path = Path(config.policyengine_targets_db).expanduser() - if not target_db_path.is_absolute(): - target_db_path = (bundle_dir / target_db_path).resolve() - period = int( - config.policyengine_target_period or config.policyengine_dataset_year or 2024 - ) - pipeline = USMicroplexPipeline(config) - tables = load_policyengine_us_entity_tables(dataset_path, period=period) - provider = PolicyEngineUSDBTargetProvider(target_db_path) - ( - tables, - _bindings, - canonical_targets, - _compiled_targets, - _unsupported_targets, - _compiled_constraints, - _supported_targets, - _constraints, - _feasibility_filter_summary, - calibration_materialized_variables, - _materialization_failures, - _fixed_spine_residualization_summary, - ) = pipeline._resolve_policyengine_calibration_targets( - tables, - provider=provider, - target_period=period, - ) - - simulation_cls = ( - config.policyengine_simulation_cls - if isinstance(config.policyengine_simulation_cls, type) - else None - ) - report = evaluate_policyengine_us_target_set( - tables, - canonical_targets, - period=period, - dataset_year=config.policyengine_dataset_year or period, - simulation_cls=simulation_cls, - 
label=bundle_dir.name, - strict_materialization=False, - direct_override_variables=tuple(config.policyengine_direct_override_variables), - ) - - relative_error_cap = manifest.get("calibration", {}).get( - "oracle_relative_error_cap", - config.policyengine_oracle_relative_error_cap, - ) - if relative_error_cap is not None: - relative_error_cap = float(relative_error_cap) - target_ledger = list(manifest.get("calibration", {}).get("target_ledger", ())) - materialized_variables = { - str(variable) - for variable in manifest.get("calibration", {}).get( - "materialized_variables", () - ) - } - materialized_variables.update( - str(variable) for variable in calibration_materialized_variables - ) - materialized_variables.update( - str(variable) for variable in report.materialized_variables - ) - ledger_by_name = { - str(entry["target_name"]): dict(entry) - for entry in target_ledger - if entry.get("target_name") is not None - } - - rows: list[dict[str, Any]] = [] - for evaluation in report.evaluations: - rows.append( - _oracle_target_row( - target=evaluation.target, - ledger_entry=ledger_by_name.get(evaluation.target.name), - actual_value=float(evaluation.actual_value), - relative_error=evaluation.relative_error, - relative_error_cap=relative_error_cap, - unsupported=False, - household_count=len(tables.households), - materialized_variables=materialized_variables, - ) - ) - unsupported_penalty = relative_error_cap if relative_error_cap is not None else 1.0 - for target in report.unsupported_targets: - rows.append( - _oracle_target_row( - target=target, - ledger_entry=ledger_by_name.get(target.name), - actual_value=None, - relative_error=None, - relative_error_cap=relative_error_cap, - unsupported=True, - unsupported_penalty=unsupported_penalty, - household_count=len(tables.households), - materialized_variables=materialized_variables, - ) - ) - - filtered_rows = [ - row - for row in rows - if (family is None or row["loss_family"] == family) - and (geography is None or 
row["loss_geography"] == geography) - and (stage is None or row["stage"] == stage) - ] - filtered_rows.sort( - key=lambda row: ( - -float(row.get("capped_abs_relative_error") or -1.0), - -float(row.get("abs_relative_error") or -1.0), - row["target_name"], - ) - ) - - return { - "artifactDir": str(bundle_dir), - "datasetPath": str(dataset_path), - "targetDbPath": str(target_db_path), - "period": period, - "filters": { - "family": family, - "geography": geography, - "stage": stage, - "topK": int(top_k) if top_k is not None else None, - }, - "summary": { - "targetCount": len(filtered_rows), - "supportedTargetCount": sum( - 1 for row in filtered_rows if not row["unsupported"] - ), - "unsupportedTargetCount": sum( - 1 for row in filtered_rows if row["unsupported"] - ), - "stageCounts": { - key: int(value) - for key, value in sorted( - Counter(str(row["stage"]) for row in filtered_rows).items() - ) - }, - "largestFamilies": _top_counts(filtered_rows, "loss_family", top_k=10), - "largestGeographies": _top_counts( - filtered_rows, "loss_geography", top_k=10 - ), - "largestFamiliesByCappedError": _top_error_mass( - filtered_rows, - "loss_family", - top_k=10, - ), - "largestGeographiesByCappedError": _top_error_mass( - filtered_rows, - "loss_geography", - top_k=10, - ), - }, - "topRows": filtered_rows[:top_k] if top_k is not None else filtered_rows, - } - - -def _resolve_manifest_artifact_path(bundle_dir: Path, artifact_name: str) -> Path: - artifact_path = Path(artifact_name) - if artifact_path.is_absolute(): - return artifact_path - return (bundle_dir / artifact_path).resolve() - - -def _oracle_target_row( - *, - target: Any, - ledger_entry: dict[str, Any] | None, - actual_value: float | None, - relative_error: float | None, - relative_error_cap: float | None, - unsupported: bool, - household_count: int, - materialized_variables: set[str], - unsupported_penalty: float | None = None, -) -> dict[str, Any]: - entry = ( - dict(ledger_entry) - if ledger_entry is not None - else 
_policyengine_target_ledger_entry( - target=target, - stage="unknown", - reason="missing_manifest_ledger_entry", - household_count=household_count, - ) - ) - abs_relative_error = ( - abs(float(relative_error)) if relative_error is not None else None - ) - capped_abs_relative_error = abs_relative_error - if capped_abs_relative_error is not None and relative_error_cap is not None: - capped_abs_relative_error = min( - capped_abs_relative_error, float(relative_error_cap) - ) - if unsupported: - capped_abs_relative_error = ( - float(unsupported_penalty) - if unsupported_penalty is not None - else capped_abs_relative_error - ) - variable = str(entry.get("variable") or "") - domain_variable = entry.get("domain_variable") - driver_variable = str(domain_variable or variable or "") - driver_is_materialized = driver_variable in materialized_variables - variable_is_materialized = variable in materialized_variables if variable else False - domain_is_materialized = ( - str(domain_variable) in materialized_variables - if domain_variable is not None - else False - ) - target_value = float(target.value) - absolute_error = ( - abs(float(actual_value) - target_value) if actual_value is not None else None - ) - return { - "target_name": target.name, - "stage": str(entry.get("stage") or "unknown"), - "reason": str(entry.get("reason") or "unknown"), - "loss_family": _policyengine_target_loss_family_key(entry), - "loss_geography": _policyengine_target_loss_geography_key(entry), - "family": str(entry.get("family") or ""), - "variable": variable, - "domain_variable": domain_variable, - "driver_variable": driver_variable, - "driver_is_materialized": driver_is_materialized, - "variable_is_materialized": variable_is_materialized, - "domain_is_materialized": domain_is_materialized, - "provenance_class": ( - "policyengine_materialized" if driver_is_materialized else "stored_input" - ), - "geo_level": entry.get("geo_level"), - "geographic_id": entry.get("geographic_id"), - "unsupported": 
bool(unsupported), - "target_value": target_value, - "actual_value": float(actual_value) if actual_value is not None else None, - "absolute_error": absolute_error, - "relative_error": float(relative_error) if relative_error is not None else None, - "abs_relative_error": abs_relative_error, - "capped_abs_relative_error": capped_abs_relative_error, - "active_households": entry.get("active_households"), - "active_support_share": entry.get("active_support_share"), - "filters": list(entry.get("filters") or ()), - } - - -def _top_counts( - rows: list[dict[str, Any]], - key: str, - *, - top_k: int, -) -> list[dict[str, Any]]: - counter = Counter(str(row[key]) for row in rows if row.get(key) is not None) - return [ - {"group": group, "count": int(count)} - for group, count in sorted( - counter.items(), key=lambda item: (-int(item[1]), item[0]) - )[:top_k] - ] - - -def _top_error_mass( - rows: list[dict[str, Any]], - key: str, - *, - top_k: int, -) -> list[dict[str, Any]]: - grouped: dict[str, dict[str, float | int]] = {} - for row in rows: - group = row.get(key) - capped_error = row.get("capped_abs_relative_error") - if group is None or capped_error is None: - continue - bucket = grouped.setdefault( - str(group), - {"cappedErrorMass": 0.0, "count": 0}, - ) - bucket["cappedErrorMass"] = float(bucket["cappedErrorMass"]) + float( - capped_error - ) - bucket["count"] = int(bucket["count"]) + 1 - ranked = sorted( - grouped.items(), - key=lambda item: ( - -float(item[1]["cappedErrorMass"]), - -int(item[1]["count"]), - item[0], - ), - ) - return [ - { - "group": group, - "cappedErrorMass": float(metrics["cappedErrorMass"]), - "count": int(metrics["count"]), - "meanCappedError": float(metrics["cappedErrorMass"]) - / int(metrics["count"]), - } - for group, metrics in ranked[:top_k] - ] - - -def main(argv: list[str] | None = None) -> int: - """CLI entrypoint for one-artifact oracle drilldowns.""" - - parser = argparse.ArgumentParser( - description="Summarize the worst 
calibration-oracle target cells for one saved artifact.", - ) - parser.add_argument("artifact_dir", help="Saved artifact bundle directory.") - parser.add_argument("--family", help="Exact loss-family key to filter to.") - parser.add_argument("--geography", help="Exact loss-geography key to filter to.") - parser.add_argument("--stage", help="Exact target-ledger stage to filter to.") - parser.add_argument( - "--top-k", - type=int, - default=25, - help="Number of rows to keep in the output.", - ) - args = parser.parse_args(argv) - - payload = summarize_us_policyengine_oracle_target_drilldown( - args.artifact_dir, - family=args.family, - geography=args.geography, - stage=args.stage, - top_k=args.top_k, - ) - print(json.dumps(payload, indent=2, sort_keys=True)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/transparency_sidecars.py b/src/microplex_us/pipelines/transparency_sidecars.py deleted file mode 100644 index 75d91d74..00000000 --- a/src/microplex_us/pipelines/transparency_sidecars.py +++ /dev/null @@ -1,644 +0,0 @@ -"""Write non-gating transparency sidecars for Microplex artifact bundles.""" - -from __future__ import annotations - -import argparse -import json -import re -from collections import Counter -from datetime import datetime -from pathlib import Path -from typing import Any - -TRANSPARENCY_SIDECAR_SCHEMA_VERSION = 1 -DEFAULT_CONTRACT_PATH = Path(__file__).with_name("ecps_export_contract.json") -DEFAULT_OUTPUT_DIRNAME = "transparency" - -_TIMESTAMPED_LINE_RE = re.compile(r"^\[(?P[^\]]+)\]\s*(?P.*)$") -_KEY_RE = re.compile(r"(?P[A-Za-z_][A-Za-z0-9_]*)=") - - -def write_transparency_sidecars( - artifact_root: str | Path, - *, - dataset_path: str | Path | None = None, - log_paths: list[str | Path] | tuple[str | Path, ...] 
| None = None, - contract_path: str | Path = DEFAULT_CONTRACT_PATH, - output_dir: str | Path | None = None, -) -> dict[str, Any]: - """Write source, row, column, imputation, and calibration sidecars. - - These sidecars are observability artifacts. They intentionally do not - decide whether a dataset is production-ready; release performance remains - governed by the loss comparison. - """ - - root = Path(artifact_root).expanduser().resolve() - dataset = ( - Path(dataset_path).expanduser().resolve() - if dataset_path is not None - else root / "policyengine_us.h5" - ) - logs = _resolve_log_paths(root, log_paths) - destination = ( - Path(output_dir).expanduser().resolve() - if output_dir is not None - else root / DEFAULT_OUTPUT_DIRNAME - ) - destination.mkdir(parents=True, exist_ok=True) - - generated_at = _now_iso() - log_summary = _parse_logs(logs) - outputs: dict[str, Path] = {} - - common = { - "schema_version": TRANSPARENCY_SIDECAR_SCHEMA_VERSION, - "generated_at": generated_at, - "artifact_root": str(root), - "non_gating": True, - "production_performance_gate": "loss", - } - - source_manifest = { - **common, - "logs": [str(path) for path in logs], - "wrapper_events": log_summary["wrapper_events"], - "build_config": log_summary["build_config"], - "source_events": log_summary["source_events"], - "failures": log_summary["failures"], - } - outputs["source_manifest"] = _write_json( - destination / "source_manifest.json", - source_manifest, - ) - - imputation_manifest = { - **common, - "logs": [str(path) for path in logs], - "donor_integration": log_summary["donor_integration"], - } - outputs["imputation_manifest"] = _write_json( - destination / "imputation_manifest.json", - imputation_manifest, - ) - - column_manifest = _build_column_manifest( - dataset, - contract_path=Path(contract_path), - common=common, - ) - outputs["column_manifest"] = _write_json( - destination / "column_manifest.json", - column_manifest, - ) - - row_count_manifest = 
_build_row_count_manifest(dataset, root=root, common=common) - outputs["row_count_manifest"] = _write_json( - destination / "row_count_manifest.json", - row_count_manifest, - ) - - calibration_trace = _build_calibration_trace(root, common=common) - outputs["calibration_trace"] = _write_json( - destination / "calibration_trace.json", - calibration_trace, - ) - - summary = { - **common, - "output_dir": str(destination), - "sidecars": {key: str(path) for key, path in sorted(outputs.items())}, - "dataset_path": str(dataset), - "dataset_available": dataset.exists(), - "log_paths": [str(path) for path in logs], - } - outputs["summary"] = _write_json( - destination / "transparency_summary.json", - summary, - ) - return summary - - -def _now_iso() -> str: - return datetime.now().astimezone().isoformat(timespec="seconds") - - -def _write_json(path: Path, payload: dict[str, Any]) -> Path: - path.parent.mkdir(parents=True, exist_ok=True) - temp_path = path.with_name(f".{path.name}.tmp") - temp_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - temp_path.replace(path) - return path - - -def _resolve_log_paths( - artifact_root: Path, - log_paths: list[str | Path] | tuple[str | Path, ...] 
| None, -) -> list[Path]: - if log_paths: - return [Path(path).expanduser().resolve() for path in log_paths] - candidates = [ - artifact_root / "logs" / "gate1_build.log", - artifact_root / "logs" / "rebuild.log", - artifact_root / "resume_correct_targets" / "logs" / "resume.log", - ] - return [path for path in candidates if path.exists()] - - -def _parse_logs(paths: list[Path]) -> dict[str, Any]: - wrapper_events: list[dict[str, Any]] = [] - source_events: list[dict[str, Any]] = [] - failures: list[dict[str, Any]] = [] - donor_sources: dict[str, dict[str, Any]] = {} - build_config: dict[str, Any] = {} - - for path in paths: - if not path.exists(): - failures.append({"log_path": str(path), "reason": "missing_log"}) - continue - for line_number, raw_line in enumerate(path.read_text().splitlines(), start=1): - line = raw_line.strip() - if not line: - continue - timestamp, message = _split_timestamped_line(line) - if message.startswith(("Starting ", "Shape:", "Artifact root:")) or ( - message.startswith("Microplex git") or message.startswith("Free disk:") - ): - wrapper_events.append( - { - "log_path": str(path), - "line": line_number, - "timestamp": timestamp, - "message": message, - } - ) - if line.startswith("PE-US-data rebuild checkpoint: starting build"): - build_config = _parse_last_bracket_payload(line) - build_config["log_path"] = str(path) - build_config["line"] = line_number - elif _is_source_event(message): - source_events.append( - { - "log_path": str(path), - "line": line_number, - "timestamp": timestamp, - "message": message, - } - ) - elif "Traceback (most recent call last)" in line: - failures.append( - { - "log_path": str(path), - "line": line_number, - "timestamp": timestamp, - "reason": "traceback", - } - ) - elif re.match(r"^[A-Za-z_][A-Za-z0-9_]*(Error|Exception):", line): - failures.append( - { - "log_path": str(path), - "line": line_number, - "timestamp": timestamp, - "reason": "exception", - "message": line, - } - ) - - if line.startswith("US 
microplex donor integration:"): - _record_donor_event( - donor_sources, - path=path, - line_number=line_number, - timestamp=timestamp, - message=line, - ) - - return { - "wrapper_events": wrapper_events, - "build_config": build_config, - "source_events": source_events, - "failures": failures, - "donor_integration": { - "sources": [ - _finalize_donor_source(source) - for source in sorted( - donor_sources.values(), - key=lambda item: str(item.get("donor_source", "")), - ) - ] - }, - } - - -def _split_timestamped_line(line: str) -> tuple[str | None, str]: - match = _TIMESTAMPED_LINE_RE.match(line) - if match is None: - return None, line - return match.group("timestamp"), match.group("message") - - -def _is_source_event(message: str) -> bool: - return message.startswith( - ( - "Downloading ", - "Downloaded ", - "Parsing ", - "Cached processed data", - "Using repo-local PUF", - "Loading PUF", - "Loading demographics", - "Loading processed CPS", - "Expanded ", - " Raw records:", - " After demographics merge:", - ) - ) - - -def _parse_last_bracket_payload(line: str) -> dict[str, Any]: - start = line.rfind("[") - end = line.rfind("]") - if start == -1 or end == -1 or end <= start: - return {} - return _parse_key_values(line[start + 1 : end]) - - -def _parse_key_values(payload: str) -> dict[str, Any]: - matches = list(_KEY_RE.finditer(payload)) - fields: dict[str, Any] = {} - for index, match in enumerate(matches): - value_start = match.end() - value_end = ( - matches[index + 1].start() if index + 1 < len(matches) else len(payload) - ) - value = payload[value_start:value_end].strip().rstrip(",").strip() - fields[match.group("key")] = _coerce_scalar(value) - return fields - - -def _coerce_scalar(value: str) -> Any: - if value in {"true", "True"}: - return True - if value in {"false", "False"}: - return False - if value in {"none", "None", "null"}: - return None - if re.fullmatch(r"-?\d+", value): - try: - return int(value) - except ValueError: - return value - if 
re.fullmatch(r"-?\d+\.\d+", value): - try: - return float(value) - except ValueError: - return value - return value - - -def _record_donor_event( - donor_sources: dict[str, dict[str, Any]], - *, - path: Path, - line_number: int, - timestamp: str | None, - message: str, -) -> None: - payload = _parse_last_bracket_payload(message) - if payload.get("donor_source") is None: - return - donor_source = str(payload["donor_source"]) - source = donor_sources.setdefault( - donor_source, - { - "donor_source": donor_source, - "source_events": [], - "entity_id_events": [], - "blocks": {}, - }, - ) - event = { - "log_path": str(path), - "line": line_number, - "timestamp": timestamp, - "fields": payload, - } - if ( - "source start" in message - or "source ready" in message - or "source complete" in message - ): - source["source_events"].append(event) - if "source ready" in message: - source["ready"] = payload - if "source complete" in message: - source["complete"] = payload - return - if "entity ids " in message: - source["entity_id_events"].append(event) - return - if "block " not in message: - return - - block_name = str(payload.get("block") or "unknown") - block = source["blocks"].setdefault( - block_name, - { - "block": block_name, - "restored": payload.get("restored"), - "started_count": 0, - "completed_count": 0, - "run_events": [], - "complete_events": [], - }, - ) - if "block start" in message: - block["started_count"] += 1 - block["last_start"] = event - block["restored"] = payload.get("restored", block.get("restored")) - elif "block run" in message: - block["run_events"].append(event) - block["last_run"] = event - elif "block complete" in message: - block["completed_count"] += 1 - block["complete_events"].append(event) - block["last_complete"] = event - block["integrated_vars"] = payload.get("integrated_vars") - - -def _finalize_donor_source(source: dict[str, Any]) -> dict[str, Any]: - blocks = [ - block - for block in sorted( - source.get("blocks", {}).values(), - 
key=lambda item: str(item.get("block", "")), - ) - ] - completed = [ - block["block"] - for block in blocks - if int(block.get("completed_count") or 0) - >= int(block.get("started_count") or 0) - ] - active = [ - block["block"] - for block in blocks - if int(block.get("completed_count") or 0) < int(block.get("started_count") or 0) - ] - return { - **source, - "block_count": len(blocks), - "completed_block_count": len(completed), - "completed_blocks": completed, - "active_blocks": active, - "blocks": blocks, - } - - -def _build_column_manifest( - dataset_path: Path, - *, - contract_path: Path, - common: dict[str, Any], -) -> dict[str, Any]: - contract = _load_contract(contract_path) - required = set(contract.get("required", [])) - forbidden = set(contract.get("forbidden", [])) - optional = set(contract.get("ecps_internal_optional", [])) - formula_owned_excluded = set(contract.get("formula_owned_excluded", [])) - if not dataset_path.exists(): - return { - **common, - "dataset_path": str(dataset_path), - "available": False, - "reason": "dataset_not_found", - "contract_path": str(contract_path), - "required_count": len(required), - "forbidden_count": len(forbidden), - } - - present = _h5_top_level_columns(dataset_path) - known = required | forbidden | optional | formula_owned_excluded - missing_required = sorted(required - present) - forbidden_present = sorted(forbidden & present) - formula_owned_excluded_present = sorted(formula_owned_excluded & present) - return { - **common, - "dataset_path": str(dataset_path), - "available": True, - "contract_path": str(contract_path), - "present_count": len(present), - "required_count": len(required), - "forbidden_count": len(forbidden), - "missing_required_count": len(missing_required), - "forbidden_present_count": len(forbidden_present), - "formula_owned_excluded_present_count": len(formula_owned_excluded_present), - "missing_required": missing_required, - "forbidden_present": forbidden_present, - 
"formula_owned_excluded_present": formula_owned_excluded_present, - "extra_unknown": sorted(present - known), - "present_columns": sorted(present), - "diagnostic_status": "clean" - if not missing_required and not forbidden_present - else "needs_attention", - } - - -def _load_contract(path: Path) -> dict[str, Any]: - return json.loads(path.read_text()) - - -def _h5_top_level_columns(dataset_path: Path) -> set[str]: - import h5py - - with h5py.File(dataset_path, "r") as h5: - return {str(key).split("/")[0] for key in h5.keys()} - - -def _build_row_count_manifest( - dataset_path: Path, - *, - root: Path, - common: dict[str, Any], -) -> dict[str, Any]: - checkpoint_counts = _load_checkpoint_counts(root) - if not dataset_path.exists(): - return { - **common, - "dataset_path": str(dataset_path), - "available": False, - "reason": "dataset_not_found", - "checkpoint_counts": checkpoint_counts, - } - - h5_summary = _summarize_h5_variables(dataset_path) - return { - **common, - "dataset_path": str(dataset_path), - "available": True, - "checkpoint_counts": checkpoint_counts, - "shape_counts": h5_summary["shape_counts"], - "variables": h5_summary["variables"], - } - - -def _summarize_h5_variables(dataset_path: Path) -> dict[str, Any]: - import h5py - - variables: list[dict[str, Any]] = [] - shape_counter: Counter[str] = Counter() - shape_examples: dict[str, list[str]] = {} - with h5py.File(dataset_path, "r") as h5: - for name in sorted(h5.keys()): - obj = h5[name] - periods = [] - if isinstance(obj, h5py.Dataset): - periods.append(_dataset_summary("flat", obj)) - else: - for period in sorted(obj.keys()): - child = obj[period] - if isinstance(child, h5py.Dataset): - periods.append(_dataset_summary(str(period), child)) - for period in periods: - shape_key = "x".join(str(part) for part in period["shape"]) - shape_counter[shape_key] += 1 - shape_examples.setdefault(shape_key, []).append(name) - variables.append({"name": name, "periods": periods}) - shape_counts = [ - { - "shape": 
key, - "variable_count": count, - "example_variables": shape_examples.get(key, [])[:12], - } - for key, count in sorted( - shape_counter.items(), key=lambda item: (-item[1], item[0]) - ) - ] - return {"variables": variables, "shape_counts": shape_counts} - - -def _dataset_summary(period: str, dataset: Any) -> dict[str, Any]: - shape = [int(part) for part in dataset.shape] - return { - "period": period, - "shape": shape, - "rows": shape[0] if shape else None, - "dtype": str(dataset.dtype), - } - - -def _load_checkpoint_counts(root: Path) -> dict[str, Any]: - candidates = [ - root / "record_count_probe.json", - root / "resume_summary.json", - root / "resume_correct_targets" / "resume_summary.json", - root / "post-microsim" / "metadata.json", - root / "resume_correct_targets" / "post-microsim" / "metadata.json", - root / "checkpoints" / "post-microsim" / "metadata.json", - root / "checkpoints" / "post-imputation" / "metadata.json", - ] - loaded: dict[str, Any] = {} - for path in candidates: - if not path.exists(): - continue - try: - payload = json.loads(path.read_text()) - except json.JSONDecodeError: - continue - loaded[str(path)] = _extract_row_counts(payload) - return loaded - - -def _extract_row_counts(payload: dict[str, Any]) -> dict[str, Any]: - counts: dict[str, Any] = {} - for key in ("households", "persons", "tax_units", "families", "spm_units"): - value = payload.get(key) - if isinstance(value, dict) and "rows" in value: - counts[key] = value["rows"] - elif isinstance(value, int | float): - counts[key] = int(value) - for key in ("calibrated_rows", "row_count", "person_count", "household_count"): - if key in payload: - counts[key] = payload[key] - return counts - - -def _build_calibration_trace(root: Path, *, common: dict[str, Any]) -> dict[str, Any]: - candidates = [ - root / "calibration_summary.json", - root / "resume_correct_targets" / "calibration_summary.json", - ] - summaries = [] - for path in candidates: - if not path.exists(): - continue - try: - 
payload = json.loads(path.read_text()) - except json.JSONDecodeError: - continue - summaries.append( - { - "path": str(path), - "backend": payload.get("backend"), - "period": payload.get("period"), - "converged": payload.get("converged"), - "n_loaded_targets": payload.get("n_loaded_targets"), - "n_supported_targets": payload.get("n_supported_targets"), - "n_unsupported_targets": payload.get("n_unsupported_targets"), - "full_oracle_capped_mean_abs_relative_error": payload.get( - "full_oracle_capped_mean_abs_relative_error" - ), - "feasibility_filter": payload.get("feasibility_filter"), - "calibration_stages": payload.get("calibration_stages", []), - } - ) - return { - **common, - "available": bool(summaries), - "summaries": summaries, - } - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - prog="microplex-us-write-transparency-sidecars", - description=( - "Write non-gating source, row, column, imputation, and calibration " - "sidecars for one Microplex artifact root." - ), - ) - parser.add_argument("artifact_root", help="Microplex artifact root to inspect.") - parser.add_argument( - "--dataset", - help="PolicyEngine H5 path. Defaults to ARTIFACT_ROOT/policyengine_us.h5.", - ) - parser.add_argument( - "--log", - action="append", - dest="logs", - help="Log path to parse. May be repeated. Defaults to common artifact logs.", - ) - parser.add_argument( - "--contract", - default=str(DEFAULT_CONTRACT_PATH), - help="eCPS column contract path.", - ) - parser.add_argument( - "--output-dir", - help="Output directory. 
Defaults to ARTIFACT_ROOT/transparency.", - ) - args = parser.parse_args(argv) - summary = write_transparency_sidecars( - args.artifact_root, - dataset_path=args.dataset, - log_paths=args.logs, - contract_path=args.contract, - output_dir=args.output_dir, - ) - print(json.dumps(summary, indent=2, sort_keys=True)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/microplex_us/pipelines/us.py b/src/microplex_us/pipelines/us.py deleted file mode 100644 index 4ec0985c..00000000 --- a/src/microplex_us/pipelines/us.py +++ /dev/null @@ -1,11806 +0,0 @@ -"""Library-first US microplex build pipeline.""" - -from __future__ import annotations - -import logging -import sys -import time -import warnings -from collections import Counter -from collections.abc import Iterable, Mapping -from dataclasses import asdict, dataclass, field, replace -from functools import lru_cache -from pathlib import Path -from tempfile import TemporaryDirectory -from types import FunctionType -from typing import Any, Literal - -import h5py -import numpy as np -import pandas as pd -from microplex.calibration import ( - Calibrator, - HardConcreteCalibrator, - LinearConstraint, - SparseCalibrator, -) -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceDescriptor, - SourceProvider, - SourceQuery, - TimeStructure, -) -from microplex.fusion import FusionPlan -from microplex.geography import GeographyQuery -from microplex.hierarchical import TaxUnitOptimizer -from microplex.synthesizer import Synthesizer -from microplex.targets import TargetQuery, TargetSpec - -from microplex_us.data_sources.cps import ( - RETIREMENT_CATCH_UP_AGE, - RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR, - TAXABLE_PENSION_FRACTION, -) -from microplex_us.data_sources.forbes import ( - ForbesFixedSpine, - ForbesFixedSpineConfig, - append_forbes_fixed_spine_tables, - build_forbes_fixed_spine, - 
residualize_targets_for_fixed_spine, -) -from microplex_us.geography import ( - BlockGeography, - normalize_us_county_fips, -) -from microplex_us.pe_source_impute_engine import ( - PE_SOURCE_IMPUTE_BLOCK_ENGINE, - PESourceImputeBlockRunRequest, - PESourceImputeConditionedBlockRunRequest, -) -from microplex_us.pipelines.check_export_columns import ( - _format_report as _format_export_column_report, -) -from microplex_us.pipelines.check_export_columns import ( - compute_column_diff, - load_contract, -) -from microplex_us.pipelines.donor_imputers import ( - ColumnwiseQRFDonorImputer, - RegimeAwareDonorImputer, -) -from microplex_us.pipelines.pe_l0 import ( - PolicyEngineL0Calibrator, - make_policyengine_us_data_fit_l0_weights_fn, -) -from microplex_us.pipelines.pe_native_optimization import ( - optimize_policyengine_us_native_loss_dataset, -) -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - canonicalize_us_pipeline_stage_id, - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_manifest_io import write_json_atomically -from microplex_us.pipelines.stage_policyengine_artifacts import ( - write_us_policyengine_entity_stage_artifact, -) -from microplex_us.pipelines.stage_run import ( - USArtifactRef, - USCalibrationOutputs, - USDiagnosticOutput, - USDonorSynthesisOutputs, - USPolicyEngineEntityOutputs, - USSeedScaffoldOutputs, - USSourceLoadingOutputs, - USSourcePlanningOutputs, -) -from microplex_us.pipelines.stage_runtime import USStageRuntimeWriter -from microplex_us.policyengine.aotc import ( - qualifying_expenses_from_american_opportunity_credit, -) -from microplex_us.policyengine.comparison import ( - evaluate_policyengine_us_target_set, - slice_policyengine_us_target_evaluation_report, -) -from microplex_us.policyengine.takeup import ( - DEFAULT_MEDICAID_TAKEUP_RATE, - DEFAULT_PREGNANCY_RATE, - DEFAULT_VOLUNTARY_FILING_RATE, - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN, - 
VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - WIC_TAKEUP_CATEGORY_BREASTFEEDING, - WIC_TAKEUP_CATEGORY_CHILD, - WIC_TAKEUP_CATEGORY_INFANT, - WIC_TAKEUP_CATEGORY_NONE, - WIC_TAKEUP_CATEGORY_POSTPARTUM, - WIC_TAKEUP_CATEGORY_PREGNANT, - _load_microplex_eitc_takeup_rates, - _load_microplex_medicaid_takeup_rates, - _load_microplex_pregnancy_rates, - _load_microplex_takeup_rate, - _load_microplex_voluntary_filing_rates, - _load_microplex_wic_nutritional_risk_rates, - _load_microplex_wic_takeup_rates, - _microplex_seeded_rng, -) -from microplex_us.policyengine.target_profiles import ( - PolicyEngineUSTargetCell, - resolve_policyengine_us_target_profile, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSMicrosimulationAdapter, - PolicyEngineUSQuantityTarget, - PolicyEngineUSVariableBinding, - build_policyengine_us_export_column_names, - build_policyengine_us_export_variable_maps, - build_policyengine_us_time_period_arrays, - compile_supported_policyengine_us_household_linear_constraints, - compute_marketplace_plan_benchmark_ratio, - filter_supported_policyengine_us_targets, - infer_policyengine_us_variable_bindings, - load_us_pipeline_checkpoint, - materialize_policyengine_us_variables_safely, - policyengine_us_formula_variables_for_targets, - policyengine_us_variables_to_materialize, - resolve_policyengine_excluded_export_variables, - save_us_pipeline_checkpoint, - write_policyengine_us_time_period_dataset, -) -from microplex_us.policyengine.us import ( - subset_policyengine_tables_by_households as _subset_policyengine_tables_by_households, -) -from microplex_us.targets.arch import resolve_arch_sqlite_target_provider -from microplex_us.variables import ( - PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS, - UNSPLIT_DIVIDEND_QUALIFIED_SHARE, - DonorMatchStrategy, - VariableSupportFamily, - donor_imputation_block_specs, - normalize_dividend_columns, - 
normalize_social_security_columns, - prune_redundant_variables, - score_donor_condition_var, - social_security_retirement_compatible_amount, - variable_semantic_spec_for, -) - -LOGGER = logging.getLogger(__name__) - -PUF_SUPPORT_CLONE_FLAG_COLUMN = "person_is_puf_clone" -PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN = "_puf_support_clone_source_row_id" - -PUF_SUPPORT_CLONE_IMPUTED_VARIABLES: tuple[str, ...] = ( - "employment_income", - "partnership_s_corp_income", - "social_security", - "taxable_pension_income", - "interest_deduction", - "tax_exempt_pension_income", - "long_term_capital_gains", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "taxable_ira_distributions", - "self_employment_income", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", - "sstb_self_employment_income_before_lsr", - "sstb_self_employment_income", - "sstb_self_employment_income_would_be_qualified", - "sstb_w2_wages_from_qualified_business", - "sstb_unadjusted_basis_qualified_property", - "short_term_capital_gains", - "qualified_dividend_income", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "taxable_interest_income", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "rental_income", - "non_qualified_dividend_income", - "cdcc_relevant_expenses", - "tax_exempt_interest_income", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "farm_income", - "partnership_se_income", - "alimony_income", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "traditional_ira_contributions", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - 
"investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "home_mortgage_interest", - "investment_interest_expense", - "other_health_insurance_premiums", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", - "self_employment_income_would_be_qualified", -) - -PUF_SUPPORT_CLONE_CPS_REFRESH_CONDITION_VARIABLES: tuple[str, ...] = ( - "age", - "is_male", - "state_fips", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - "employment_income", - "self_employment_income", - "social_security", -) - -PUF_SUPPORT_CLONE_CPS_REFRESH_INCOME_VARIABLES: frozenset[str] = frozenset( - { - "employment_income", - "self_employment_income", - "social_security", - } -) - -PUF_SUPPORT_CLONE_CPS_DIRECT_PASSTHROUGH_ALIASES: Mapping[str, tuple[str, ...]] = { - "taxable_unemployment_compensation": ("unemployment_compensation",), -} -PUF_SUPPORT_CLONE_CPS_TAXABLE_INTEREST_FALLBACK_SHARE = 0.680 -PUF_SUPPORT_CLONE_CPS_SPLIT_TOTALS: tuple[ - tuple[tuple[str, str], str, float], - ..., -] = ( - ( - ("taxable_interest_income", "tax_exempt_interest_income"), - "interest_income", - PUF_SUPPORT_CLONE_CPS_TAXABLE_INTEREST_FALLBACK_SHARE, - ), - ( - ("taxable_pension_income", "tax_exempt_pension_income"), - "pension_income", - TAXABLE_PENSION_FRACTION, - ), -) -PUF_SUPPORT_CLONE_CPS_DIVIDEND_TOTAL_ALIAS = "dividend_income" - -# Refresh categorical/status fields against the PUF 
income surface, but never -# overwrite amount fields here. PUF and CPS income amounts must come from donor -# imputation/calibration, not from post-hoc bucket or nearest-neighbor surgery. -PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES: tuple[str, ...] = ( - "is_male", - "cps_race", - "is_hispanic", - "detailed_occupation_recode", - "treasury_tipped_occupation_code", - "is_disabled", - "difficulty_seeing", - "difficulty_hearing", - "difficulty_walking_or_climbing_stairs", - "difficulty_dressing_or_bathing", - "difficulty_doing_errands", - "difficulty_remembering_or_making_decisions", - "meets_ssi_disability_criteria", - "receives_wic", - "receives_housing_assistance", - "is_paid_hourly", - "is_union_member_or_covered", -) - -PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES: tuple[str, ...] = ( - "partnership_s_corp_income", - "interest_deduction", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", - "sstb_self_employment_income_before_lsr", - "sstb_self_employment_income", - "sstb_self_employment_income_would_be_qualified", - "sstb_w2_wages_from_qualified_business", - "sstb_unadjusted_basis_qualified_property", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "cdcc_relevant_expenses", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - 
"prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "home_mortgage_interest", - "investment_interest_expense", - "other_health_insurance_premiums", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", - "self_employment_income_would_be_qualified", -) - -PUF_SUPPORT_CLONE_SPECIAL_VARIABLES: tuple[str, ...] = ("weeks_unemployed",) - -PUF_SUPPORT_CLONE_IRS_DETAIL_COLLAPSE_VARIABLES: tuple[str, ...] = tuple( - dict.fromkeys( - ( - "taxable_interest_income", - "tax_exempt_interest_income", - "interest_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "ordinary_dividend_income", - "dividend_income", - "taxable_pension_income", - "tax_exempt_pension_income", - "taxable_private_pension_income", - "tax_exempt_private_pension_income", - "pension_income", - "taxable_unemployment_compensation", - "unemployment_compensation", - ) - + PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES - ) -) - -PUF_SUPPORT_CLONE_CPS_MEASURED_OVERLAP_VARIABLES: tuple[str, ...] = ("social_security",) -PUF_SUPPORT_CLONE_DONOR_ONLY_COLLAPSE_EXCLUDED_VARIABLES: tuple[str, ...] = ( - "employment_income_before_lsr", -) -PUF_SUPPORT_CLONE_COLLAPSE_OVERLAP_VARIABLES: tuple[str, ...] 
= tuple( - variable - for variable in dict.fromkeys( - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES - + PUF_SUPPORT_CLONE_SPECIAL_VARIABLES - + ("wage_income", "dividend_income", "capital_gains") - ) - if variable not in PUF_SUPPORT_CLONE_CPS_MEASURED_OVERLAP_VARIABLES -) - - -@lru_cache(maxsize=1) -def _default_block_geography() -> BlockGeography: - return BlockGeography() - - -def _normalize_household_county_fips_series( - county_fips: pd.Series, - state_fips: pd.Series, -) -> pd.Series: - """Normalize CPS county fragments into PE's five-digit county FIPS values.""" - county_numeric = pd.to_numeric(county_fips, errors="coerce") - state_numeric = pd.to_numeric(state_fips, errors="coerce") - combined = county_numeric.copy() - county_fragment_mask = ( - county_numeric.notna() - & county_numeric.gt(0) - & county_numeric.lt(1000) - & state_numeric.notna() - & state_numeric.gt(0) - ) - combined.loc[county_fragment_mask] = state_numeric.loc[ - county_fragment_mask - ].round().astype(int) * 1000 + county_numeric.loc[ - county_fragment_mask - ].round().astype(int) - normalized = combined.round().astype("Int64").astype("string").str.zfill(5) - invalid = combined.isna() | combined.le(0) - return normalized.mask(invalid).astype("string") - - -def _normalize_household_state_fips_series(state_fips: pd.Series) -> pd.Series: - numeric = pd.to_numeric(state_fips, errors="coerce") - normalized = numeric.round().astype("Int64").astype("string").str.zfill(2) - return normalized.mask(numeric.isna() | numeric.le(0)).astype("string") - - -def _congressional_district_geoid_from_cd_id( - cd_id: Any, - state_fips: Any, -) -> int: - try: - state = int(str(state_fips).strip()) - except (TypeError, ValueError): - return 0 - cd_text = str(cd_id).strip() - if not cd_text or cd_text.lower() in {"nan", "none", ""}: - return 0 - district_token = cd_text.split("-")[-1] - # eCPS normalizes at-large districts to 01: the raw Census codes "AL"/"ZZ" - # (at-large) and "98" (DC) map to district 0, which is then 
bumped to 1 - # (policyengine-us-data db/create_initial_strata.py). Microplex's crosswalk - # feeds the "-AL" token, but accept the raw Census forms too so the encoder - # stays faithful to the eCPS 436-CD universe regardless of input convention. - if district_token.upper() in {"AL", "ZZ"}: - district = 1 - else: - try: - district = int(district_token) - except ValueError: - return 0 - if district in (0, 98): - district = 1 - return state * 100 + district - - -def _attach_household_census_geographies( - households: pd.DataFrame, - *, - seed: int, - geography: BlockGeography | None = None, -) -> pd.DataFrame: - """Attach eCPS-contract block, tract, county, and CD geographies to households.""" - # Intermediate frames are indexed by row label and written back via .loc; - # a non-unique household-frame index makes those reindex operations ambiguous - # (ValueError: cannot reindex on an axis with duplicate labels). The caller - # consumes this result by merging on the household_id column, not the index, - # so collapsing to a fresh RangeIndex here is both safe and robust. 
- result = households.reset_index(drop=True) - for column, default in ( - ("block_geoid", ""), - ("tract_geoid", ""), - ("congressional_district_geoid", 0), - ): - if column not in result.columns: - result[column] = default - if result.empty or "state_fips" not in result.columns: - return result - - assigned_blocks = pd.Series(pd.NA, index=result.index, dtype="string") - state_values = _normalize_household_state_fips_series(result["state_fips"]) - county_values = ( - _normalize_household_county_fips_series( - result["county_fips"], result["state_fips"] - ) - if "county_fips" in result.columns - else pd.Series(pd.NA, index=result.index, dtype="string") - ) - result["county_fips"] = county_values.fillna("00000") - - try: - block_geography = geography or _default_block_geography() - block_data = block_geography.data - except FileNotFoundError: - return result - - valid_counties = set(block_data["county_fips"].dropna().astype(str)) - county_mask = county_values.isin(valid_counties) - if county_mask.any(): - county_query = GeographyQuery( - partition_columns=("county_fips",), - partition_normalizers={"county_fips": normalize_us_county_fips}, - ) - county_assigner = block_geography.load_assigner(county_query) - county_frame = pd.DataFrame( - {"county_fips": county_values.loc[county_mask]}, - index=result.index[county_mask], - ) - assigned = county_assigner.assign( - county_frame, - random_state=seed, - ) - assigned_blocks.loc[assigned.index] = assigned["block_geoid"].astype("string") - - remaining_mask = assigned_blocks.isna() - state_mask = remaining_mask & state_values.notna() - if state_mask.any(): - state_frame = pd.DataFrame( - {"state_fips": state_values.loc[state_mask]}, - index=result.index[state_mask], - ) - assigned = block_geography.assign( - state_frame, - random_state=seed + 1, - ) - assigned_blocks.loc[assigned.index] = assigned["block_geoid"].astype("string") - - assigned_mask = assigned_blocks.notna() - if not assigned_mask.any(): - return result - - 
materialized = block_geography.materialize( - pd.DataFrame( - { - "_row_index": assigned_blocks.index[assigned_mask], - "block_geoid": assigned_blocks.loc[assigned_mask].astype(str), - }, - index=result.index[assigned_mask], - ), - columns=("state_fips", "county_fips", "tract_geoid", "cd_id"), - ) - row_index = materialized["_row_index"].to_numpy() - for column in ("block_geoid", "tract_geoid", "county_fips"): - if column in materialized.columns: - result.loc[row_index, column] = materialized[column].to_numpy() - result.loc[row_index, "state_fips"] = ( - pd.to_numeric(materialized["state_fips"], errors="coerce") - .fillna(0) - .astype(int) - .to_numpy() - ) - result.loc[row_index, "congressional_district_geoid"] = [ - _congressional_district_geoid_from_cd_id(cd_id, state_fips) - for cd_id, state_fips in zip( - materialized.get("cd_id", pd.Series(index=row_index)), - materialized["state_fips"], - strict=False, - ) - ] - return result - - -def _root_logger_has_handlers() -> bool: - return bool(logging.getLogger().handlers) - - -def _format_progress_values(values: Iterable[Any], *, limit: int = 6) -> str: - rendered = [str(value) for value in values] - if len(rendered) <= limit: - return ",".join(rendered) - return ",".join(rendered[:limit]) + f",...(+{len(rendered) - limit})" - - -def _emit_us_pipeline_progress(message: str, /, **context: object) -> None: - details = ", ".join( - f"{key}={value}" - for key, value in context.items() - if value is not None and value != "" - ) - line = f"{message} [{details}]" if details else message - LOGGER.info(line) - if not LOGGER.handlers and not _root_logger_has_handlers(): - print(line, file=sys.stderr, flush=True) - - -STATE_FIPS = { - 1: "AL", - 2: "AK", - 4: "AZ", - 5: "AR", - 6: "CA", - 8: "CO", - 9: "CT", - 10: "DE", - 11: "DC", - 12: "FL", - 13: "GA", - 15: "HI", - 16: "ID", - 17: "IL", - 18: "IN", - 19: "IA", - 20: "KS", - 21: "KY", - 22: "LA", - 23: "ME", - 24: "MD", - 25: "MA", - 26: "MI", - 27: "MN", - 28: "MS", - 29: 
"MO", - 30: "MT", - 31: "NE", - 32: "NV", - 33: "NH", - 34: "NJ", - 35: "NM", - 36: "NY", - 37: "NC", - 38: "ND", - 39: "OH", - 40: "OK", - 41: "OR", - 42: "PA", - 44: "RI", - 45: "SC", - 46: "SD", - 47: "TN", - 48: "TX", - 49: "UT", - 50: "VT", - 51: "VA", - 53: "WA", - 54: "WV", - 55: "WI", - 56: "WY", -} - -AGE_BINS = [0, 18, 35, 55, 65, np.inf] - - -AGE_LABELS = ["0-17", "18-34", "35-54", "55-64", "65+"] -INCOME_BINS = [-np.inf, 25_000, 50_000, 100_000, np.inf] -INCOME_LABELS = ["<25k", "25-50k", "50-100k", "100k+"] -ENTITY_ID_COLUMNS = { - EntityType.PERSON: "person_id", - EntityType.HOUSEHOLD: "household_id", - EntityType.TAX_UNIT: "tax_unit_id", - EntityType.SPM_UNIT: "spm_unit_id", - EntityType.FAMILY: "family_id", -} -TINY_WEIGHT_THRESHOLD = 1e-8 -DEFAULT_POLICYENGINE_CALIBRATION_MAX_CONSTRAINTS_PER_HOUSEHOLD = 1.0 -DEFAULT_POLICYENGINE_CALIBRATION_MIN_ACTIVE_HOUSEHOLDS = 5 -CALIBRATION_FEASIBILITY_DROP_WARNING_THRESHOLD = 0.2 -STATE_PROGRAM_SUPPORT_PROXY_VARIABLES = ( - "has_medicaid", - "public_assistance", - "ssi", - "social_security", -) -STATE_PROGRAM_AUTO_CONDITION_VARIABLES = ("has_medicaid",) - - -def _summarize_weight_diagnostics( - weights: pd.Series | np.ndarray | list[float], - *, - tiny_threshold: float = TINY_WEIGHT_THRESHOLD, -) -> dict[str, Any]: - """Summarize whether a calibrated weight vector looks numerically healthy.""" - series = ( - pd.to_numeric(pd.Series(weights), errors="coerce").fillna(0.0).astype(float) - ) - row_count = int(len(series)) - if row_count == 0: - return { - "row_count": 0, - "positive_count": 0, - "nonpositive_count": 0, - "tiny_count": 0, - "tiny_share": 0.0, - "total_weight": 0.0, - "min_weight": 0.0, - "p01_weight": 0.0, - "p50_weight": 0.0, - "p99_weight": 0.0, - "max_weight": 0.0, - "effective_sample_size": 0.0, - "collapse_suspected": True, - } - - total_weight = float(series.sum()) - squared_weight_sum = float(np.square(series).sum()) - positive_count = int((series > 0.0).sum()) - nonpositive_count = 
row_count - positive_count - tiny_count = int((series <= tiny_threshold).sum()) - tiny_share = float(tiny_count / row_count) - effective_sample_size = ( - float((total_weight * total_weight) / squared_weight_sum) - if squared_weight_sum > 0.0 - else 0.0 - ) - effective_sample_ratio = ( - float(effective_sample_size / positive_count) if positive_count > 0 else 0.0 - ) - collapse_suspected = bool( - total_weight <= tiny_threshold - or positive_count == 0 - or tiny_share >= 0.95 - or effective_sample_ratio <= 0.25 - ) - return { - "row_count": row_count, - "positive_count": positive_count, - "nonpositive_count": nonpositive_count, - "tiny_count": tiny_count, - "tiny_share": tiny_share, - "total_weight": total_weight, - "min_weight": float(series.min()), - "p01_weight": float(series.quantile(0.01)), - "p50_weight": float(series.quantile(0.5)), - "p99_weight": float(series.quantile(0.99)), - "max_weight": float(series.max()), - "effective_sample_size": effective_sample_size, - "effective_sample_ratio": effective_sample_ratio, - "collapse_suspected": collapse_suspected, - } - - -def _state_program_support_proxy_summary( - available_columns: set[str], -) -> dict[str, list[str]]: - available = sorted( - variable - for variable in STATE_PROGRAM_SUPPORT_PROXY_VARIABLES - if variable in available_columns - ) - missing = sorted( - variable - for variable in STATE_PROGRAM_SUPPORT_PROXY_VARIABLES - if variable not in available_columns - ) - return { - "available": available, - "missing": missing, - } - - -def _subset_policyengine_linear_constraints( - constraints: tuple[LinearConstraint, ...] 
| list[LinearConstraint], - household_mask: np.ndarray, -) -> tuple[LinearConstraint, ...]: - mask = np.asarray(household_mask, dtype=bool) - subset: list[LinearConstraint] = [] - for constraint in constraints: - coefficients = np.asarray(constraint.coefficients, dtype=float) - if len(coefficients) != len(mask): - raise ValueError( - "PolicyEngine linear constraint coefficients do not match household mask length" - ) - subset.append( - LinearConstraint( - name=constraint.name, - coefficients=coefficients[mask], - target=float(constraint.target), - ) - ) - return tuple(subset) - - -def _normalize_policyengine_constraints_for_microcalibrate( - constraints: tuple[LinearConstraint, ...] | list[LinearConstraint], -) -> tuple[tuple[LinearConstraint, ...], dict[str, Any]]: - """Make signed equality targets safe for microcalibrate's relative-error loss. - - ``microcalibrate`` normalizes its objective by ``target + 1`` and is - therefore numerically unsafe for large negative targets. Multiplying - a linear equality by ``-1`` preserves the exact feasible set, so - negative-target constraints are passed to the backend with positive - targets and flipped coefficients. 
- """ - - normalized: list[LinearConstraint] = [] - sign_flipped_names: list[str] = [] - for constraint in constraints: - target = float(constraint.target) - coefficients = np.asarray(constraint.coefficients, dtype=float) - if target < 0.0: - normalized.append( - LinearConstraint( - name=constraint.name, - coefficients=-coefficients, - target=-target, - ) - ) - sign_flipped_names.append(constraint.name) - else: - normalized.append( - LinearConstraint( - name=constraint.name, - coefficients=coefficients, - target=target, - ) - ) - max_names_to_record = 100 - summary = { - "sign_flipped_constraint_count": len(sign_flipped_names), - "sign_flipped_constraint_names": sign_flipped_names[:max_names_to_record], - "sign_flipped_constraint_names_truncated": ( - len(sign_flipped_names) > max_names_to_record - ), - } - return tuple(normalized), summary - - -def _policyengine_target_geo_priority(target: TargetSpec) -> int: - geo_level = str(target.metadata.get("geo_level", "")).lower() - return { - "national": 0, - "state": 1, - "district": 2, - }.get(geo_level, 99) - - -def _constraint_active_household_count( - constraint: Any, - *, - epsilon: float = 1e-12, - metadata_lookup: dict[str, dict[str, Any]] | None = None, -) -> int: - """Count households with nonzero coefficient. 
Uses ``metadata_lookup`` when provided.""" - if metadata_lookup is not None: - cached = metadata_lookup.get(getattr(constraint, "name", None)) - if cached is not None and "active_households" in cached: - return int(cached["active_households"]) - coefficients = np.asarray(getattr(constraint, "coefficients", ()), dtype=float) - if coefficients.size == 0: - return 0 - return int(np.count_nonzero(np.abs(coefficients) > epsilon)) - - -def _precompute_constraint_metadata( - constraints: tuple[Any, ...], - *, - epsilon: float = 1e-12, -) -> dict[str, dict[str, Any]]: - """Per-constraint {active_households, coefficient_mass} scalar metadata.""" - metadata: dict[str, dict[str, Any]] = {} - for constraint in constraints: - name = getattr(constraint, "name", None) - if name is None: - continue - coefficients = np.asarray(getattr(constraint, "coefficients", ()), dtype=float) - if coefficients.size == 0: - metadata[name] = { - "active_households": 0, - "coefficient_mass": 0.0, - } - continue - metadata[name] = { - "active_households": int(np.count_nonzero(np.abs(coefficients) > epsilon)), - "coefficient_mass": float(np.abs(coefficients).sum()), - } - return metadata - - -def _strip_constraint_coefficients( - constraints: tuple[Any, ...], -) -> tuple[LinearConstraint, ...]: - """Replace each constraint's coefficient array with a zero-length sentinel.""" - return tuple( - LinearConstraint( - name=c.name, coefficients=np.zeros(0, dtype=float), target=float(c.target) - ) - for c in constraints - ) - - -def _build_policyengine_constraint_records( - targets: list[TargetSpec], - constraints: tuple[Any, ...], - *, - metadata_lookup: dict[str, dict[str, Any]] | None = None, -) -> list[dict[str, Any]]: - records: list[dict[str, Any]] = [] - for target, constraint in zip(targets, constraints, strict=True): - aggregation_name = str( - getattr(getattr(target, "aggregation", None), "name", target.aggregation) - ).upper() - name = getattr(constraint, "name", None) - cached = ( - 
metadata_lookup.get(name) - if metadata_lookup is not None and name is not None - else None - ) - if cached is not None and "coefficient_mass" in cached: - coefficient_mass = float(cached["coefficient_mass"]) - else: - coefficient_mass = float( - np.abs( - np.asarray(getattr(constraint, "coefficients", ()), dtype=float) - ).sum() - ) - records.append( - { - "target": target, - "constraint": constraint, - "active_households": _constraint_active_household_count( - constraint, metadata_lookup=metadata_lookup - ), - "geo_priority": _policyengine_target_geo_priority(target), - "aggregation_priority": 0 if aggregation_name == "COUNT" else 1, - "coefficient_mass": coefficient_mass, - } - ) - return records - - -def _policyengine_target_has_entity_table( - target: TargetSpec, - tables: PolicyEngineUSEntityTableBundle, -) -> bool: - return { - EntityType.HOUSEHOLD: tables.households, - EntityType.PERSON: tables.persons, - EntityType.TAX_UNIT: tables.tax_units, - EntityType.SPM_UNIT: tables.spm_units, - EntityType.FAMILY: tables.families, - }.get(target.entity) is not None - - -def _policyengine_target_variable_name(target: TargetSpec) -> str: - metadata = dict(target.metadata or {}) - variable = metadata.get("variable") - if variable is not None: - return str(variable) - if target.measure is not None: - return str(target.measure) - aggregation_name = str( - getattr(getattr(target, "aggregation", None), "name", target.aggregation) - ).upper() - if aggregation_name == "COUNT": - entity_value = ( - target.entity.value - if isinstance(target.entity, EntityType) - else str(target.entity) - ) - return f"{entity_value}_count" - return "unknown" - - -def _policyengine_target_family_key(target: TargetSpec) -> str: - metadata = dict(target.metadata or {}) - geo_level = str(metadata.get("geo_level") or "unspecified") - domain_variable = str(metadata.get("domain_variable") or "") - variable = _policyengine_target_variable_name(target) - parts = [geo_level, variable] - if 
domain_variable: - parts.append(f"domain={domain_variable}") - return "|".join(parts) - - -def _policyengine_target_loss_family_key(entry: dict[str, Any]) -> str: - variable = str(entry.get("variable") or "unknown") - domain_variable = str(entry.get("domain_variable") or "") - if domain_variable: - return f"{variable}|domain={domain_variable}" - return variable - - -def _policyengine_target_loss_geography_key(entry: dict[str, Any]) -> str: - geo_level = str(entry.get("geo_level") or "unspecified") - geographic_id = entry.get("geographic_id") - if geographic_id is None or str(geographic_id) == "": - return geo_level - geographic_key = str(geographic_id).strip() - if geo_level == "national": - return f"{geo_level}:US" - if geo_level == "state": - try: - state_fips = int(geographic_key) - except (TypeError, ValueError): - geographic_key = geographic_key.upper() - else: - geographic_key = STATE_FIPS.get(state_fips, f"{state_fips:02d}") - return f"{geo_level}:{geographic_key}" - - -def _select_ssi_takeup_by_age_amount( - *, - person_ids: pd.Series, - ages: pd.Series, - weights: pd.Series, - reported_ssi: pd.Series, - full_takeup_ssi: pd.Series, -) -> tuple[np.ndarray, dict[str, Any]]: - """Select SSI takeup records to match reported SSI dollars by age group.""" - index = person_ids.index - person_ids = person_ids.reindex(index) - age_values = pd.to_numeric(ages.reindex(index), errors="coerce").fillna(0.0) - weight_values = ( - pd.to_numeric(weights.reindex(index), errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - reported_values = ( - pd.to_numeric(reported_ssi.reindex(index), errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - full_values = ( - pd.to_numeric(full_takeup_ssi.reindex(index), errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - - reported_amount = (reported_values * weight_values).to_numpy(dtype=float) - full_amount = (full_values * weight_values).to_numpy(dtype=float) - reported_positive = reported_values.to_numpy(dtype=float) > 0.0 - 
formula_positive = full_values.to_numpy(dtype=float) > 0.0 - age_array = age_values.to_numpy(dtype=float) - selected = np.zeros(len(index), dtype=bool) - stable_rank = pd.util.hash_pandas_object( - person_ids.astype("string"), - index=False, - ).to_numpy(dtype=np.uint64) - - def _select_until_amount(candidate_mask: np.ndarray, amount: float) -> None: - if amount <= 0.0: - return - candidate_index = np.flatnonzero(candidate_mask & ~selected) - if candidate_index.size == 0: - return - ordered = candidate_index[ - np.argsort(stable_rank[candidate_index], kind="stable") - ] - cumulative = np.cumsum(full_amount[ordered]) - cutoff = int(np.searchsorted(cumulative, amount, side="left")) - selected[ordered[: min(cutoff + 1, len(ordered))]] = True - - groups = { - "aged": age_array >= 65, - "under65": age_array < 65, - } - group_summary: dict[str, Any] = {} - for group_name, group_mask in groups.items(): - source_candidates = group_mask & reported_positive & formula_positive - other_candidates = group_mask & ~reported_positive & formula_positive - target_amount = float(reported_amount[group_mask].sum()) - _select_until_amount(source_candidates, target_amount) - selected_amount = float(full_amount[selected & group_mask].sum()) - _select_until_amount(other_candidates, target_amount - selected_amount) - selected_amount = float(full_amount[selected & group_mask].sum()) - group_summary[group_name] = { - "reported_amount": target_amount, - "reported_recipients": float( - weight_values.to_numpy(dtype=float)[ - group_mask & reported_positive - ].sum() - ), - "formula_all_takeup_amount": float(full_amount[group_mask].sum()), - "formula_all_takeup_recipients": float( - weight_values.to_numpy(dtype=float)[group_mask & formula_positive].sum() - ), - "selected_amount": selected_amount, - "selected_recipients": float( - weight_values.to_numpy(dtype=float)[group_mask & selected].sum() - ), - "source_candidate_amount": float(full_amount[source_candidates].sum()), - 
"other_candidate_amount": float(full_amount[other_candidates].sum()), - } - - weight_array = weight_values.to_numpy(dtype=float) - summary = { - "enabled": True, - "method": "reported_ssi_amount_by_age_group", - "reported_amount": float(reported_amount.sum()), - "reported_recipients": float(weight_array[reported_positive].sum()), - "formula_all_takeup_amount": float(full_amount.sum()), - "formula_all_takeup_recipients": float(weight_array[formula_positive].sum()), - "selected_amount": float(full_amount[selected].sum()), - "selected_recipients": float(weight_array[selected].sum()), - "groups": group_summary, - } - return selected, summary - - -def _policyengine_target_ledger_entry( - *, - target: TargetSpec, - stage: str, - reason: str, - household_count: int, - active_households: int | None = None, - min_active_households: int | None = None, - missing_features: Iterable[str] = (), - failed_materializations: Iterable[str] = (), -) -> dict[str, Any]: - metadata = dict(target.metadata or {}) - required_features = sorted(str(feature) for feature in target.required_features) - entity_value = ( - target.entity.value - if isinstance(target.entity, EntityType) - else str(target.entity) - ) - aggregation_value = getattr(target.aggregation, "value", str(target.aggregation)) - active_support_share = None - if active_households is not None and household_count > 0: - active_support_share = float(active_households / household_count) - return { - "target_name": target.name, - "target_id": metadata.get("target_id"), - "stratum_id": metadata.get("stratum_id"), - "stage": stage, - "reason": reason, - "family": _policyengine_target_family_key(target), - "entity": entity_value, - "aggregation": aggregation_value, - "measure": target.measure, - "value": float(target.value), - "geo_level": metadata.get("geo_level"), - "geographic_id": metadata.get("geographic_id"), - "variable": _policyengine_target_variable_name(target), - "domain_variable": metadata.get("domain_variable"), - 
"filters": [ - { - "feature": target_filter.feature, - "operator": target_filter.operator, - "value": target_filter.value, - } - for target_filter in target.filters - ], - "required_features": required_features, - "missing_features": sorted(str(feature) for feature in missing_features), - "failed_materializations": sorted( - str(feature) for feature in failed_materializations - ), - "active_households": active_households, - "active_support_share": active_support_share, - "min_active_households": min_active_households, - "source": target.source, - "description": target.description, - } - - -def _summarize_policyengine_target_ledger( - ledger: list[dict[str, Any]], - *, - compiled_target_count: int, - preselection_target_count: int, - final_solve_target_count: int, -) -> dict[str, Any]: - stage_order = ("solve_now", "solve_later", "audit_only") - stage_counts = Counter(entry["stage"] for entry in ledger) - reason_counts = Counter(entry["reason"] for entry in ledger) - stage_reason_counts: dict[str, Counter[str]] = { - stage: Counter() for stage in stage_order - } - family_stage_counts: dict[str, Counter[str]] = {} - geo_level_stage_counts: dict[str, Counter[str]] = {} - for entry in ledger: - stage = str(entry["stage"]) - stage_reason_counts.setdefault(stage, Counter())[str(entry["reason"])] += 1 - family = str(entry["family"]) - family_stage_counts.setdefault(family, Counter())[stage] += 1 - geo_level = str(entry.get("geo_level") or "unspecified") - geo_level_stage_counts.setdefault(geo_level, Counter())[stage] += 1 - return { - "n_targets": len(ledger), - "n_compile_ready_targets": int(compiled_target_count), - "n_selected_after_feasibility": int(preselection_target_count), - "n_selected_for_current_solve": int(final_solve_target_count), - "stage_counts": { - stage: int(stage_counts.get(stage, 0)) for stage in stage_order - }, - "reason_counts": { - reason: int(count) for reason, count in sorted(reason_counts.items()) - }, - "stage_reason_counts": { - stage: { - 
reason: int(count) - for reason, count in sorted( - stage_reason_counts.get(stage, Counter()).items() - ) - } - for stage in stage_order - }, - "geo_level_stage_counts": { - geo_level: {stage: int(count) for stage, count in sorted(counter.items())} - for geo_level, counter in sorted(geo_level_stage_counts.items()) - }, - "family_stage_counts": { - family: {stage: int(count) for stage, count in sorted(counter.items())} - for family, counter in sorted(family_stage_counts.items()) - }, - } - - -def _build_policyengine_calibration_target_ledger( - *, - canonical_targets: list[TargetSpec], - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - compiled_targets: list[TargetSpec], - structurally_unsupported_targets: list[TargetSpec], - compiled_constraints: tuple[Any, ...], - preselection_targets: list[TargetSpec], - selected_stage_by_name: dict[str, int], - household_count: int, - min_active_households: int, - materialization_failures: dict[str, str], - compiled_constraint_metadata: dict[str, dict[str, Any]] | None = None, -) -> tuple[dict[str, Any], list[dict[str, Any]]]: - min_required_households = max(1, int(min_active_households)) - structurally_unsupported_names = { - target.name for target in structurally_unsupported_targets - } - preselection_names = {target.name for target in preselection_targets} - final_solve_names = set(selected_stage_by_name) - - ledger: list[dict[str, Any]] = [] - classified_names: set[str] = set() - for target in canonical_targets: - missing_features = sorted( - str(feature) - for feature in target.required_features - if feature not in bindings - ) - has_entity_table = _policyengine_target_has_entity_table(target, tables) - if not has_entity_table: - ledger.append( - _policyengine_target_ledger_entry( - target=target, - stage="audit_only", - reason="missing_entity_table", - household_count=household_count, - missing_features=missing_features, - ) - ) - classified_names.add(target.name) - continue 
- if missing_features: - failed_materializations = [ - feature - for feature in missing_features - if feature in materialization_failures - ] - ledger.append( - _policyengine_target_ledger_entry( - target=target, - stage="audit_only", - reason=( - "materialization_failure" - if failed_materializations - else "missing_required_features" - ), - household_count=household_count, - missing_features=missing_features, - failed_materializations=failed_materializations, - ) - ) - classified_names.add(target.name) - continue - if target.name in structurally_unsupported_names: - ledger.append( - _policyengine_target_ledger_entry( - target=target, - stage="audit_only", - reason="unsupported_structure", - household_count=household_count, - ) - ) - classified_names.add(target.name) - - for record in _build_policyengine_constraint_records( - compiled_targets, - compiled_constraints, - metadata_lookup=compiled_constraint_metadata, - ): - target = record["target"] - classified_names.add(target.name) - active_households = int(record["active_households"]) - if target.name in final_solve_names: - stage = "solve_now" - reason = f"selected_stage_{int(selected_stage_by_name[target.name])}" - elif target.name in preselection_names: - stage = "solve_later" - reason = "household_budget_selection" - elif active_households < min_required_households: - stage = "solve_later" - reason = "low_household_support" - else: - stage = "solve_later" - reason = "constraint_capacity" - ledger.append( - _policyengine_target_ledger_entry( - target=target, - stage=stage, - reason=reason, - household_count=household_count, - active_households=active_households, - min_active_households=min_required_households, - ) - ) - - for target in canonical_targets: - if target.name in classified_names: - continue - ledger.append( - _policyengine_target_ledger_entry( - target=target, - stage="audit_only", - reason="unclassified", - household_count=household_count, - ) - ) - - stage_rank = {"solve_now": 0, "solve_later": 
1, "audit_only": 2} - ledger.sort( - key=lambda entry: ( - stage_rank.get(str(entry["stage"]), 99), - str(entry["reason"]), - str(entry["family"]), - str(entry["target_name"]), - ) - ) - return ( - _summarize_policyengine_target_ledger( - ledger, - compiled_target_count=len(compiled_targets), - preselection_target_count=len(preselection_targets), - final_solve_target_count=len(final_solve_names), - ), - ledger, - ) - - -def _ranked_policyengine_group_focus_keys( - ranking: list[dict[str, Any]] | tuple[dict[str, Any], ...] | None, - *, - limit: int | None, -) -> list[str]: - if not ranking: - return [] - if limit is not None and limit <= 0: - return [] - selected: list[str] = [] - for row in ranking: - score = float(row.get("capped_sum_abs_relative_error") or 0.0) - if score <= 0.0: - continue - selected.append(str(row["group"])) - if limit is not None and len(selected) >= limit: - break - return selected - - -def _select_policyengine_deferred_stage_constraints( - *, - compiled_targets: list[TargetSpec], - compiled_constraints: tuple[LinearConstraint, ...], - target_ledger: list[dict[str, Any]], - deferred_oracle_loss: dict[str, Any], - deferred_target_priority_lookup: dict[str, float] | None, - selected_target_names: set[str], - household_count: int, - min_active_households: int, - max_constraints: int | None, - max_constraints_per_household: float | None, - top_family_count: int | None, - top_geography_count: int | None, - compiled_constraint_metadata: dict[str, dict[str, Any]] | None = None, -) -> tuple[list[TargetSpec], tuple[LinearConstraint, ...], dict[str, Any]]: - ledger_by_name = { - str(entry["target_name"]): entry - for entry in target_ledger - if entry.get("target_name") is not None - } - family_focus = _ranked_policyengine_group_focus_keys( - deferred_oracle_loss.get("family_ranking"), - limit=top_family_count, - ) - geography_focus = _ranked_policyengine_group_focus_keys( - deferred_oracle_loss.get("geography_ranking"), - limit=top_geography_count, - ) 
- family_focus_set = set(family_focus) - geography_focus_set = set(geography_focus) - family_scores = { - str(row["group"]): float(row.get("capped_loss_share") or 0.0) - for row in deferred_oracle_loss.get("family_ranking", ()) - } - geography_scores = { - str(row["group"]): float(row.get("capped_loss_share") or 0.0) - for row in deferred_oracle_loss.get("geography_ranking", ()) - } - - candidate_targets: list[TargetSpec] = [] - candidate_constraints: list[LinearConstraint] = [] - priority_scores: dict[str, float] = {} - focus_eligible_count = 0 - min_required_households = max(1, int(min_active_households)) - - for record in _build_policyengine_constraint_records( - compiled_targets, - compiled_constraints, - metadata_lookup=compiled_constraint_metadata, - ): - target = record["target"] - if target.name in selected_target_names: - continue - ledger_entry = ledger_by_name.get(target.name) - if ledger_entry is None or ledger_entry.get("stage") != "solve_later": - continue - if int(record["active_households"]) < min_required_households: - continue - family_key = _policyengine_target_loss_family_key(ledger_entry) - geography_key = _policyengine_target_loss_geography_key(ledger_entry) - if family_focus_set or geography_focus_set: - if ( - family_key not in family_focus_set - and geography_key not in geography_focus_set - ): - continue - focus_eligible_count += 1 - candidate_targets.append(target) - candidate_constraints.append(record["constraint"]) - target_score = ( - float(deferred_target_priority_lookup.get(target.name, 0.0)) - if deferred_target_priority_lookup is not None - else 0.0 - ) - priority_scores[target.name] = ( - target_score - + family_scores.get(family_key, 0.0) - + geography_scores.get(geography_key, 0.0) - ) - - selected_targets, selected_constraints, feasibility_summary = ( - _select_feasible_policyengine_calibration_constraints( - candidate_targets, - tuple(candidate_constraints), - household_count=household_count, - max_constraints=max_constraints, 
- max_constraints_per_household=max_constraints_per_household, - min_active_households=min_required_households, - priority_scores=priority_scores, - ) - ) - return ( - selected_targets, - selected_constraints, - { - "min_active_households": min_required_households, - "top_family_count": top_family_count, - "top_geography_count": top_geography_count, - "focused_families": family_focus, - "focused_geographies": geography_focus, - "n_focus_eligible_constraints": focus_eligible_count, - "target_error_priority_available": deferred_target_priority_lookup - is not None, - "feasibility_filter": feasibility_summary, - }, - ) - - -def _policyengine_unsupported_target_error_penalty( - *, - relative_error_cap: float | None, -) -> float: - if relative_error_cap is not None: - return float(relative_error_cap) - return 1.0 - - -def _policyengine_target_fit_loss_components( - report: Any, - *, - relative_error_cap: float | None = None, -) -> dict[str, Any]: - supported_abs_relative_errors = [ - abs(evaluation.relative_error) - for evaluation in report.evaluations - if evaluation.relative_error is not None - ] - capped_supported_abs_relative_errors = [ - min(error, float(relative_error_cap)) - if relative_error_cap is not None - else error - for error in supported_abs_relative_errors - ] - unsupported_target_count = int(len(report.unsupported_targets)) - unsupported_target_error_penalty = _policyengine_unsupported_target_error_penalty( - relative_error_cap=relative_error_cap - ) - penalized_abs_relative_errors = [ - *supported_abs_relative_errors, - *([unsupported_target_error_penalty] * unsupported_target_count), - ] - capped_penalized_abs_relative_errors = [ - *capped_supported_abs_relative_errors, - *([unsupported_target_error_penalty] * unsupported_target_count), - ] - return { - "supported_abs_relative_errors": supported_abs_relative_errors, - "capped_supported_abs_relative_errors": capped_supported_abs_relative_errors, - "penalized_abs_relative_errors": 
penalized_abs_relative_errors, - "capped_penalized_abs_relative_errors": capped_penalized_abs_relative_errors, - "unsupported_target_count": unsupported_target_count, - "unsupported_target_error_penalty": unsupported_target_error_penalty, - } - - -def _summarize_policyengine_target_fit_report( - report: Any, - *, - target_count: int, - relative_error_cap: float | None = None, -) -> dict[str, Any]: - supported_target_count = int(report.supported_target_count) - unsupported_target_count = int(len(report.unsupported_targets)) - supported_target_rate = None - if target_count > 0: - supported_target_rate = float(supported_target_count / target_count) - loss_components = _policyengine_target_fit_loss_components( - report, - relative_error_cap=relative_error_cap, - ) - supported_only_mean_abs_relative_error = report.mean_abs_relative_error - supported_only_max_abs_relative_error = report.max_abs_relative_error - supported_only_capped_mean_abs_relative_error = ( - float( - sum(loss_components["capped_supported_abs_relative_errors"]) - / len(loss_components["capped_supported_abs_relative_errors"]) - ) - if loss_components["capped_supported_abs_relative_errors"] - else None - ) - penalized_abs_relative_errors = loss_components["penalized_abs_relative_errors"] - capped_penalized_abs_relative_errors = loss_components[ - "capped_penalized_abs_relative_errors" - ] - mean_abs_relative_error = ( - float(sum(penalized_abs_relative_errors) / target_count) - if target_count > 0 and penalized_abs_relative_errors - else None - ) - max_abs_relative_error = None - if target_count > 0: - max_candidates = [] - if supported_only_max_abs_relative_error is not None: - max_candidates.append(float(supported_only_max_abs_relative_error)) - if unsupported_target_count > 0: - max_candidates.append(loss_components["unsupported_target_error_penalty"]) - if max_candidates: - max_abs_relative_error = max(max_candidates) - capped_mean_abs_relative_error = ( - 
float(sum(capped_penalized_abs_relative_errors) / target_count) - if target_count > 0 and capped_penalized_abs_relative_errors - else None - ) - return { - "target_count": int(target_count), - "supported_target_count": supported_target_count, - "unsupported_target_count": unsupported_target_count, - "supported_target_rate": supported_target_rate, - "mean_abs_relative_error": ( - float(mean_abs_relative_error) - if mean_abs_relative_error is not None - else None - ), - "supported_only_mean_abs_relative_error": ( - float(supported_only_mean_abs_relative_error) - if supported_only_mean_abs_relative_error is not None - else None - ), - "max_abs_relative_error": ( - float(max_abs_relative_error) - if max_abs_relative_error is not None - else None - ), - "supported_only_max_abs_relative_error": ( - float(supported_only_max_abs_relative_error) - if supported_only_max_abs_relative_error is not None - else None - ), - "relative_error_cap": ( - float(relative_error_cap) if relative_error_cap is not None else None - ), - "unsupported_target_error_penalty": ( - loss_components["unsupported_target_error_penalty"] - if unsupported_target_count > 0 - else None - ), - "capped_mean_abs_relative_error": capped_mean_abs_relative_error, - "supported_only_capped_mean_abs_relative_error": ( - supported_only_capped_mean_abs_relative_error - ), - } - - -def _summarize_policyengine_target_fit_group_reports( - report: Any, - *, - targets_by_group: dict[str, list[TargetSpec]], - relative_error_cap: float | None = None, -) -> tuple[dict[str, dict[str, Any]], list[dict[str, Any]]]: - total_loss_components = _policyengine_target_fit_loss_components( - report, - relative_error_cap=relative_error_cap, - ) - total_abs_relative_error = float( - sum(total_loss_components["penalized_abs_relative_errors"]) - ) - total_capped_abs_relative_error = float( - sum(total_loss_components["capped_penalized_abs_relative_errors"]) - ) - grouped: list[tuple[str, dict[str, Any]]] = [] - for group_key, 
group_targets in targets_by_group.items(): - group_report = slice_policyengine_us_target_evaluation_report( - report, - group_targets, - ) - group_loss_components = _policyengine_target_fit_loss_components( - group_report, - relative_error_cap=relative_error_cap, - ) - sum_abs_relative_error = float( - sum(group_loss_components["penalized_abs_relative_errors"]) - ) - capped_sum_abs_relative_error = float( - sum(group_loss_components["capped_penalized_abs_relative_errors"]) - ) - summary = _summarize_policyengine_target_fit_report( - group_report, - target_count=len(group_targets), - relative_error_cap=relative_error_cap, - ) - summary["sum_abs_relative_error"] = sum_abs_relative_error - summary["loss_share"] = ( - float(sum_abs_relative_error / total_abs_relative_error) - if total_abs_relative_error > 0.0 - else None - ) - summary["capped_sum_abs_relative_error"] = capped_sum_abs_relative_error - summary["capped_loss_share"] = ( - float(capped_sum_abs_relative_error / total_capped_abs_relative_error) - if total_capped_abs_relative_error > 0.0 - else None - ) - grouped.append((group_key, summary)) - - grouped.sort( - key=lambda item: ( - -item[1]["capped_sum_abs_relative_error"], - -item[1]["sum_abs_relative_error"], - -item[1]["target_count"], - item[0], - ) - ) - return ( - {group_key: summary for group_key, summary in grouped}, - [ - { - "group": group_key, - **summary, - } - for group_key, summary in grouped - ], - ) - - -def _summarize_policyengine_target_fit_report_with_groups( - report: Any, - *, - targets: list[TargetSpec], - ledger_by_name: dict[str, dict[str, Any]], - relative_error_cap: float | None = None, -) -> dict[str, Any]: - summary = _summarize_policyengine_target_fit_report( - report, - target_count=len(targets), - relative_error_cap=relative_error_cap, - ) - family_targets: dict[str, list[TargetSpec]] = {} - geography_targets: dict[str, list[TargetSpec]] = {} - for target in targets: - ledger_entry = ledger_by_name.get(target.name) - if 
ledger_entry is None: - continue - family_targets.setdefault( - _policyengine_target_loss_family_key(ledger_entry), - [], - ).append(target) - geography_targets.setdefault( - _policyengine_target_loss_geography_key(ledger_entry), - [], - ).append(target) - ( - summary["family_summaries"], - summary["family_ranking"], - ) = _summarize_policyengine_target_fit_group_reports( - report, - targets_by_group=family_targets, - relative_error_cap=relative_error_cap, - ) - ( - summary["geography_summaries"], - summary["geography_ranking"], - ) = _summarize_policyengine_target_fit_group_reports( - report, - targets_by_group=geography_targets, - relative_error_cap=relative_error_cap, - ) - return summary - - -def _evaluate_policyengine_target_fit_summaries( - *, - tables: PolicyEngineUSEntityTableBundle, - canonical_targets: list[TargetSpec], - final_solve_targets: list[TargetSpec], - target_ledger: list[dict[str, Any]], - period: int | str, - dataset_year: int | None, - simulation_cls: Any | None, - direct_override_variables: tuple[str, ...] 
= (), - relative_error_cap: float | None = None, -) -> dict[str, dict[str, Any]]: - summaries, _ = _evaluate_policyengine_target_fit_context( - tables=tables, - canonical_targets=canonical_targets, - final_solve_targets=final_solve_targets, - target_ledger=target_ledger, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - direct_override_variables=direct_override_variables, - relative_error_cap=relative_error_cap, - ) - return summaries - - -def _policyengine_target_fit_priority_lookup( - report: Any, - *, - relative_error_cap: float | None = None, -) -> dict[str, float]: - target_scores: dict[str, float] = {} - for evaluation in report.evaluations: - abs_relative_error = abs(float(evaluation.relative_error)) - capped_abs_relative_error = ( - min(abs_relative_error, float(relative_error_cap)) - if relative_error_cap is not None - else abs_relative_error - ) - target_scores[evaluation.target.name] = float(capped_abs_relative_error) - unsupported_target_error_penalty = _policyengine_unsupported_target_error_penalty( - relative_error_cap=relative_error_cap - ) - for target in report.unsupported_targets: - target_scores[target.name] = float(unsupported_target_error_penalty) - return target_scores - - -def _evaluate_policyengine_target_fit_context( - *, - tables: PolicyEngineUSEntityTableBundle, - canonical_targets: list[TargetSpec], - final_solve_targets: list[TargetSpec], - target_ledger: list[dict[str, Any]], - period: int | str, - dataset_year: int | None, - simulation_cls: Any | None, - direct_override_variables: tuple[str, ...] 
= (), - relative_error_cap: float | None = None, -) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, float]]]: - target_by_name = {target.name: target for target in canonical_targets} - ledger_by_name = { - str(entry["target_name"]): entry - for entry in target_ledger - if entry.get("target_name") - } - deferred_targets = [ - target_by_name[entry["target_name"]] - for entry in target_ledger - if entry["stage"] == "solve_later" and entry["target_name"] in target_by_name - ] - audit_only_targets = [ - target_by_name[entry["target_name"]] - for entry in target_ledger - if entry["stage"] == "audit_only" and entry["target_name"] in target_by_name - ] - full_report = evaluate_policyengine_us_target_set( - tables, - canonical_targets, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label="policyengine_db_calibration", - direct_override_variables=direct_override_variables, - ) - active_solve_report = slice_policyengine_us_target_evaluation_report( - full_report, - final_solve_targets, - ) - deferred_report = slice_policyengine_us_target_evaluation_report( - full_report, - deferred_targets, - ) - audit_only_report = slice_policyengine_us_target_evaluation_report( - full_report, - audit_only_targets, - ) - summaries = { - "full_oracle": _summarize_policyengine_target_fit_report_with_groups( - full_report, - targets=canonical_targets, - ledger_by_name=ledger_by_name, - relative_error_cap=relative_error_cap, - ), - "active_solve": _summarize_policyengine_target_fit_report_with_groups( - active_solve_report, - targets=final_solve_targets, - ledger_by_name=ledger_by_name, - relative_error_cap=relative_error_cap, - ), - "deferred": _summarize_policyengine_target_fit_report_with_groups( - deferred_report, - targets=deferred_targets, - ledger_by_name=ledger_by_name, - relative_error_cap=relative_error_cap, - ), - "audit_only": _summarize_policyengine_target_fit_report_with_groups( - audit_only_report, - targets=audit_only_targets, - 
ledger_by_name=ledger_by_name, - relative_error_cap=relative_error_cap, - ), - } - return summaries, { - "full_oracle": _policyengine_target_fit_priority_lookup( - full_report, - relative_error_cap=relative_error_cap, - ), - "active_solve": _policyengine_target_fit_priority_lookup( - active_solve_report, - relative_error_cap=relative_error_cap, - ), - "deferred": _policyengine_target_fit_priority_lookup( - deferred_report, - relative_error_cap=relative_error_cap, - ), - "audit_only": _policyengine_target_fit_priority_lookup( - audit_only_report, - relative_error_cap=relative_error_cap, - ), - } - - -def _select_feasible_policyengine_calibration_constraints( - targets: list[TargetSpec], - constraints: tuple[Any, ...], - *, - household_count: int, - max_constraints: int | None, - max_constraints_per_household: float | None, - min_active_households: int, - priority_scores: dict[str, float] | None = None, -) -> tuple[list[TargetSpec], tuple[Any, ...], dict[str, Any]]: - selected_targets = list(targets) - selected_constraints = tuple(constraints) - requested_max_constraints = max_constraints - if ( - requested_max_constraints is None - and max_constraints_per_household is not None - and household_count > 0 - ): - requested_max_constraints = max( - 1, - int(np.floor(max_constraints_per_household * household_count)), - ) - - records = _build_policyengine_constraint_records(targets, constraints) - - min_required_households = max(1, int(min_active_households)) - support_filtered = [ - record - for record in records - if record["active_households"] >= min_required_households - ] - low_support_dropped = len(records) - len(support_filtered) - - support_filtered.sort( - key=lambda record: ( - -float(priority_scores.get(record["target"].name, 0.0)) - if priority_scores is not None - else 0.0, - record["geo_priority"], - record["aggregation_priority"], - -record["active_households"], - -record["coefficient_mass"], - record["target"].name, - ) - ) - - over_capacity_dropped = 0 - 
if ( - requested_max_constraints is not None - and len(support_filtered) > requested_max_constraints - ): - over_capacity_dropped = len(support_filtered) - requested_max_constraints - support_filtered = support_filtered[:requested_max_constraints] - - selected_targets = [record["target"] for record in support_filtered] - selected_constraints = tuple(record["constraint"] for record in support_filtered) - dropped_total = low_support_dropped + over_capacity_dropped - drop_share = float(dropped_total / len(records)) if records else 0.0 - warning_messages: list[str] = [] - if drop_share > CALIBRATION_FEASIBILITY_DROP_WARNING_THRESHOLD: - warning_messages.append( - "Calibration feasibility filter dropped " - f"{dropped_total}/{len(records)} constraints " - f"({drop_share:.1%}) before solving." - ) - diagnostics = { - "requested_max_constraints": requested_max_constraints, - "max_constraints_per_household": max_constraints_per_household, - "min_active_households": min_required_households, - "n_constraints_before_feasibility_filter": len(constraints), - "n_constraints_after_feasibility_filter": len(selected_constraints), - "n_constraints_dropped_low_support": low_support_dropped, - "n_constraints_dropped_over_capacity": over_capacity_dropped, - "n_constraints_dropped_total": dropped_total, - "constraint_drop_share": drop_share, - "warning_messages": warning_messages, - "feasibility_filter_applied": bool( - low_support_dropped > 0 or over_capacity_dropped > 0 - ), - } - return selected_targets, selected_constraints, diagnostics - - -@dataclass(frozen=True) -class USMicroplexBuildConfig: - """Configuration for the US microplex build pipeline.""" - - n_synthetic: int = 100_000 - synthesis_backend: Literal["bootstrap", "synthesizer", "seed"] = "synthesizer" - calibration_backend: Literal[ - "entropy", - "ipf", - "chi2", - "sparse", - "hardconcrete", - "pe_l0", - "microcalibrate", - "none", - ] = "entropy" - calibration_tol: float = 1e-6 - calibration_max_iter: int = 100 - 
random_seed: int = 42 - target_sparsity: float = 0.9 - device: str = "cpu" - synthesizer_condition_vars: tuple[str, ...] = ( - "age", - "sex", - "education", - "employment_status", - "state_fips", - "tenure", - ) - synthesizer_target_vars: tuple[str, ...] = ("income",) - synthesizer_epochs: int = 100 - synthesizer_batch_size: int = 256 - synthesizer_learning_rate: float = 1e-3 - synthesizer_n_layers: int = 4 - synthesizer_hidden_dim: int = 64 - donor_imputer_epochs: int = 20 - donor_imputer_batch_size: int = 128 - donor_imputer_learning_rate: float = 1e-3 - donor_imputer_n_layers: int = 2 - donor_imputer_hidden_dim: int = 32 - donor_imputer_backend: Literal["maf", "qrf", "zi_qrf", "regime_aware"] = "maf" - donor_imputer_qrf_n_estimators: int = 100 - donor_imputer_qrf_max_train_samples: int | None = 50_000 - donor_imputer_qrf_zero_threshold: float = 0.05 - donor_imputer_condition_selection: Literal[ - "all_shared", - "top_correlated", - "pe_prespecified", - "pe_plus_puf_native_challenger", - ] = "top_correlated" - donor_imputer_max_condition_vars: int | None = 8 - donor_imputer_excluded_variables: tuple[str, ...] = ("filing_status_code",) - donor_imputer_authoritative_override_variables: tuple[str, ...] = () - puf_support_clone_enabled: bool = False - puf_support_clone_source_prefixes: tuple[str, ...] = ("irs_soi_puf",) - puf_support_clone_zero_initial_weight: bool = True - puf_support_clone_flag_column: str = PUF_SUPPORT_CLONE_FLAG_COLUMN - puf_support_clone_prior_weight_share: float = 0.05 - puf_support_clone_output_mode: Literal[ - "append", - "collapse_to_scaffold", - ] = "append" - puf_support_clone_overlap_variables: tuple[str, ...] = ( - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES - + PUF_SUPPORT_CLONE_SPECIAL_VARIABLES - + ("wage_income", "dividend_income", "capital_gains") - ) - puf_support_clone_both_halves_override_variables: tuple[str, ...] = ( - PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES - ) - puf_support_clone_collapse_irs_detail_variables: tuple[str, ...] 
= ( - PUF_SUPPORT_CLONE_IRS_DETAIL_COLLAPSE_VARIABLES - ) - puf_support_clone_collapse_overlap_variables: tuple[str, ...] = ( - PUF_SUPPORT_CLONE_COLLAPSE_OVERLAP_VARIABLES - ) - puf_support_clone_scale_tax_details_to_cps_totals: bool = False - puf_support_clone_refresh_cps_only_fields: bool = True - puf_support_clone_cps_refresh_variables: tuple[str, ...] = ( - PUF_SUPPORT_CLONE_CPS_REFRESH_VARIABLES - ) - puf_support_clone_cps_refresh_condition_variables: tuple[str, ...] = ( - PUF_SUPPORT_CLONE_CPS_REFRESH_CONDITION_VARIABLES - ) - dependent_tax_leaf_soft_cap_multiplier: float | None = None - dependent_tax_leaf_soft_cap_base_variables: tuple[str, ...] = ( - "employment_income", - "wage_income", - "self_employment_income", - ) - dependent_tax_leaf_soft_cap_variables: tuple[str, ...] = ( - "taxable_interest_income", - "tax_exempt_interest_income", - "taxable_pension_income", - "dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "partnership_s_corp_income", - "rental_income", - ) - bootstrap_strata_columns: tuple[str, ...] = () - prefer_cached_cps_asec_source: bool = False - cps_asec_source_year: int = 2023 - cps_asec_cache_dir: str | None = None - policyengine_dataset: str | None = None - policyengine_baseline_dataset: str | None = None - policyengine_dataset_year: int | None = None - policyengine_direct_override_variables: tuple[str, ...] = () - policyengine_export_column_contract_path: str | Path | None = None - """Optional eCPS export-column contract checked before calibration. - - When set, the pipeline verifies the final H5 column surface from the - post-imputation PE entity tables, then fails before microsimulation or - calibration if required columns are missing or forbidden columns would be - exported. - """ - policyengine_prefer_existing_tax_unit_ids: bool = True - policyengine_quantity_targets: tuple[PolicyEngineUSQuantityTarget, ...] 
= () - policyengine_targets_db: str | None = None - arch_targets_db: str | tuple[str, ...] | None = None - calibration_target_source: Literal["policyengine", "arch"] = "policyengine" - policyengine_target_period: int | None = None - policyengine_target_variables: tuple[str, ...] = () - policyengine_target_domains: tuple[str, ...] = () - policyengine_target_geo_levels: tuple[str, ...] = () - policyengine_target_profile: str | None = None - policyengine_calibration_target_variables: tuple[str, ...] = () - policyengine_calibration_target_domains: tuple[str, ...] = () - policyengine_calibration_target_geo_levels: tuple[str, ...] = () - policyengine_calibration_target_profile: str | None = None - policyengine_calibrate_ssi_takeup: bool = True - policyengine_calibration_rescale_to_input_weight_sum: bool = False - policyengine_calibration_rescale_to_target_total_weight: bool = False - policyengine_calibration_target_total_weight: float | None = None - policyengine_selection_backend: Literal["sparse", "pe_native_loss"] = "sparse" - policyengine_selection_household_budget: int | None = None - policyengine_selection_state_floor: int = 0 - policyengine_selection_max_iter: int = 200 - policyengine_selection_tol: float = 1e-8 - policyengine_selection_l2_penalty: float = 0.0 - policyengine_selection_target_total_weight: float | None = None - policyengine_calibration_max_constraints: int | None = None - policyengine_calibration_max_constraints_per_household: float | None = ( - DEFAULT_POLICYENGINE_CALIBRATION_MAX_CONSTRAINTS_PER_HOUSEHOLD - ) - policyengine_calibration_min_active_households: int = ( - DEFAULT_POLICYENGINE_CALIBRATION_MIN_ACTIVE_HOUSEHOLDS - ) - policyengine_calibration_deferred_stage_min_active_households: tuple[int, ...] 
= () - policyengine_calibration_deferred_stage_max_constraints: int | None = 24 - policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error: ( - float | None - ) = None - policyengine_calibration_deferred_stage_top_family_count: int | None = 8 - policyengine_calibration_deferred_stage_top_geography_count: int | None = 8 - policyengine_oracle_relative_error_cap: float | None = 10.0 - policyengine_target_reform_id: int = 0 - policyengine_simulation_cls: Any | None = None - policyengine_materialize_batch_size: int | None = None - """Batch size for PolicyEngine variable materialization. - - At 1.5M-household scale a single Microsimulation is 25–35 GB. With - a batch size of e.g. 100_000, the pipeline splits the entity tables - into chunks and runs one Microsimulation per chunk, reducing peak - memory to a few GB. ``None`` (default) keeps the legacy single-pass - behavior. Safe for per-household scalar variables (all our - calibration targets); unsafe for population-quantile-dependent - variables (see docstring on - :func:`materialize_policyengine_us_variables`). - """ - pipeline_checkpoint_save_post_imputation_path: str | Path | None = None - """Write a post-imputation pipeline checkpoint to this directory. - - Saved right after donor imputation + ``build_policyengine_entity_tables`` - and before microsim materializes calibration target variables. The - ~11 h synthesis + imputation + PE-tables build can be skipped on a - rerun that loads from this checkpoint, leaving only microsim (~30 - min) + calibration fit (~30 min) to redo. - """ - pipeline_checkpoint_save_post_microsim_path: str | Path | None = None - """Write a post-microsim pipeline checkpoint to this directory. - - Saved after ``_resolve_policyengine_calibration_targets`` has - materialized every calibration target variable onto the bundle, and - before the L0/microcalibrate fit loop. 
A rerun that loads from this - checkpoint skips microsim too, leaving only the ~30 min calibration - fit — useful for tuning calibration targets or backends. - """ - capital_gains_lots_enabled: bool = False - """Write an anchor-preserving synthetic capital-gains lot sidecar artifact.""" - capital_gains_lots_max_lots_per_person: int = 4 - capital_gains_lots_random_seed: int | None = None - forbes_fixed_spine_records_path: str | Path | None = None - """Normalized Forbes fixed-spine records to append after calibration.""" - forbes_fixed_spine_snapshot_id: str = "forbes-us-top-tail" - forbes_fixed_spine_replicates_per_unit: int = 10 - - def __post_init__(self) -> None: - if self.puf_support_clone_enabled: - if self.synthesis_backend != "seed": - raise ValueError( - "puf_support_clone_enabled requires synthesis_backend='seed' " - "until post-synthesis clone construction is implemented" - ) - if self.policyengine_selection_household_budget is not None: - raise ValueError( - "puf_support_clone_enabled cannot be combined with " - "policyengine_selection_household_budget until selector " - "clone activation is implemented" - ) - if not self.puf_support_clone_source_prefixes: - raise ValueError( - "puf_support_clone_source_prefixes must not be empty when " - "puf_support_clone_enabled is true" - ) - if not (0.0 <= self.puf_support_clone_prior_weight_share < 1.0): - raise ValueError( - "puf_support_clone_prior_weight_share must be in [0, 1)" - ) - if self.puf_support_clone_output_mode not in { - "append", - "collapse_to_scaffold", - }: - raise ValueError( - "puf_support_clone_output_mode must be 'append' or " - "'collapse_to_scaffold'" - ) - if ( - self.policyengine_calibration_rescale_to_input_weight_sum - and self.policyengine_calibration_rescale_to_target_total_weight - ): - raise ValueError( - "policyengine_calibration_rescale_to_input_weight_sum and " - "policyengine_calibration_rescale_to_target_total_weight are mutually exclusive" - ) - if ( - 
self.policyengine_calibration_rescale_to_target_total_weight - and self.policyengine_calibration_target_total_weight is None - ): - raise ValueError( - "policyengine_calibration_rescale_to_target_total_weight requires " - "policyengine_calibration_target_total_weight" - ) - if ( - self.policyengine_oracle_relative_error_cap is not None - and float(self.policyengine_oracle_relative_error_cap) <= 0.0 - ): - raise ValueError( - "policyengine_oracle_relative_error_cap must be positive when provided" - ) - if ( - self.dependent_tax_leaf_soft_cap_multiplier is not None - and float(self.dependent_tax_leaf_soft_cap_multiplier) < 0.0 - ): - raise ValueError( - "dependent_tax_leaf_soft_cap_multiplier must be non-negative when provided" - ) - if self.forbes_fixed_spine_replicates_per_unit < 1: - raise ValueError( - "forbes_fixed_spine_replicates_per_unit must be at least 1" - ) - if any( - int(value) <= 0 - for value in self.policyengine_calibration_deferred_stage_min_active_households - ): - raise ValueError( - "policyengine_calibration_deferred_stage_min_active_households must contain only positive values" - ) - if int(self.capital_gains_lots_max_lots_per_person) <= 0: - raise ValueError("capital_gains_lots_max_lots_per_person must be positive") - if ( - self.policyengine_calibration_deferred_stage_max_constraints is not None - and int(self.policyengine_calibration_deferred_stage_max_constraints) <= 0 - ): - raise ValueError( - "policyengine_calibration_deferred_stage_max_constraints must be positive when provided" - ) - if ( - self.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - is not None - and float( - self.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - ) - <= 0.0 - ): - raise ValueError( - "policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error must be positive when provided" - ) - if ( - self.policyengine_calibration_deferred_stage_top_family_count is not 
None - and int(self.policyengine_calibration_deferred_stage_top_family_count) < 0 - ): - raise ValueError( - "policyengine_calibration_deferred_stage_top_family_count must be nonnegative when provided" - ) - if ( - self.policyengine_calibration_deferred_stage_top_geography_count is not None - and int(self.policyengine_calibration_deferred_stage_top_geography_count) - < 0 - ): - raise ValueError( - "policyengine_calibration_deferred_stage_top_geography_count must be nonnegative when provided" - ) - - def to_dict(self) -> dict[str, Any]: - return _normalize_config_value(asdict(self)) - - -def _normalize_config_value(value: Any) -> Any: - if isinstance(value, Path): - return str(value) - if isinstance(value, dict): - return {str(key): _normalize_config_value(item) for key, item in value.items()} - if isinstance(value, (list, tuple)): - return [_normalize_config_value(item) for item in value] - if isinstance(value, type) or isinstance(value, FunctionType): - return f"{value.__module__}.{value.__qualname__}" - return value - - -@dataclass(frozen=True) -class USMicroplexTargets: - """Calibration targets for the US microplex pipeline.""" - - marginal: dict[str, dict[str, float]] - continuous: dict[str, float] - - -@dataclass(frozen=True) -class USMicroplexSourceInput: - """Normalized source-planning context for one US build.""" - - frame: ObservationFrame - fusion_plan: FusionPlan - household_observation: EntityObservation - person_observation: EntityObservation - household_person_relationship: EntityRelationship - households: pd.DataFrame - persons: pd.DataFrame - - -@dataclass(frozen=True) -class USMicroplexSynthesisVariables: - """Observed variables to use during synthesis.""" - - condition_vars: tuple[str, ...] - target_vars: tuple[str, ...] 
- - -@dataclass -class USMicroplexBuildResult: - """Artifacts from a US microplex build.""" - - config: USMicroplexBuildConfig - seed_data: pd.DataFrame - synthetic_data: pd.DataFrame - calibrated_data: pd.DataFrame - targets: USMicroplexTargets - calibration_summary: dict[str, Any] - synthesis_metadata: dict[str, Any] = field(default_factory=dict) - synthesizer: Synthesizer | Any | None = None - policyengine_tables: PolicyEngineUSEntityTableBundle | None = None - pre_calibration_policyengine_tables: PolicyEngineUSEntityTableBundle | None = None - source_frame: ObservationFrame | None = None - source_frames: tuple[ObservationFrame, ...] = () - fusion_plan: FusionPlan | None = None - scaffold_seed_data: pd.DataFrame | None = None - - @property - def n_nonzero_weights(self) -> int: - if "weight" not in self.calibrated_data.columns: - return 0 - return int((self.calibrated_data["weight"] > 1e-9).sum()) - - @property - def total_weighted_population(self) -> float: - if "weight" not in self.calibrated_data.columns: - return 0.0 - return float(self.calibrated_data["weight"].sum()) - - -def _source_loading_stage_outputs( - frames: list[ObservationFrame], -) -> USSourceLoadingOutputs: - frame_summaries: list[dict[str, Any]] = [] - relationship_summaries: dict[str, list[dict[str, Any]]] = {} - source_names: list[str] = [] - for frame in frames: - source_names.append(frame.source.name) - table_rows = { - entity.value: int(len(table)) for entity, table in frame.tables.items() - } - frame_summaries.append( - { - "source": frame.source.name, - "tables": table_rows, - "relationship_count": len(frame.relationships), - } - ) - relationship_summaries[frame.source.name] = [ - { - "parentEntity": relationship.parent_entity.value, - "childEntity": relationship.child_entity.value, - "parentKey": relationship.parent_key, - "childKey": relationship.child_key, - "cardinality": relationship.cardinality.value, - } - for relationship in frame.relationships - ] - return 
USSourceLoadingOutputs( - observation_frame_summary={ - "source_count": len(frames), - "frames": frame_summaries, - }, - source_descriptors=tuple(dict.fromkeys(source_names)), - source_relationships=relationship_summaries, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description="Runtime source-loading summary.", - summary={ - "source_names": source_names, - "source_count": len(frames), - }, - ) - }, - ) - - -def _runtime_stage_artifact_path( - writer: USStageRuntimeWriter, - stage_id: str, - artifact_key: str, -) -> Path: - return resolve_us_stage_artifact_contract_path( - writer.artifact_root, - stage_id, - artifact_key, - ) - - -def _runtime_stage_artifact_ref( - writer: USStageRuntimeWriter, - stage_id: str, - artifact_key: str, - *, - assume_exists: bool = False, -) -> USArtifactRef: - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - return USArtifactRef( - key=artifact_key, - path=_runtime_stage_artifact_path(writer, stage_id, artifact_key), - format=contract.format, - required=contract.required, - resume_role=contract.resume_role, - assume_exists=assume_exists, - ) - - -def _runtime_stage_diagnostics( - stage_id: str, - summary: Mapping[str, Any], -) -> dict[str, USDiagnosticOutput]: - return { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description=f"Runtime diagnostic summary for {stage_id}.", - summary=dict(summary), - ) - } - - -def _write_runtime_dataframe_artifact(path: Path, frame: pd.DataFrame) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - frame.to_parquet(path, index=False) - - -def _runtime_source_plan_payload( - source_inputs: list[USMicroplexSourceInput], - fusion_plan: FusionPlan, - scaffold_input: USMicroplexSourceInput, -) -> dict[str, Any]: - source_names = tuple(input.frame.source.name for input in source_inputs) - return { - "formatVersion": 1, - "stageId": "03_source_planning", - "sourceNames": list(source_names), - "scaffoldSource": 
scaffold_input.frame.source.name, - "donorSourceNames": [ - source_name - for source_name in source_names - if source_name != scaffold_input.frame.source.name - ], - "fusionPlan": { - "sourceNames": list(fusion_plan.source_names), - }, - "scaffoldSelection": _runtime_scaffold_selection_summary( - source_inputs, - scaffold_input, - ), - } - - -def _runtime_scaffold_selection_summary( - source_inputs: list[USMicroplexSourceInput], - scaffold_input: USMicroplexSourceInput, -) -> dict[str, Any]: - return { - "scaffold_source": scaffold_input.frame.source.name, - "candidate_sources": [ - source_input.frame.source.name for source_input in source_inputs - ], - "household_rows": int(len(scaffold_input.households)), - "person_rows": int(len(scaffold_input.persons)), - } - - -def _runtime_seed_schema_metadata(seed_data: pd.DataFrame) -> dict[str, Any]: - identifier_columns = ( - "household_id", - "person_id", - "tax_unit_id", - "spm_unit_id", - "family_id", - "marital_unit_id", - ) - return { - "rows": int(len(seed_data)), - "columns": int(len(seed_data.columns)), - "identifier_columns": { - column: column in seed_data.columns for column in identifier_columns - }, - "has_weight": "weight" in seed_data.columns, - } - - -def _runtime_targets_payload(targets: USMicroplexTargets) -> dict[str, Any]: - return { - "marginal": targets.marginal, - "continuous": targets.continuous, - } - - -def _runtime_target_ledger(targets: USMicroplexTargets) -> dict[str, Any]: - return { - "n_marginal_groups": len(targets.marginal), - "n_continuous": len(targets.continuous), - "marginal_keys": sorted(targets.marginal.keys()), - "continuous_keys": sorted(targets.continuous.keys()), - } - - -def _runtime_policyengine_table_summary( - tables: PolicyEngineUSEntityTableBundle, -) -> dict[str, Any]: - return { - "households": int(len(tables.households)), - "persons": int(len(tables.persons)), - "tax_units": int(len(tables.tax_units)), - "spm_units": int(len(tables.spm_units)), - "families": 
int(len(tables.families)), - "marital_units": int(len(tables.marital_units)), - } - - -class USMicroplexPipeline: - """End-to-end build orchestration for a US microplex dataset.""" - - def __init__( - self, - config: USMicroplexBuildConfig | None = None, - *, - stage_runtime_writer: USStageRuntimeWriter | None = None, - ): - self.config = config or USMicroplexBuildConfig() - self.stage_runtime_writer = stage_runtime_writer - - def _runtime_start_stage( - self, - stage_id: str, - *, - metadata: Mapping[str, Any] | None = None, - ) -> None: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.start_stage(stage_id, metadata=metadata) - - def _runtime_fail_stage( - self, - stage_id: str, - error: BaseException, - *, - metadata: Mapping[str, Any] | None = None, - ) -> None: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.fail_stage(stage_id, error, metadata=metadata) - - def build_from_data_dir(self, data_dir: str | Path) -> USMicroplexBuildResult: - from microplex_us.data_sources.cps import ( - DEFAULT_CACHE_DIR, - CPSASECParquetSourceProvider, - CPSASECSourceProvider, - ) - - if self.config.prefer_cached_cps_asec_source: - cache_dir = ( - Path(self.config.cps_asec_cache_dir) - if self.config.cps_asec_cache_dir is not None - else DEFAULT_CACHE_DIR - ) - processed_path = ( - cache_dir - / f"cps_asec_{int(self.config.cps_asec_source_year)}_processed.parquet" - ) - if processed_path.exists(): - return self.build_from_source_provider( - CPSASECSourceProvider( - year=int(self.config.cps_asec_source_year), - cache_dir=cache_dir, - download=False, - ) - ) - - return self.build_from_source_provider( - CPSASECParquetSourceProvider(data_dir=data_dir) - ) - - def build_from_source_provider( - self, - provider: SourceProvider, - query: SourceQuery | None = None, - ) -> USMicroplexBuildResult: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.start_stage("02_source_loading") - try: - frame = provider.load_frame(query) 
- except Exception as exc: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.fail_stage("02_source_loading", exc) - raise - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.complete_stage( - _source_loading_stage_outputs([frame]) - ) - return self.build_from_frames([frame]) - - def build_from_source_providers( - self, - providers: list[SourceProvider], - queries: dict[str, SourceQuery] | None = None, - ) -> USMicroplexBuildResult: - if not providers: - raise ValueError( - "USMicroplexPipeline requires at least one source provider" - ) - - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.start_stage("02_source_loading") - frames: list[ObservationFrame] = [] - try: - for provider in providers: - frame = provider.load_frame( - self._resolve_source_query(provider, queries or {}) - ) - frames.append(frame) - except Exception as exc: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.fail_stage("02_source_loading", exc) - raise - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.complete_stage( - _source_loading_stage_outputs(frames) - ) - return self.build_from_frames(frames) - - def build_from_frame(self, frame: ObservationFrame) -> USMicroplexBuildResult: - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.start_stage("02_source_loading") - self.stage_runtime_writer.complete_stage( - _source_loading_stage_outputs([frame]) - ) - return self.build_from_frames([frame]) - - def build_from_frames( - self, - frames: list[ObservationFrame], - *, - resume_from_stage: str | None = None, - restored_scaffold_seed_data: pd.DataFrame | None = None, - ) -> USMicroplexBuildResult: - if not frames: - raise ValueError( - "USMicroplexPipeline requires at least one observation frame" - ) - start_stage = ( - canonicalize_us_pipeline_stage_id(resume_from_stage) - if resume_from_stage is not None - else "03_source_planning" - ) - resumable_frame_stages = { - 
"03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - } - if start_stage not in resumable_frame_stages: - valid_stages = ", ".join(sorted(resumable_frame_stages)) - raise ValueError( - f"Cannot build from frames starting at {start_stage}; " - f"expected one of: {valid_stages}" - ) - start_stage_index = US_CANONICAL_STAGE_IDS.index(start_stage) - - if start_stage_index <= US_CANONICAL_STAGE_IDS.index("03_source_planning"): - self._runtime_start_stage("03_source_planning") - try: - source_inputs = [self.prepare_source_input(frame) for frame in frames] - fusion_plan = FusionPlan.from_sources( - [frame.source for frame in frames] - ) - scaffold_input = self._select_scaffold_source(source_inputs) - if self.stage_runtime_writer is not None: - source_plan_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "03_source_planning", - "source_plan", - ) - write_json_atomically( - source_plan_path, - _runtime_source_plan_payload( - source_inputs, - fusion_plan, - scaffold_input, - ), - ) - scaffold_selection = _runtime_scaffold_selection_summary( - source_inputs, - scaffold_input, - ) - self.stage_runtime_writer.complete_stage( - USSourcePlanningOutputs( - source_plan=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "03_source_planning", - "source_plan", - ), - scaffold_selection=scaffold_selection, - diagnostics=_runtime_stage_diagnostics( - "03_source_planning", - scaffold_selection, - ), - ) - ) - except Exception as exc: - self._runtime_fail_stage("03_source_planning", exc) - raise - else: - source_inputs = [self.prepare_source_input(frame) for frame in frames] - fusion_plan = FusionPlan.from_sources([frame.source for frame in frames]) - scaffold_input = self._select_scaffold_source(source_inputs) - - if start_stage_index <= US_CANONICAL_STAGE_IDS.index("04_seed_scaffold"): - self._runtime_start_stage("04_seed_scaffold") - try: - seed_data = self.prepare_seed_data_from_source(scaffold_input) - seed_data = 
self._strip_generated_entity_ids( - seed_data, - scaffold_input=scaffold_input, - ) - scaffold_seed_data = seed_data.copy() - if self.stage_runtime_writer is not None: - scaffold_seed_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "04_seed_scaffold", - "scaffold_seed_data", - ) - _write_runtime_dataframe_artifact( - scaffold_seed_path, scaffold_seed_data - ) - seed_schema_metadata = _runtime_seed_schema_metadata( - scaffold_seed_data - ) - self.stage_runtime_writer.complete_stage( - USSeedScaffoldOutputs( - scaffold_seed_data=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "04_seed_scaffold", - "scaffold_seed_data", - ), - seed_schema_metadata=seed_schema_metadata, - diagnostics=_runtime_stage_diagnostics( - "04_seed_scaffold", - { - **seed_schema_metadata, - "scaffold_source": ( - scaffold_input.frame.source.name - ), - }, - ), - ) - ) - except Exception as exc: - self._runtime_fail_stage("04_seed_scaffold", exc) - raise - else: - if restored_scaffold_seed_data is None: - raise ValueError( - "resume_from_stage='05_donor_integration_synthesis' requires " - "restored_scaffold_seed_data" - ) - scaffold_seed_data = restored_scaffold_seed_data.copy() - seed_data = scaffold_seed_data.copy() - - self._runtime_start_stage("05_donor_integration_synthesis") - try: - donor_integration = self._integrate_donor_sources( - seed_data, - scaffold_input=scaffold_input, - donor_inputs=[ - source for source in source_inputs if source is not scaffold_input - ], - ) - seed_data = donor_integration["seed_data"] - seed_data = self._apply_dependent_tax_leaf_soft_caps(seed_data) - _emit_us_pipeline_progress( - "US microplex build: seed ready", - scaffold_source=scaffold_input.frame.source.name, - sources=_format_progress_values(fusion_plan.source_names), - rows=int(len(seed_data)), - columns=int(len(seed_data.columns)), - donor_integrated_variables=int( - len(donor_integration["integrated_variables"]) - ), - ) - _emit_us_pipeline_progress( - "US microplex 
build: targets start", - rows=int(len(seed_data)), - ) - targets = self.build_targets(seed_data) - _emit_us_pipeline_progress( - "US microplex build: targets complete", - marginal_targets=int(len(targets.marginal)), - continuous_targets=int(len(targets.continuous)), - ) - synthesis_variables = self._resolve_synthesis_variables( - scaffold_input, - fusion_plan=fusion_plan, - include_all_observed_targets=len(source_inputs) > 1, - available_columns=set(seed_data.columns), - observed_frame=seed_data, - ) - _emit_us_pipeline_progress( - "US microplex build: synthesis variables ready", - condition_vars=int(len(synthesis_variables.condition_vars)), - target_vars=int(len(synthesis_variables.target_vars)), - ) - _emit_us_pipeline_progress( - "US microplex build: synthesis start", - rows=int(len(seed_data)), - ) - synthetic_data, synthesizer, synthesis_metadata = self.synthesize( - seed_data, - synthesis_variables=synthesis_variables, - ) - _emit_us_pipeline_progress( - "US microplex build: synthesis complete", - rows=int(len(synthetic_data)), - columns=int(len(synthetic_data.columns)), - ) - synthesis_metadata = { - **synthesis_metadata, - "source_names": fusion_plan.source_names, - "condition_vars": list(synthesis_variables.condition_vars), - "target_vars": list(synthesis_variables.target_vars), - "scaffold_source": scaffold_input.frame.source.name, - "donor_integrated_variables": donor_integration["integrated_variables"], - "donor_conditioning_diagnostics": donor_integration.get( - "conditioning_diagnostics", [] - ), - "processed_donor_source_order": donor_integration.get( - "processed_donor_source_order", [] - ), - "puf_clone_source_order": donor_integration.get( - "puf_clone_source_order", [] - ), - "puf_support_clone": donor_integration.get("puf_support_clone_summary"), - "donor_excluded_variables": list( - self.config.donor_imputer_excluded_variables - ), - "donor_authoritative_override_variables": list( - self.config.donor_imputer_authoritative_override_variables - 
), - "state_program_support_proxies": _state_program_support_proxy_summary( - set(seed_data.columns) - ), - } - _emit_us_pipeline_progress( - "US microplex build: support enforcement start", - rows=int(len(synthetic_data)), - ) - synthetic_data = self.ensure_target_support( - synthetic_data, seed_data, targets - ) - _emit_us_pipeline_progress( - "US microplex build: support enforcement complete", - rows=int(len(synthetic_data)), - columns=int(len(synthetic_data.columns)), - ) - if self.stage_runtime_writer is not None: - seed_data_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "05_donor_integration_synthesis", - "seed_data", - ) - synthetic_data_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "05_donor_integration_synthesis", - "synthetic_data", - ) - _write_runtime_dataframe_artifact(seed_data_path, seed_data) - _write_runtime_dataframe_artifact(synthetic_data_path, synthetic_data) - self.stage_runtime_writer.complete_stage( - USDonorSynthesisOutputs( - seed_data=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "05_donor_integration_synthesis", - "seed_data", - ), - synthetic_data=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "05_donor_integration_synthesis", - "synthetic_data", - ), - synthesis_metadata=synthesis_metadata, - diagnostics=_runtime_stage_diagnostics( - "05_donor_integration_synthesis", - { - "seed_rows": int(len(seed_data)), - "synthetic_rows": int(len(synthetic_data)), - "donor_integrated_variables": len( - donor_integration["integrated_variables"] - ), - "condition_vars": len( - synthesis_variables.condition_vars - ), - "target_vars": len(synthesis_variables.target_vars), - }, - ), - ) - ) - except Exception as exc: - self._runtime_fail_stage("05_donor_integration_synthesis", exc) - raise - - self._runtime_start_stage("06_policyengine_entities") - try: - _emit_us_pipeline_progress( - "US microplex build: policyengine tables start", - rows=int(len(synthetic_data)), - ) - 
synthetic_tables = self.build_policyengine_entity_tables(synthetic_data) - _emit_us_pipeline_progress( - "US microplex build: policyengine tables complete", - households=int(len(synthetic_tables.households)), - persons=int(len(synthetic_tables.persons)), - ) - if self.config.pipeline_checkpoint_save_post_imputation_path is not None: - save_us_pipeline_checkpoint( - synthetic_tables, - self.config.pipeline_checkpoint_save_post_imputation_path, - stage="post_imputation", - ) - _emit_us_pipeline_progress( - "US microplex build: post-imputation checkpoint saved", - path=str(self.config.pipeline_checkpoint_save_post_imputation_path), - ) - self._check_policyengine_export_column_contract( - synthetic_tables, - stage="pre_calibration", - ) - if self.stage_runtime_writer is not None: - write_us_policyengine_entity_stage_artifact( - synthetic_tables, - self.stage_runtime_writer.artifact_root, - stage_id="06_policyengine_entities", - artifact_key="pre_calibration_policyengine_entity_tables", - checkpoint_stage="post_microsim", - ) - entity_summary = _runtime_policyengine_table_summary(synthetic_tables) - self.stage_runtime_writer.complete_stage( - USPolicyEngineEntityOutputs( - pre_calibration_policyengine_entity_tables=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - ), - materialized_policyengine_inputs=entity_summary, - diagnostics=_runtime_stage_diagnostics( - "06_policyengine_entities", - entity_summary, - ), - ) - ) - except Exception as exc: - self._runtime_fail_stage("06_policyengine_entities", exc) - raise - - self._runtime_start_stage("07_calibration") - try: - if self._has_policyengine_calibration_targets(): - _emit_us_pipeline_progress( - "US microplex build: policyengine calibration start", - backend=self.config.calibration_backend, - ) - ( - policyengine_tables, - calibrated_data, - calibration_summary, - ) = self.calibrate_policyengine_tables(synthetic_tables) - 
_emit_us_pipeline_progress( - "US microplex build: policyengine calibration complete", - backend=self.config.calibration_backend, - calibrated_rows=int(len(calibrated_data)), - ) - else: - _emit_us_pipeline_progress( - "US microplex build: calibration start", - backend=self.config.calibration_backend, - rows=int(len(synthetic_data)), - ) - calibrated_data, calibration_summary = self.calibrate( - synthetic_data, targets - ) - _emit_us_pipeline_progress( - "US microplex build: calibration complete", - backend=self.config.calibration_backend, - calibrated_rows=int(len(calibrated_data)), - ) - _emit_us_pipeline_progress( - "US microplex build: policyengine tables start", - rows=int(len(calibrated_data)), - ) - policyengine_tables = self.build_policyengine_entity_tables( - calibrated_data - ) - _emit_us_pipeline_progress( - "US microplex build: policyengine tables complete", - households=int(len(policyengine_tables.households)), - persons=int(len(policyengine_tables.persons)), - ) - if self.stage_runtime_writer is not None: - write_us_policyengine_entity_stage_artifact( - policyengine_tables, - self.stage_runtime_writer.artifact_root, - stage_id="07_calibration", - artifact_key="policyengine_entity_tables", - checkpoint_stage="post_calibration", - ) - calibrated_data_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "07_calibration", - "calibrated_data", - ) - targets_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "07_calibration", - "targets", - ) - calibration_summary_path = _runtime_stage_artifact_path( - self.stage_runtime_writer, - "07_calibration", - "calibration_summary", - ) - _write_runtime_dataframe_artifact( - calibrated_data_path, - calibrated_data, - ) - write_json_atomically(targets_path, _runtime_targets_payload(targets)) - write_json_atomically(calibration_summary_path, calibration_summary) - target_ledger = _runtime_target_ledger(targets) - self.stage_runtime_writer.complete_stage( - USCalibrationOutputs( - 
calibrated_data=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "07_calibration", - "calibrated_data", - ), - targets=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "07_calibration", - "targets", - ), - calibration_summary=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "07_calibration", - "calibration_summary", - ), - policyengine_entity_tables=_runtime_stage_artifact_ref( - self.stage_runtime_writer, - "07_calibration", - "policyengine_entity_tables", - ), - target_ledger=target_ledger, - diagnostics=_runtime_stage_diagnostics( - "07_calibration", - { - "calibrated_rows": int(len(calibrated_data)), - "backend": self.config.calibration_backend, - **target_ledger, - }, - ), - ) - ) - except Exception as exc: - self._runtime_fail_stage("07_calibration", exc) - raise - - return USMicroplexBuildResult( - config=self.config, - seed_data=seed_data, - synthetic_data=synthetic_data, - calibrated_data=calibrated_data, - targets=targets, - calibration_summary=calibration_summary, - synthesis_metadata=synthesis_metadata, - synthesizer=synthesizer, - policyengine_tables=policyengine_tables, - pre_calibration_policyengine_tables=synthetic_tables, - source_frame=scaffold_input.frame, - source_frames=tuple(frame for frame in frames), - fusion_plan=fusion_plan, - scaffold_seed_data=scaffold_seed_data, - ) - - def build( - self, - persons: pd.DataFrame, - households: pd.DataFrame, - ) -> USMicroplexBuildResult: - return self.build_from_frame( - self._build_direct_input_frame( - persons=persons, - households=households, - ) - ) - - def _resolve_source_query( - self, - provider: SourceProvider, - queries: dict[str, SourceQuery], - ) -> SourceQuery | None: - for key in self._source_query_keys(provider): - query = queries.get(key) - if query is not None: - return query - return None - - def _source_query_keys(self, provider: SourceProvider) -> tuple[str, ...]: - base_name = provider.descriptor.name - keys: list[str] = [base_name] - for attr_name in 
("year", "target_year"): - attr_value = getattr(provider, attr_name, None) - if attr_value is None: - continue - keys.append(f"{base_name}_{attr_value}") - descriptor_cache = getattr(provider, "_descriptor_cache", None) - cached_name = getattr(descriptor_cache, "name", None) - if cached_name is not None: - keys.append(cached_name) - return tuple(dict.fromkeys(keys)) - - def prepare_source_input( - self, - frame: ObservationFrame, - ) -> USMicroplexSourceInput: - """Validate and extract the source-planning context for a US build.""" - frame.validate() - households = frame.tables.get(EntityType.HOUSEHOLD) - persons = frame.tables.get(EntityType.PERSON) - if households is None or persons is None: - raise ValueError( - "USMicroplexPipeline requires household and person tables from the source provider" - ) - - fusion_plan = FusionPlan.from_sources([frame.source]) - observations_by_entity = { - observation.entity: observation for observation in frame.source.observations - } - household_observation = observations_by_entity.get(EntityType.HOUSEHOLD) - person_observation = observations_by_entity.get(EntityType.PERSON) - if household_observation is None or person_observation is None: - raise ValueError( - "USMicroplexPipeline requires household and person observations in the source descriptor" - ) - - relationship = next( - ( - candidate - for candidate in frame.relationships - if candidate.parent_entity == EntityType.HOUSEHOLD - and candidate.child_entity == EntityType.PERSON - and candidate.cardinality == RelationshipCardinality.ONE_TO_MANY - ), - None, - ) - if relationship is None: - raise ValueError( - "USMicroplexPipeline requires a one-to-many household-to-person relationship" - ) - - return USMicroplexSourceInput( - frame=frame, - fusion_plan=fusion_plan, - household_observation=household_observation, - person_observation=person_observation, - household_person_relationship=relationship, - households=households, - persons=persons, - ) - - def 
prepare_seed_data_from_source( - self, - source_input: USMicroplexSourceInput, - ) -> pd.DataFrame: - """Project an observation frame into the canonical US seed schema.""" - household_coverage = source_input.fusion_plan.variables_for( - EntityType.HOUSEHOLD - ) - person_coverage = source_input.fusion_plan.variables_for(EntityType.PERSON) - relationship = source_input.household_person_relationship - - hh = source_input.households.copy() - persons_df = source_input.persons.copy() - - household_renames = { - relationship.parent_key: "household_id", - } - if source_input.household_observation.weight_column is not None: - household_renames[source_input.household_observation.weight_column] = ( - "hh_weight" - ) - hh = hh.rename(columns=household_renames) - - person_renames = { - source_input.person_observation.key_column: "person_id", - relationship.child_key: "household_id", - } - persons_df = persons_df.rename(columns=person_renames) - - if "household_id" not in hh.columns: - raise ValueError( - "USMicroplexPipeline could not resolve a canonical household_id from the source frame" - ) - if ( - "household_id" not in persons_df.columns - or "person_id" not in persons_df.columns - ): - raise ValueError( - "USMicroplexPipeline could not resolve canonical person/household linkage columns" - ) - - if "hh_weight" not in hh.columns: - hh["hh_weight"] = 1.0 - if "state_fips" not in household_coverage or "state_fips" not in hh.columns: - hh["state_fips"] = 0 - if "county_fips" not in household_coverage or "county_fips" not in hh.columns: - hh["county_fips"] = 0 - if "tenure" not in household_coverage or "tenure" not in hh.columns: - hh["tenure"] = 0 - hh = _attach_household_census_geographies( - hh, - seed=self.config.random_seed, - ) - - required_person_defaults = { - "age": 0, - "sex": 0, - "education": 0, - "employment_status": 0, - "income": 0.0, - } - for column, default in required_person_defaults.items(): - if column not in person_coverage or column not in 
persons_df.columns: - persons_df[column] = default - - household_seed_columns = [ - "household_id", - "state_fips", - "county_fips", - "hh_weight", - "tenure", - "block_geoid", - "tract_geoid", - "congressional_district_geoid", - ] - seed_data = persons_df.merge( - hh[[column for column in household_seed_columns if column in hh.columns]], - on="household_id", - how="left", - suffixes=("", "__household"), - ) - for column in ( - "state_fips", - "county_fips", - "hh_weight", - "tenure", - "block_geoid", - "tract_geoid", - "congressional_district_geoid", - ): - household_column = f"{column}__household" - if household_column not in seed_data.columns: - continue - if column in seed_data.columns: - seed_data[column] = seed_data[household_column].combine_first( - seed_data[column] - ) - else: - seed_data[column] = seed_data[household_column] - seed_data = seed_data.drop(columns=[household_column]) - seed_data["hh_weight"] = seed_data["hh_weight"].fillna(1.0).astype(float) - seed_data["tenure"] = seed_data["tenure"].fillna(0).astype(int) - seed_data["state_fips"] = seed_data["state_fips"].fillna(0).astype(int) - seed_data["county_fips"] = ( - seed_data["county_fips"].map(normalize_us_county_fips).fillna("00000") - ) - if "block_geoid" in seed_data.columns: - seed_data["block_geoid"] = seed_data["block_geoid"].fillna("").astype(str) - if "tract_geoid" in seed_data.columns: - seed_data["tract_geoid"] = seed_data["tract_geoid"].fillna("").astype(str) - if "congressional_district_geoid" in seed_data.columns: - seed_data["congressional_district_geoid"] = ( - pd.to_numeric( - seed_data["congressional_district_geoid"], - errors="coerce", - ) - .fillna(0) - .astype(int) - ) - seed_data["income"] = pd.to_numeric( - seed_data["income"], errors="coerce" - ).fillna(0.0) - seed_data = normalize_social_security_columns(seed_data) - - seed_data["state"] = seed_data["state_fips"].map(STATE_FIPS).fillna("UNK") - seed_data["age_group"] = pd.cut( - seed_data["age"], - bins=AGE_BINS, - 
labels=AGE_LABELS, - right=False, - ) - seed_data["income_bracket"] = pd.cut( - seed_data["income"], - bins=INCOME_BINS, - labels=INCOME_LABELS, - ) - - return seed_data.reset_index(drop=True) - - def prepare_seed_data( - self, - persons: pd.DataFrame, - households: pd.DataFrame, - ) -> pd.DataFrame: - """Merge canonical person and household inputs into a synthesis-ready seed frame.""" - return self.prepare_seed_data_from_source( - self.prepare_source_input( - self._build_direct_input_frame( - persons=persons, - households=households, - ) - ) - ) - - def _build_direct_input_frame( - self, - *, - persons: pd.DataFrame, - households: pd.DataFrame, - ) -> ObservationFrame: - """Wrap direct person/household inputs in an observation frame.""" - household_weight_column = next( - ( - column - for column in ("hh_weight", "household_weight") - if column in households.columns - ), - None, - ) - person_weight_column = "weight" if "weight" in persons.columns else None - household_columns = tuple( - column - for column in households.columns - if column - not in { - "household_id", - household_weight_column, - } - ) - person_columns = tuple( - column - for column in persons.columns - if column - not in { - "person_id", - "household_id", - person_weight_column, - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="us_microplex_direct_input", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=household_columns, - weight_column=household_weight_column, - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_columns, - weight_column=person_weight_column, - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households.copy(), - EntityType.PERSON: persons.copy(), - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - 
child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - def build_targets( - self, - seed_data: pd.DataFrame, - weight_col: str = "hh_weight", - ) -> USMicroplexTargets: - """Build weighted calibration targets from the seed data.""" - weights = seed_data[weight_col].astype(float).values - marginal: dict[str, dict[str, float]] = {} - - for column in ("state", "age_group", "income_bracket"): - marginal[column] = {} - categories = seed_data[column].dropna().astype(str).unique() - for category in categories: - mask = seed_data[column].astype(str) == category - marginal[column][category] = float(weights[mask].sum()) - - continuous = { - "income": float((weights * seed_data["income"].astype(float).values).sum()) - } - - if self.config.policyengine_quantity_targets: - if self.config.policyengine_dataset is None: - raise ValueError( - "policyengine_dataset is required when policyengine_quantity_targets are configured" - ) - adapter = PolicyEngineUSMicrosimulationAdapter.from_dataset( - self.config.policyengine_dataset, - dataset_year=self.config.policyengine_dataset_year, - ) - continuous.update( - self.build_policyengine_continuous_targets( - seed_data=seed_data, - adapter=adapter, - quantity_targets=self.config.policyengine_quantity_targets, - ) - ) - - return USMicroplexTargets(marginal=marginal, continuous=continuous) - - def build_policyengine_continuous_targets( - self, - seed_data: pd.DataFrame, - adapter: PolicyEngineUSMicrosimulationAdapter | Any, - quantity_targets: tuple[PolicyEngineUSQuantityTarget, ...], - ) -> dict[str, float]: - """Compute PE-based continuous totals for columns present in the seed data.""" - missing_columns = sorted( - { - target.column - for target in quantity_targets - if target.column not in seed_data.columns - } - ) - if missing_columns: - raise ValueError( - f"PolicyEngine target columns not available 
in seed data: {missing_columns}" - ) - - computed = adapter.compute_targets(quantity_targets) - continuous_targets: dict[str, float] = {} - for target in quantity_targets: - if target.name not in computed: - raise ValueError( - f"PolicyEngine adapter did not return target '{target.name}'" - ) - continuous_targets[target.column] = float(computed[target.name]) - return continuous_targets - - def ensure_target_support( - self, - synthetic_data: pd.DataFrame, - seed_data: pd.DataFrame, - targets: USMicroplexTargets, - ) -> pd.DataFrame: - """Ensure every marginal target category has support in the synthetic sample.""" - result = synthetic_data.copy().reset_index(drop=True) - bool_columns = [ - column - for column in result.columns - if pd.api.types.is_bool_dtype(result[column].dtype) - ] - if bool_columns: - result[bool_columns] = result[bool_columns].astype(float) - replace_idx = 0 - - for _ in range(sum(len(v) for v in targets.marginal.values())): - missing: list[tuple[str, str]] = [] - for column, categories in targets.marginal.items(): - current = result[column].astype(str) - for category in categories: - if not (current == str(category)).any(): - missing.append((column, str(category))) - - if not missing: - break - - for column, category in missing: - exemplars = seed_data[seed_data[column].astype(str) == category] - if exemplars.empty: - continue - exemplar = exemplars.iloc[0] - row_idx = replace_idx % len(result) - for column_name, value in exemplar.items(): - if column_name in result.columns and column_name not in { - "person_id", - "household_id", - "weight", - }: - resolved_value = value - destination = result[column_name] - if pd.api.types.is_bool_dtype( - destination.dtype - ) and not isinstance( - resolved_value, - (bool, np.bool_), - ): - result[column_name] = destination.astype(float) - destination = result[column_name] - if pd.api.types.is_numeric_dtype( - destination.dtype - ) and isinstance( - value, - (bool, np.bool_), - ): - resolved_value = 
float(value) - result.at[row_idx, column_name] = resolved_value - replace_idx += 1 - - initial_weight = ( - float(result["weight"].mean()) if "weight" in result.columns else 1.0 - ) - base = result.drop( - columns=["person_id", "state", "age_group", "income_bracket"], - errors="ignore", - ) - return self._finalize_synthetic_population(base, initial_weight=initial_weight) - - def synthesize( - self, - seed_data: pd.DataFrame, - synthesis_variables: USMicroplexSynthesisVariables | None = None, - ) -> tuple[pd.DataFrame, Synthesizer | None, dict[str, Any]]: - """Generate synthetic records from the seed data.""" - if "hh_weight" in seed_data.columns: - initial_weight = float(seed_data["hh_weight"].sum()) / max( - self.config.n_synthetic, 1 - ) - else: - initial_weight = 1.0 - synthesis_variables = synthesis_variables or USMicroplexSynthesisVariables( - condition_vars=self._resolve_synthesis_condition_vars( - seed_data.columns, - observed_frame=seed_data, - ), - target_vars=tuple( - column - for column in self.config.synthesizer_target_vars - if column in seed_data.columns - ), - ) - - if self.config.synthesis_backend == "seed": - synthetic = seed_data.copy() - if "hh_weight" in synthetic.columns and "weight" not in synthetic.columns: - synthetic["weight"] = ( - pd.to_numeric(synthetic["hh_weight"], errors="coerce") - .fillna(initial_weight) - .astype(float) - ) - synthetic = self._finalize_synthetic_population( - synthetic, - initial_weight=float( - pd.to_numeric( - synthetic.get("weight", pd.Series([initial_weight])), - errors="coerce", - ) - .fillna(initial_weight) - .mean() - ), - ) - return ( - synthetic, - None, - { - "backend": "seed", - "n_seed_records": int(len(seed_data)), - }, - ) - - if self.config.synthesis_backend == "bootstrap": - bootstrap_strata_columns = self._resolve_bootstrap_strata_columns(seed_data) - synthetic = self._synthesize_bootstrap( - seed_data, - initial_weight=initial_weight, - strata_columns=bootstrap_strata_columns, - ) - return ( - 
synthetic, - None, - { - "backend": "bootstrap", - "bootstrap_strata_columns": list(bootstrap_strata_columns), - }, - ) - - synthesizer = self._fit_synthesizer(seed_data, synthesis_variables) - synthetic = synthesizer.sample( - self.config.n_synthetic, - seed=self.config.random_seed, - ) - synthetic = self._finalize_synthetic_population( - synthetic, - initial_weight=initial_weight, - ) - return synthetic, synthesizer, {"backend": "synthesizer"} - - def calibrate( - self, - synthetic_data: pd.DataFrame, - targets: USMicroplexTargets, - ) -> tuple[pd.DataFrame, dict[str, Any]]: - """Calibrate synthetic records to weighted targets.""" - if self.config.calibration_backend == "none": - return synthetic_data.copy(), { - "backend": "none", - "max_error": 0.0, - "mean_error": 0.0, - "converged": True, - } - calibrator = self._build_weight_calibrator() - if self.config.calibration_backend in {"entropy", "ipf", "chi2"}: - calibrated = calibrator.fit_transform( - synthetic_data, - targets.marginal, - targets.continuous, - weight_col="weight", - ) - validation = calibrator.validate(calibrated) - all_errors = [] - for var_errors in validation["marginal_errors"].values(): - all_errors.extend( - item["relative_error"] for item in var_errors.values() - ) - all_errors.extend( - item["relative_error"] - for item in validation["continuous_errors"].values() - ) - summary = { - "backend": self.config.calibration_backend, - "max_error": float(validation["max_error"]), - "mean_error": float(np.mean(all_errors)) if all_errors else 0.0, - "converged": bool(validation["converged"]), - } - return calibrated, summary - - calibrated = calibrator.fit_transform( - synthetic_data, - targets.marginal, - targets.continuous, - weight_col="weight", - ) - validation = calibrator.validate(calibrated) - summary = { - "backend": self.config.calibration_backend, - "max_error": float(validation["max_error"]), - "mean_error": float(validation["mean_error"]), - "sparsity": float(validation.get("sparsity", 
0.0)), - "converged": bool(validation.get("converged", False)), - } - return calibrated, summary - - def _build_weight_calibrator( - self, - stage_index: int = 1, - ) -> ( - Calibrator - | SparseCalibrator - | HardConcreteCalibrator - | PolicyEngineL0Calibrator - ): - # Stage 1 selects the sparse support via L0; stages 2+ only - # refine weights against additional targets. Re-applying the same - # L0 penalty on warm-started weights compounds sparsity and - # collapses the support set (v10 went 442k → 1.5k across stages). - sparsity_pass = stage_index <= 1 - l0_penalty = 1e-4 if sparsity_pass else 0.0 - if self.config.calibration_backend in {"entropy", "ipf", "chi2"}: - return Calibrator( - method=self.config.calibration_backend, - tol=self.config.calibration_tol, - max_iter=self.config.calibration_max_iter, - ) - if self.config.calibration_backend == "sparse": - return SparseCalibrator( - target_sparsity=self.config.target_sparsity, - tol=self.config.calibration_tol, - max_iter=max(self.config.calibration_max_iter, 1_000), - ) - if self.config.calibration_backend == "hardconcrete": - if l0_penalty <= 0.0: - from microplex_us.calibration import ( - MicrocalibrateAdapter, - MicrocalibrateAdapterConfig, - ) - - return MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=max(self.config.calibration_max_iter, 32), - learning_rate=1e-3, - device=self.config.device, - seed=self.config.random_seed, - regularize_with_l0=False, - ) - ) - return HardConcreteCalibrator( - lambda_l0=l0_penalty, - epochs=max(self.config.calibration_max_iter, 500), - lr=0.1, - device=self.config.device, - verbose=False, - ) - if self.config.calibration_backend == "pe_l0": - return PolicyEngineL0Calibrator( - lambda_l0=l0_penalty, - epochs=max(self.config.calibration_max_iter, 100), - device=self.config.device, - tol=self.config.calibration_tol, - fit_l0_weights_fn=make_policyengine_us_data_fit_l0_weights_fn(), - ) - if self.config.calibration_backend == "microcalibrate": - from 
microplex_us.calibration import ( - MicrocalibrateAdapter, - MicrocalibrateAdapterConfig, - ) - - return MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=max(self.config.calibration_max_iter, 32), - learning_rate=1e-3, - device=self.config.device, - seed=self.config.random_seed, - ) - ) - raise ValueError( - f"Unsupported calibration backend: {self.config.calibration_backend}" - ) - - def _select_policyengine_household_budget( - self, - tables: PolicyEngineUSEntityTableBundle, - supported_targets: list[TargetSpec], - constraints: tuple[LinearConstraint, ...], - ) -> tuple[ - PolicyEngineUSEntityTableBundle, - list[TargetSpec], - tuple[LinearConstraint, ...], - dict[str, Any], - ]: - requested_budget = self.config.policyengine_selection_household_budget - household_count = len(tables.households) - if requested_budget is None or requested_budget >= household_count: - return ( - tables, - supported_targets, - constraints, - { - "applied": False, - "requested_household_budget": requested_budget, - "input_household_count": household_count, - }, - ) - if requested_budget <= 0: - raise ValueError("policyengine_selection_household_budget must be positive") - if not constraints: - return ( - tables, - supported_targets, - constraints, - { - "applied": False, - "requested_household_budget": requested_budget, - "input_household_count": household_count, - "reason": "no_constraints", - }, - ) - - target_sparsity = max(0.0, 1.0 - (requested_budget / household_count)) - household_ids = tables.households["household_id"].to_numpy(dtype=np.int64) - selection_backend = self.config.policyengine_selection_backend - state_floor_positions = np.asarray([], dtype=np.int64) - state_floor_summary = { - "applied": False, - "requested_state_floor": int( - max(self.config.policyengine_selection_state_floor, 0) - ), - } - if selection_backend == "sparse": - selector = SparseCalibrator( - target_sparsity=target_sparsity, - tol=self.config.calibration_tol, - 
max_iter=max(self.config.calibration_max_iter, 1_000), - ) - selector_result = selector.fit_transform( - tables.households.copy(), - {}, - weight_col="household_weight", - linear_constraints=constraints, - ) - selector_validation = selector.validate(selector_result) - selector_weights = ( - pd.to_numeric(selector_result["household_weight"], errors="coerce") - .fillna(0.0) - .to_numpy(dtype=float) - ) - selector_metadata = { - "selector_converged": bool(selector_validation.get("converged", False)), - "selector_max_error": float(selector_validation.get("max_error", 0.0)), - "selector_mean_error": float( - selector_validation.get("mean_error", 0.0) - ), - "selector_sparsity": float(selector_validation.get("sparsity", 0.0)), - } - elif selection_backend == "pe_native_loss": - ( - state_floor_positions, - state_floor_summary, - ) = self._select_policyengine_state_floor_positions( - tables=tables, - requested_budget=requested_budget, - ) - state_floor_mask = np.zeros(household_count, dtype=bool) - state_floor_mask[state_floor_positions] = True - remaining_budget = requested_budget - int(state_floor_mask.sum()) - if remaining_budget < 0: - raise ValueError( - "policyengine_selection_state_floor selects more households than " - "policyengine_selection_household_budget allows" - ) - remaining_tables = ( - _subset_policyengine_tables_by_households( - tables, - pd.Index( - household_ids[~state_floor_mask], - name="household_id", - ), - ) - if state_floor_mask.any() - else tables - ) - remaining_household_ids = ( - household_ids[~state_floor_mask] - if state_floor_mask.any() - else household_ids - ) - if remaining_budget == 0 or len(remaining_household_ids) == 0: - selector_weights = np.zeros( - len(remaining_household_ids), dtype=np.float64 - ) - optimization_summary = { - "metric": "enhanced_cps_native_loss_weight_optimization", - "initial_loss": 0.0, - "optimized_loss": 0.0, - "loss_delta": 0.0, - "initial_weight_sum": 0.0, - "optimized_weight_sum": 0.0, - 
"household_count": int(len(remaining_household_ids)), - "positive_household_count": 0, - "budget": int(remaining_budget), - "converged": True, - "iterations": 0, - } - else: - selector_weights, optimization_summary = ( - self._select_policyengine_household_budget_with_pe_native_loss( - tables=remaining_tables, - requested_budget=remaining_budget, - household_ids=remaining_household_ids, - ) - ) - if state_floor_mask.any(): - full_selector_weights = np.zeros(household_count, dtype=np.float64) - full_selector_weights[~state_floor_mask] = selector_weights - floor_priority = ( - float(selector_weights.max()) + 1.0 - if selector_weights.size - else 1.0 - ) - full_selector_weights[state_floor_mask] = floor_priority - selector_weights = full_selector_weights - selector_metadata = { - "selector_converged": bool( - optimization_summary.get("converged", False) - ), - "selector_max_error": 0.0, - "selector_mean_error": 0.0, - "selector_sparsity": 0.0, - "pe_native_optimization": optimization_summary, - "state_floor": state_floor_summary, - } - else: - raise ValueError( - f"Unsupported policyengine_selection_backend: {selection_backend}" - ) - - ranking = np.lexsort((household_ids, -selector_weights)) - selected_positions = np.sort(ranking[:requested_budget]) - household_mask = np.zeros(household_count, dtype=bool) - household_mask[selected_positions] = True - selected_ids = pd.Index(household_ids[household_mask], name="household_id") - - return ( - _subset_policyengine_tables_by_households(tables, selected_ids), - supported_targets, - _subset_policyengine_linear_constraints(constraints, household_mask), - { - "applied": True, - "backend": selection_backend, - "requested_household_budget": int(requested_budget), - "input_household_count": int(household_count), - "selected_household_count": int(household_mask.sum()), - "target_sparsity": float(target_sparsity), - "selector_nonzero_count": int((selector_weights > 0.0).sum()), - "selector_positive_selected_count": int( - 
(selector_weights[household_mask] > 0.0).sum() - ), - "selector_weight_diagnostics": _summarize_weight_diagnostics( - selector_weights - ), - **selector_metadata, - }, - ) - - def _select_policyengine_state_floor_positions( - self, - *, - tables: PolicyEngineUSEntityTableBundle, - requested_budget: int, - ) -> tuple[np.ndarray, dict[str, Any]]: - requested_floor = int(max(self.config.policyengine_selection_state_floor, 0)) - if requested_floor <= 0: - return ( - np.asarray([], dtype=np.int64), - {"applied": False, "requested_state_floor": requested_floor}, - ) - households = tables.households.copy() - if "state_fips" not in households.columns: - return ( - np.asarray([], dtype=np.int64), - { - "applied": False, - "requested_state_floor": requested_floor, - "reason": "missing_state_fips", - }, - ) - ranked = households.loc[ - :, ["household_id", "state_fips", "household_weight"] - ].copy() - ranked["_position"] = np.arange(len(ranked), dtype=np.int64) - ranked["state_fips"] = pd.to_numeric(ranked["state_fips"], errors="coerce") - ranked["household_weight"] = pd.to_numeric( - ranked["household_weight"], errors="coerce" - ).fillna(0.0) - ranked = ranked.dropna(subset=["state_fips"]) - if ranked.empty: - return ( - np.asarray([], dtype=np.int64), - { - "applied": False, - "requested_state_floor": requested_floor, - "reason": "no_rankable_states", - }, - ) - ranked["state_fips"] = ranked["state_fips"].astype(int) - ranked = ranked.sort_values( - ["state_fips", "household_weight", "household_id"], - ascending=[True, False, True], - kind="mergesort", - ) - selected = ranked.groupby("state_fips", sort=True).head(requested_floor) - selected_positions = np.sort(selected["_position"].to_numpy(dtype=np.int64)) - if len(selected_positions) > requested_budget: - raise ValueError( - "policyengine_selection_state_floor selects " - f"{len(selected_positions)} households, exceeding budget " - f"{requested_budget}" - ) - counts_by_state = ( - 
selected.groupby("state_fips")["household_id"].size().astype(int).to_dict() - ) - return ( - selected_positions, - { - "applied": True, - "requested_state_floor": requested_floor, - "selected_household_count": int(len(selected_positions)), - "state_count": int(selected["state_fips"].nunique()), - "counts_by_state": { - str(int(state_fips)): int(count) - for state_fips, count in counts_by_state.items() - }, - }, - ) - - def _select_policyengine_household_budget_with_pe_native_loss( - self, - *, - tables: PolicyEngineUSEntityTableBundle, - requested_budget: int, - household_ids: np.ndarray, - ) -> tuple[np.ndarray, dict[str, Any]]: - period = ( - self.config.policyengine_dataset_year - or self.config.policyengine_target_period - or 2024 - ) - with TemporaryDirectory(prefix="microplex-us-pe-native-selection-") as temp_dir: - temp_dir_path = Path(temp_dir) - selection_build_result = USMicroplexBuildResult( - config=self.config, - seed_data=pd.DataFrame(), - synthetic_data=pd.DataFrame(), - calibrated_data=pd.DataFrame(), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={}, - policyengine_tables=tables, - ) - selection_input_path = self.export_policyengine_dataset( - selection_build_result, - temp_dir_path / "selection_candidate.h5", - period=period, - direct_override_variables=self.config.policyengine_direct_override_variables, - ) - selection_output_path = temp_dir_path / "selection_candidate_optimized.h5" - optimization_result = optimize_policyengine_us_native_loss_dataset( - input_dataset_path=selection_input_path, - output_dataset_path=selection_output_path, - period=period, - **self._policyengine_selection_optimizer_kwargs( - requested_budget=requested_budget - ), - ) - with h5py.File(selection_output_path, "r") as handle: - period_key = str(period) - optimized_household_ids = handle["household_id"][period_key][:].astype( - np.int64, - copy=False, - ) - optimized_household_weights = handle["household_weight"][period_key][ - : - 
].astype( - np.float64, - copy=False, - ) - weight_by_household_id = { - int(household_id): float(weight) - for household_id, weight in zip( - optimized_household_ids, - optimized_household_weights, - strict=True, - ) - } - selector_weights = np.asarray( - [ - weight_by_household_id[int(household_id)] - for household_id in household_ids - ], - dtype=np.float64, - ) - optimization_summary = optimization_result.to_dict() - optimization_summary.pop("target_names", None) - return selector_weights, optimization_summary - - def _policyengine_selection_optimizer_kwargs( - self, - *, - requested_budget: int, - ) -> dict[str, Any]: - kwargs: dict[str, Any] = { - "budget": requested_budget, - "max_iter": max(self.config.policyengine_selection_max_iter, 1), - "l2_penalty": float(self.config.policyengine_selection_l2_penalty), - "tol": float(self.config.policyengine_selection_tol), - } - if self.config.policyengine_selection_target_total_weight is not None: - kwargs["target_total_weight"] = float( - self.config.policyengine_selection_target_total_weight - ) - return kwargs - - def _puf_clone_household_summary( - self, - tables: PolicyEngineUSEntityTableBundle, - ) -> dict[str, Any]: - flag_column = self.config.puf_support_clone_flag_column - if tables.persons is None or flag_column not in tables.persons.columns: - return { - "available": False, - "clone_household_count": 0, - "mixed_flag_household_count": 0, - } - persons = tables.persons - if "household_id" not in persons.columns: - return { - "available": False, - "reason": "missing_person_household_id", - "clone_household_count": 0, - "mixed_flag_household_count": 0, - } - flags = pd.to_numeric(persons[flag_column], errors="coerce").fillna(0.0) - grouped = flags.groupby(persons["household_id"], sort=False) - flag_min = grouped.min() - flag_max = grouped.max() - clone_household_ids = flag_min.index[(flag_min > 0.5) & (flag_max > 0.5)] - mixed_household_ids = flag_min.index[(flag_min <= 0.5) & (flag_max > 0.5)] - 
activated_count = 0 - weight_sum = 0.0 - weight_share = 0.0 - if "household_id" in tables.households.columns: - households = tables.households - weights = pd.to_numeric( - households.get("household_weight", 0.0), - errors="coerce", - ).fillna(0.0) - household_weights = pd.Series( - weights.to_numpy(dtype=float), - index=households["household_id"].to_numpy(), - dtype=float, - ) - clone_weights = household_weights.reindex(clone_household_ids).fillna(0.0) - activated_count = int((clone_weights > 0.0).sum()) - weight_sum = float(clone_weights.sum()) - total_weight = float(weights.sum()) - weight_share = float(weight_sum / total_weight) if total_weight else 0.0 - clone_household_id_values = [ - value.item() if hasattr(value, "item") else value - for value in clone_household_ids.to_list() - ] - return { - "available": True, - "flag_column": flag_column, - "clone_household_count": int(len(clone_household_ids)), - "mixed_flag_household_count": int(len(mixed_household_ids)), - "activated_household_count": activated_count, - "household_weight_sum": weight_sum, - "household_weight_share": weight_share, - "clone_household_ids": clone_household_id_values, - } - - def _initialize_puf_clone_calibration_weights( - self, - tables: PolicyEngineUSEntityTableBundle, - ) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, Any]]: - if not self.config.puf_support_clone_enabled: - return tables, {"applied": False} - summary = self._puf_clone_household_summary(tables) - if not summary.get("available"): - return tables, {"applied": False, **summary} - if summary.get("mixed_flag_household_count", 0): - raise ValueError( - "PUF support clone household diagnostics found mixed original/clone " - "person flags within a household" - ) - if self.config.calibration_backend == "none": - return tables, { - "applied": False, - "reason": "calibration_backend_none", - **summary, - } - clone_household_ids = set(summary.get("clone_household_ids", [])) - if not clone_household_ids or "household_id" not in 
tables.households.columns: - return tables, {"applied": False, **summary} - households = tables.households.copy() - weights = pd.to_numeric( - households["household_weight"], - errors="coerce", - ).fillna(0.0) - clone_mask = households["household_id"].isin(clone_household_ids) - share = float(self.config.puf_support_clone_prior_weight_share) - clone_count = int(clone_mask.sum()) - original_weight_sum = float(weights.loc[~clone_mask].sum()) - clone_prior_total = ( - original_weight_sum * share / (1.0 - share) - if share > 0.0 and original_weight_sum > 0.0 and clone_count - else 0.0 - ) - clone_prior_weight = ( - clone_prior_total / clone_count - if clone_count and clone_prior_total - else 0.0 - ) - if clone_prior_weight > 0.0: - households.loc[clone_mask, "household_weight"] = clone_prior_weight - updated_tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=tables.persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - families=tables.families, - marital_units=tables.marital_units, - ) - return updated_tables, { - "applied": bool(clone_prior_weight > 0.0), - "clone_prior_weight_share": share, - "clone_prior_total_weight": clone_prior_total, - "clone_prior_household_weight": clone_prior_weight, - "clone_household_count": clone_count, - "pre_clone_weight_sum": float(weights.loc[clone_mask].sum()), - "pre_clone_original_weight_sum": original_weight_sum, - } - - def calibrate_policyengine_tables( - self, - tables: PolicyEngineUSEntityTableBundle, - ) -> tuple[PolicyEngineUSEntityTableBundle, pd.DataFrame, dict[str, Any]]: - """Calibrate household weights using PolicyEngine US target DB constraints.""" - provider, _source = self._resolve_calibration_target_provider() - target_period = ( - self.config.policyengine_target_period - or self.config.policyengine_dataset_year - or 2024 - ) - forbes_fixed_spine = self._build_forbes_fixed_spine() - tables, ssi_takeup_summary = ( - 
self._calibrate_policyengine_ssi_takeup_from_reported_amounts( - tables, - target_period=target_period, - ) - ) - ( - tables, - bindings, - canonical_targets, - compiled_targets, - unsupported_targets, - compiled_constraints, - supported_targets, - constraints, - feasibility_filter_summary, - materialized_variables, - materialization_failures, - fixed_spine_residualization_summary, - ) = self._resolve_policyengine_calibration_targets( - tables, - provider=provider, - target_period=target_period, - forbes_fixed_spine=forbes_fixed_spine, - ) - if self.config.pipeline_checkpoint_save_post_microsim_path is not None: - save_us_pipeline_checkpoint( - tables, - self.config.pipeline_checkpoint_save_post_microsim_path, - stage="post_microsim", - ) - _emit_us_pipeline_progress( - "US microplex build: post-microsim checkpoint saved", - path=str(self.config.pipeline_checkpoint_save_post_microsim_path), - ) - tables, puf_clone_calibration_initialization = ( - self._initialize_puf_clone_calibration_weights(tables) - ) - preselection_supported_targets = list(supported_targets) - target_planning_household_count = len(tables.households) - if not supported_targets: - raise ValueError( - "No supported PolicyEngine DB targets matched current tables" - ) - compiled_constraint_tables = tables - selection_summary: dict[str, Any] | None = None - if self.config.policyengine_selection_household_budget is not None: - preselection_household_ids = compiled_constraint_tables.households[ - "household_id" - ].to_numpy(dtype=np.int64) - ( - tables, - supported_targets, - constraints, - selection_summary, - ) = self._select_policyengine_household_budget( - tables, - supported_targets, - tuple(constraints), - ) - if selection_summary.get("applied"): - ( - supported_targets, - constraints, - post_selection_feasibility_summary, - ) = _select_feasible_policyengine_calibration_constraints( - supported_targets, - constraints, - household_count=len(tables.households), - 
max_constraints=self.config.policyengine_calibration_max_constraints, - max_constraints_per_household=( - self.config.policyengine_calibration_max_constraints_per_household - ), - min_active_households=( - self.config.policyengine_calibration_min_active_households - ), - ) - feasibility_filter_summary = { - **post_selection_feasibility_summary, - "pre_selection": feasibility_filter_summary, - } - if not supported_targets: - raise ValueError( - "No supported PolicyEngine DB targets remained after household-budget selection" - ) - selected_household_ids = tables.households["household_id"].to_numpy( - dtype=np.int64 - ) - selection_mask = np.isin( - preselection_household_ids, - selected_household_ids, - ) - compiled_constraints = _subset_policyengine_linear_constraints( - compiled_constraints, - selection_mask, - ) - - input_household_weight_sum = float(tables.households["household_weight"].sum()) - - def _apply_policyengine_constraint_stage( - stage_tables: PolicyEngineUSEntityTableBundle, - stage_constraints: tuple[LinearConstraint, ...], - stage_index: int = 1, - ) -> tuple[PolicyEngineUSEntityTableBundle, pd.DataFrame, dict[str, Any]]: - stage_input_household_weight_sum = float( - stage_tables.households["household_weight"].sum() - ) - stage_calibrator = None - microcalibrate_constraint_normalization = None - if self.config.calibration_backend == "none": - calibrated_households = stage_tables.households.copy() - pre_rescale_household_weight_sum = stage_input_household_weight_sum - else: - stage_calibrator = self._build_weight_calibrator( - stage_index=stage_index - ) - calibration_constraints = list(stage_constraints) - if self.config.policyengine_calibration_target_total_weight is not None: - n_hh = len(stage_tables.households) - calibration_constraints.append( - LinearConstraint( - name="total_household_weight_sum", - coefficients=np.ones(n_hh, dtype=float), - target=float( - self.config.policyengine_calibration_target_total_weight - ), - ) - ) - if 
self.config.calibration_backend == "microcalibrate": - ( - calibration_constraints, - microcalibrate_constraint_normalization, - ) = _normalize_policyengine_constraints_for_microcalibrate( - calibration_constraints - ) - calibrated_households = stage_calibrator.fit_transform( - stage_tables.households.copy(), - {}, - weight_col="household_weight", - linear_constraints=tuple(calibration_constraints), - ) - pre_rescale_household_weight_sum = float( - calibrated_households["household_weight"].sum() - ) - weight_sum_rescaled = False - weight_sum_rescale_mode: str | None = None - if ( - self.config.policyengine_calibration_rescale_to_target_total_weight - and self.config.policyengine_calibration_target_total_weight is not None - and pre_rescale_household_weight_sum > 0.0 - and not np.isclose( - pre_rescale_household_weight_sum, - float(self.config.policyengine_calibration_target_total_weight), - ) - ): - calibrated_households["household_weight"] = calibrated_households[ - "household_weight" - ].astype(float) * ( - float(self.config.policyengine_calibration_target_total_weight) - / pre_rescale_household_weight_sum - ) - weight_sum_rescaled = True - weight_sum_rescale_mode = "target_total_weight" - elif ( - self.config.policyengine_calibration_rescale_to_input_weight_sum - and pre_rescale_household_weight_sum > 0.0 - and not np.isclose( - pre_rescale_household_weight_sum, - stage_input_household_weight_sum, - ) - ): - calibrated_households["household_weight"] = calibrated_households[ - "household_weight" - ].astype(float) * ( - stage_input_household_weight_sum / pre_rescale_household_weight_sum - ) - weight_sum_rescaled = True - weight_sum_rescale_mode = "input_weight_sum" - if self.config.calibration_backend == "none": - validation = { - "converged": True, - "max_error": 0.0, - "sparsity": 0.0, - "linear_errors": {}, - } - else: - validation = stage_calibrator.validate(calibrated_households) - - household_weights = calibrated_households.set_index("household_id")[ - 
"household_weight" - ] - calibrated_persons = ( - stage_tables.persons.copy() - if stage_tables.persons is not None - else pd.DataFrame() - ) - if not calibrated_persons.empty: - calibrated_persons["weight"] = ( - calibrated_persons["household_id"] - .map(household_weights) - .astype(float) - ) - - updated_stage_tables = PolicyEngineUSEntityTableBundle( - households=calibrated_households, - persons=calibrated_persons - if not calibrated_persons.empty - else stage_tables.persons, - tax_units=stage_tables.tax_units, - spm_units=stage_tables.spm_units, - families=stage_tables.families, - marital_units=stage_tables.marital_units, - ) - return ( - updated_stage_tables, - calibrated_persons, - { - "validation": validation, - "input_household_weight_sum": stage_input_household_weight_sum, - "pre_rescale_household_weight_sum": pre_rescale_household_weight_sum, - "post_rescale_household_weight_sum": float( - calibrated_households["household_weight"].sum() - ), - "weight_sum_rescaled": weight_sum_rescaled, - "weight_sum_rescale_mode": weight_sum_rescale_mode, - "household_weight_diagnostics": _summarize_weight_diagnostics( - calibrated_households["household_weight"] - ), - "person_weight_diagnostics": ( - _summarize_weight_diagnostics(calibrated_persons["weight"]) - if not calibrated_persons.empty - and "weight" in calibrated_persons.columns - else None - ), - "microcalibrate_constraint_normalization": ( - microcalibrate_constraint_normalization - ), - }, - ) - - selected_stage_by_name = {target.name: 1 for target in supported_targets} - all_selected_targets = list(supported_targets) - all_selected_constraints = list(constraints) - # Pre-compute the ledger-needed scalars once, while compiled_constraints' - # coefficient arrays are still live. Downstream calls (ledger + - # deferred-stage selection) read from this lookup instead of - # rescanning the ~4k × 1.5M float64 arrays three times. 
The - # repeated scans were allocating ~30 GB of transient - # ``np.abs(...)`` copies on top of the 48 GB baseline, a - # contributor to the v8 197 GB-compressed jetsam kill. - compiled_constraint_metadata = _precompute_constraint_metadata( - compiled_constraints - ) - updated_tables, calibrated_persons, final_stage_summary = ( - _apply_policyengine_constraint_stage( - tables, - tuple(constraints), - ) - ) - target_plan_summary, target_ledger = ( - _build_policyengine_calibration_target_ledger( - canonical_targets=canonical_targets, - tables=tables, - bindings=bindings, - compiled_targets=compiled_targets, - structurally_unsupported_targets=unsupported_targets, - compiled_constraints=compiled_constraints, - preselection_targets=preselection_supported_targets, - selected_stage_by_name=selected_stage_by_name, - household_count=target_planning_household_count, - min_active_households=self.config.policyengine_calibration_min_active_households, - materialization_failures=materialization_failures, - compiled_constraint_metadata=compiled_constraint_metadata, - ) - ) - oracle_loss, oracle_target_priority_lookup = ( - _evaluate_policyengine_target_fit_context( - tables=updated_tables, - canonical_targets=canonical_targets, - final_solve_targets=all_selected_targets, - target_ledger=target_ledger, - period=target_period, - dataset_year=self.config.policyengine_dataset_year - or int(target_period), - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=( - self.config.policyengine_direct_override_variables - ), - relative_error_cap=self.config.policyengine_oracle_relative_error_cap, - ) - ) - - calibration_stages: list[dict[str, Any]] = [] - applied_stage_count = 1 - final_stage_index = 1 - deferred_stage_accept_metric = "full_oracle_capped_mean_abs_relative_error" - deferred_stage_trigger_metric = "full_oracle_capped_mean_abs_relative_error" - - def _append_stage_summary( - *, - stage_index: int, - kind: str, - status: str, - 
min_active_households: int, - selected_targets_for_stage: list[TargetSpec], - stage_metadata: dict[str, Any], - stage_result: dict[str, Any] | None, - oracle_loss_snapshot: dict[str, dict[str, Any]], - pre_oracle_loss_snapshot: dict[str, dict[str, Any]] | None = None, - ) -> None: - validation = ( - stage_result.get("validation", {}) if stage_result is not None else {} - ) - linear_errors = list(validation.get("linear_errors", {}).values()) - stage_summary = { - "stage_index": stage_index, - "kind": kind, - "status": status, - "min_active_households": int(min_active_households), - "selected_target_count": len(selected_targets_for_stage), - "selected_constraint_count": len(selected_targets_for_stage), - "selected_target_names": [ - target.name for target in selected_targets_for_stage - ], - "post_full_oracle_mean_abs_relative_error": oracle_loss_snapshot[ - "full_oracle" - ]["mean_abs_relative_error"], - "post_full_oracle_capped_mean_abs_relative_error": ( - oracle_loss_snapshot["full_oracle"][ - "capped_mean_abs_relative_error" - ] - ), - "post_active_solve_mean_abs_relative_error": oracle_loss_snapshot[ - "active_solve" - ]["mean_abs_relative_error"], - "post_active_solve_capped_mean_abs_relative_error": ( - oracle_loss_snapshot["active_solve"][ - "capped_mean_abs_relative_error" - ] - ), - **stage_metadata, - } - if pre_oracle_loss_snapshot is not None: - stage_summary.update( - { - "pre_full_oracle_mean_abs_relative_error": ( - pre_oracle_loss_snapshot["full_oracle"][ - "mean_abs_relative_error" - ] - ), - "pre_full_oracle_capped_mean_abs_relative_error": ( - pre_oracle_loss_snapshot["full_oracle"][ - "capped_mean_abs_relative_error" - ] - ), - "pre_active_solve_mean_abs_relative_error": ( - pre_oracle_loss_snapshot["active_solve"][ - "mean_abs_relative_error" - ] - ), - "pre_active_solve_capped_mean_abs_relative_error": ( - pre_oracle_loss_snapshot["active_solve"][ - "capped_mean_abs_relative_error" - ] - ), - } - ) - if stage_result is not None: - 
stage_summary.update( - { - "input_household_weight_sum": stage_result[ - "input_household_weight_sum" - ], - "pre_rescale_household_weight_sum": stage_result[ - "pre_rescale_household_weight_sum" - ], - "post_rescale_household_weight_sum": stage_result[ - "post_rescale_household_weight_sum" - ], - "weight_sum_rescaled": stage_result["weight_sum_rescaled"], - "weight_sum_rescale_mode": stage_result[ - "weight_sum_rescale_mode" - ], - "household_weight_diagnostics": stage_result[ - "household_weight_diagnostics" - ], - "person_weight_diagnostics": stage_result[ - "person_weight_diagnostics" - ], - "microcalibrate_constraint_normalization": stage_result.get( - "microcalibrate_constraint_normalization" - ), - "max_error": float(validation.get("max_error", 0.0)), - "effective_backend": validation.get("backend"), - "uses_gates": validation.get("uses_gates"), - "mean_error": ( - float( - np.mean( - [error["relative_error"] for error in linear_errors] - ) - ) - if linear_errors - else 0.0 - ), - "converged": bool(validation.get("converged", False)), - "sparsity": float(validation.get("sparsity", 0.0)), - } - ) - calibration_stages.append(stage_summary) - - _append_stage_summary( - stage_index=1, - kind="initial", - status="applied", - min_active_households=self.config.policyengine_calibration_min_active_households, - selected_targets_for_stage=list(supported_targets), - stage_metadata={"feasibility_filter": feasibility_filter_summary}, - stage_result=final_stage_summary, - oracle_loss_snapshot=oracle_loss, - ) - - deferred_stage_schedule: list[int] = [] - for ( - min_active_households - ) in self.config.policyengine_calibration_deferred_stage_min_active_households: - resolved_min_active = int(min_active_households) - if ( - resolved_min_active - >= self.config.policyengine_calibration_min_active_households - or resolved_min_active in deferred_stage_schedule - ): - continue - deferred_stage_schedule.append(resolved_min_active) - - if self.config.calibration_backend != 
"none": - for stage_index, min_active_households in enumerate( - deferred_stage_schedule, - start=2, - ): - pre_stage_oracle_loss = oracle_loss - pre_stage_trigger_metric_value = pre_stage_oracle_loss["full_oracle"][ - "capped_mean_abs_relative_error" - ] - trigger_threshold = self.config.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - if ( - trigger_threshold is not None - and pre_stage_trigger_metric_value is not None - and float(pre_stage_trigger_metric_value) < float(trigger_threshold) - ): - _append_stage_summary( - stage_index=stage_index, - kind="deferred", - status="skipped", - min_active_households=min_active_households, - selected_targets_for_stage=[], - stage_metadata={ - "trigger_metric": deferred_stage_trigger_metric, - "trigger_threshold": float(trigger_threshold), - "trigger_metric_value": float( - pre_stage_trigger_metric_value - ), - "skip_reason": "trigger_metric_below_threshold", - }, - stage_result=None, - oracle_loss_snapshot=oracle_loss, - pre_oracle_loss_snapshot=pre_stage_oracle_loss, - ) - continue - stage_targets, stage_constraints, stage_metadata = ( - _select_policyengine_deferred_stage_constraints( - compiled_targets=compiled_targets, - compiled_constraints=compiled_constraints, - target_ledger=target_ledger, - deferred_oracle_loss=oracle_loss["deferred"], - deferred_target_priority_lookup=oracle_target_priority_lookup[ - "deferred" - ], - selected_target_names=set(selected_stage_by_name), - household_count=target_planning_household_count, - min_active_households=min_active_households, - max_constraints=( - self.config.policyengine_calibration_deferred_stage_max_constraints - if self.config.policyengine_calibration_deferred_stage_max_constraints - is not None - else self.config.policyengine_calibration_max_constraints - ), - max_constraints_per_household=( - self.config.policyengine_calibration_max_constraints_per_household - ), - top_family_count=( - 
self.config.policyengine_calibration_deferred_stage_top_family_count - ), - top_geography_count=( - self.config.policyengine_calibration_deferred_stage_top_geography_count - ), - compiled_constraint_metadata=compiled_constraint_metadata, - ) - ) - if not stage_targets: - _append_stage_summary( - stage_index=stage_index, - kind="deferred", - status="skipped", - min_active_households=min_active_households, - selected_targets_for_stage=[], - stage_metadata=stage_metadata, - stage_result=None, - oracle_loss_snapshot=oracle_loss, - pre_oracle_loss_snapshot=pre_stage_oracle_loss, - ) - continue - ( - candidate_tables, - candidate_calibrated_persons, - candidate_stage_summary, - ) = _apply_policyengine_constraint_stage( - updated_tables, - stage_constraints, - stage_index=stage_index, - ) - candidate_selected_stage_by_name = dict(selected_stage_by_name) - for target in stage_targets: - candidate_selected_stage_by_name[target.name] = stage_index - candidate_all_selected_targets = [ - *all_selected_targets, - *stage_targets, - ] - candidate_all_selected_constraints = [ - *all_selected_constraints, - *stage_constraints, - ] - candidate_target_plan_summary, candidate_target_ledger = ( - _build_policyengine_calibration_target_ledger( - canonical_targets=canonical_targets, - tables=tables, - bindings=bindings, - compiled_targets=compiled_targets, - structurally_unsupported_targets=unsupported_targets, - compiled_constraints=compiled_constraints, - preselection_targets=preselection_supported_targets, - selected_stage_by_name=candidate_selected_stage_by_name, - household_count=target_planning_household_count, - min_active_households=( - self.config.policyengine_calibration_min_active_households - ), - materialization_failures=materialization_failures, - compiled_constraint_metadata=compiled_constraint_metadata, - ) - ) - candidate_oracle_loss, candidate_target_priority_lookup = ( - _evaluate_policyengine_target_fit_context( - tables=candidate_tables, - 
canonical_targets=canonical_targets, - final_solve_targets=candidate_all_selected_targets, - target_ledger=candidate_target_ledger, - period=target_period, - dataset_year=self.config.policyengine_dataset_year - or int(target_period), - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=( - self.config.policyengine_direct_override_variables - ), - relative_error_cap=( - self.config.policyengine_oracle_relative_error_cap - ), - ) - ) - pre_metric = pre_stage_oracle_loss["full_oracle"][ - "capped_mean_abs_relative_error" - ] - post_metric = candidate_oracle_loss["full_oracle"][ - "capped_mean_abs_relative_error" - ] - stage_improved = ( - pre_metric is None - or post_metric is None - or float(post_metric) < float(pre_metric) - ) - if stage_improved: - updated_tables = candidate_tables - calibrated_persons = candidate_calibrated_persons - final_stage_summary = candidate_stage_summary - applied_stage_count += 1 - final_stage_index = stage_index - selected_stage_by_name = candidate_selected_stage_by_name - all_selected_targets = candidate_all_selected_targets - all_selected_constraints = candidate_all_selected_constraints - target_plan_summary = candidate_target_plan_summary - target_ledger = candidate_target_ledger - oracle_loss = candidate_oracle_loss - oracle_target_priority_lookup = candidate_target_priority_lookup - _append_stage_summary( - stage_index=stage_index, - kind="deferred", - status="applied" if stage_improved else "rejected", - min_active_households=min_active_households, - selected_targets_for_stage=stage_targets, - stage_metadata={ - **stage_metadata, - "accept_metric": deferred_stage_accept_metric, - "accepted": stage_improved, - "trigger_metric": deferred_stage_trigger_metric, - "trigger_threshold": ( - float(trigger_threshold) - if trigger_threshold is not None - else None - ), - "trigger_metric_value": ( - float(pre_stage_trigger_metric_value) - if pre_stage_trigger_metric_value is not None - else None - ), - }, - 
stage_result=candidate_stage_summary, - oracle_loss_snapshot=candidate_oracle_loss, - pre_oracle_loss_snapshot=pre_stage_oracle_loss, - ) - - validation = dict(final_stage_summary["validation"]) - linear_errors = list(validation.get("linear_errors", {}).values()) - household_weight_diagnostics = final_stage_summary[ - "household_weight_diagnostics" - ] - person_weight_diagnostics = final_stage_summary["person_weight_diagnostics"] - summary = { - "backend": f"policyengine_db_{self.config.calibration_backend}", - "period": int(target_period), - "n_loaded_targets": len(canonical_targets), - "n_supported_targets": len(all_selected_targets), - "n_unsupported_targets": len(unsupported_targets), - "n_constraints": len(all_selected_constraints), - "feasibility_filter": feasibility_filter_summary, - "calibration_stages": calibration_stages, - "n_calibration_stages_applied": applied_stage_count, - "final_calibration_stage_index": final_stage_index, - "deferred_stage_support_schedule": deferred_stage_schedule, - "deferred_stage_accept_metric": deferred_stage_accept_metric, - "deferred_stage_trigger_metric": deferred_stage_trigger_metric, - "deferred_stage_trigger_threshold": ( - self.config.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - ), - "target_variables": list( - self._policyengine_target_scope(for_calibration=True)[0] - ), - "target_domains": list( - self._policyengine_target_scope(for_calibration=True)[1] - ), - "target_geo_levels": list( - self._policyengine_target_scope(for_calibration=True)[2] - ), - "target_profile": self._policyengine_target_profile(for_calibration=True), - "target_cell_count": len( - self._policyengine_target_cells(for_calibration=True) - ), - "materialized_variables": sorted(materialized_variables), - "materialization_failures": materialization_failures, - "ssi_takeup": ssi_takeup_summary, - "max_error": float(validation["max_error"]), - "mean_error": ( - float(np.mean([error["relative_error"] for error 
in linear_errors])) - if linear_errors - else 0.0 - ), - "converged": bool(validation["converged"]), - "sparsity": float(validation.get("sparsity", 0.0)), - "weight_collapse_suspected": bool( - household_weight_diagnostics["collapse_suspected"] - or ( - person_weight_diagnostics is not None - and person_weight_diagnostics["collapse_suspected"] - ) - ), - "input_household_weight_sum": input_household_weight_sum, - "total_weight_constraint_target": self.config.policyengine_calibration_target_total_weight, - "pre_rescale_household_weight_sum": final_stage_summary[ - "pre_rescale_household_weight_sum" - ], - "post_rescale_household_weight_sum": final_stage_summary[ - "post_rescale_household_weight_sum" - ], - "weight_sum_rescaled": final_stage_summary["weight_sum_rescaled"], - "weight_sum_rescale_mode": final_stage_summary["weight_sum_rescale_mode"], - "household_weight_diagnostics": household_weight_diagnostics, - "person_weight_diagnostics": person_weight_diagnostics, - "target_plan": target_plan_summary, - "target_ledger": target_ledger, - "oracle_loss": oracle_loss, - "oracle_relative_error_cap": self.config.policyengine_oracle_relative_error_cap, - "full_oracle_mean_abs_relative_error": oracle_loss["full_oracle"][ - "mean_abs_relative_error" - ], - "full_oracle_capped_mean_abs_relative_error": oracle_loss["full_oracle"][ - "capped_mean_abs_relative_error" - ], - "active_solve_mean_abs_relative_error": oracle_loss["active_solve"][ - "mean_abs_relative_error" - ], - "active_solve_capped_mean_abs_relative_error": oracle_loss["active_solve"][ - "capped_mean_abs_relative_error" - ], - "puf_support_clone": { - "enabled": bool(self.config.puf_support_clone_enabled), - "calibration_initialization": puf_clone_calibration_initialization, - "final_household_diagnostics": self._puf_clone_household_summary( - updated_tables - ), - }, - } - if selection_summary is not None: - summary["selection"] = selection_summary - if forbes_fixed_spine is not None: - updated_tables = 
append_forbes_fixed_spine_tables( - updated_tables, - forbes_fixed_spine, - ) - calibrated_persons = ( - updated_tables.persons.copy() - if updated_tables.persons is not None - else pd.DataFrame() - ) - summary["fixed_spine"] = { - "enabled": True, - "source_metadata": forbes_fixed_spine.source_metadata, - "record_metadata_rows": int(len(forbes_fixed_spine.record_metadata)), - "residualization": fixed_spine_residualization_summary, - "post_append_households": int(len(updated_tables.households)), - "post_append_household_weight_sum": float( - updated_tables.households["household_weight"].sum() - ), - } - else: - summary["fixed_spine"] = {"enabled": False} - warning_messages = list(feasibility_filter_summary.get("warning_messages", ())) - for stage in calibration_stages[1:]: - stage_warnings = stage.get("feasibility_filter", {}).get( - "warning_messages", () - ) - warning_messages.extend( - f"Deferred calibration stage {stage['stage_index']}: {message}" - for message in stage_warnings - ) - if any( - stage.get("status") == "applied" and not stage.get("converged", True) - for stage in calibration_stages - ): - warning_messages.append( - "Calibration did not converge on one or more selected constraint sets." 
- ) - summary["warnings"] = warning_messages - for message in warning_messages: - warnings.warn(message, stacklevel=2) - return updated_tables, calibrated_persons, summary - - def _check_policyengine_export_column_contract( - self, - tables: PolicyEngineUSEntityTableBundle, - *, - stage: str, - ) -> None: - contract_path = self.config.policyengine_export_column_contract_path - if contract_path is None: - return - - tax_benefit_system = self._resolve_policyengine_tax_benefit_system() - contract = load_contract(Path(contract_path)) - present = build_policyengine_us_export_column_names( - tables, - tax_benefit_system=tax_benefit_system, - direct_override_variables=self.config.policyengine_direct_override_variables, - ) - diff = compute_column_diff( - present, - required=set(contract["required"]), - forbidden=set(contract["forbidden"]), - optional=set(contract["ecps_internal_optional"]), - excluded=set(contract.get("formula_owned_excluded", [])), - ) - _emit_us_pipeline_progress( - "US microplex build: policyengine export columns check complete", - stage=stage, - status="pass" if diff.ok else "fail", - columns_present=int(len(present)), - missing_required=int(len(diff.missing_required)), - forbidden_present=int(len(diff.forbidden_present)), - ) - if diff.ok: - return - report = _format_export_column_report( - diff, - source=f"{stage}:{contract_path}", - n_present=len(present), - n_required=len(contract["required"]), - n_forbidden=len(contract["forbidden"]), - ) - raise ValueError(report) - - def _build_forbes_fixed_spine(self) -> ForbesFixedSpine | None: - path = self.config.forbes_fixed_spine_records_path - if path is None: - return None - return build_forbes_fixed_spine( - path, - config=ForbesFixedSpineConfig( - period=( - self.config.policyengine_target_period - or self.config.policyengine_dataset_year - or 2024 - ), - snapshot_id=self.config.forbes_fixed_spine_snapshot_id, - replicates_per_unit=self.config.forbes_fixed_spine_replicates_per_unit, - ), - 
source_metadata={ - "configured_by": "USMicroplexBuildConfig.forbes_fixed_spine_records_path" - }, - ) - - def _resolve_policyengine_calibration_targets( - self, - tables: PolicyEngineUSEntityTableBundle, - *, - provider: PolicyEngineUSDBTargetProvider, - target_period: int, - forbes_fixed_spine: ForbesFixedSpine | None = None, - ) -> tuple[ - PolicyEngineUSEntityTableBundle, - dict[str, PolicyEngineUSVariableBinding], - list[TargetSpec], - list[TargetSpec], - list[TargetSpec], - tuple[Any, ...], - list[TargetSpec], - tuple[Any, ...], - dict[str, Any], - set[str], - dict[str, str], - dict[str, Any] | None, - ]: - bindings = infer_policyengine_us_variable_bindings(tables) - canonical_targets = self._load_policyengine_target_set( - provider, - bindings=bindings, - period=target_period, - for_calibration=True, - ).targets - force_materialize_variables = policyengine_us_formula_variables_for_targets( - canonical_targets, - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=self.config.policyengine_direct_override_variables, - ) - missing_variables = policyengine_us_variables_to_materialize( - canonical_targets, - bindings, - force_materialize_variables=force_materialize_variables, - ) - materialization_failures: dict[str, str] = {} - materialized_variables: set[str] = set() - if missing_variables: - materialization_result = materialize_policyengine_us_variables_safely( - tables, - variables=tuple(sorted(missing_variables)), - period=target_period, - dataset_year=self.config.policyengine_dataset_year or target_period, - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=self.config.policyengine_direct_override_variables, - batch_size=self.config.policyengine_materialize_batch_size, - ) - tables = materialization_result.tables - unmaterialized_forced_variables = ( - force_materialize_variables - & missing_variables - set(materialization_result.bindings) - ) - bindings = { - variable: binding - for 
variable, binding in bindings.items() - if variable not in unmaterialized_forced_variables - } - bindings = { - **bindings, - **materialization_result.bindings, - } - materialized_variables = set(materialization_result.materialized_variables) - materialization_failures = dict(materialization_result.failed_variables) - canonical_targets = self._load_policyengine_target_set( - provider, - bindings=bindings, - period=target_period, - for_calibration=True, - ).targets - fixed_spine_residualization_summary: dict[str, Any] | None = None - if forbes_fixed_spine is not None: - residualization_result = residualize_targets_for_fixed_spine( - canonical_targets, - forbes_fixed_spine.tables, - ) - canonical_targets = list(residualization_result.targets.targets) - fixed_spine_residualization_summary = { - "target_count": len(canonical_targets), - "supported_target_count": sum( - contribution.status == "supported" - for contribution in residualization_result.contributions - ), - "unsupported_target_count": sum( - contribution.status != "supported" - for contribution in residualization_result.contributions - ), - "contributions": residualization_result.diagnostics(), - } - supported_targets = filter_supported_policyengine_us_targets( - canonical_targets, - tables, - bindings, - ) - supported_targets, unsupported_targets, constraints = ( - compile_supported_policyengine_us_household_linear_constraints( - supported_targets, - tables, - variable_bindings=bindings, - ) - ) - compiled_targets = list(supported_targets) - compiled_constraints = tuple(constraints) - ( - supported_targets, - constraints, - feasibility_filter_summary, - ) = _select_feasible_policyengine_calibration_constraints( - supported_targets, - constraints, - household_count=len(tables.households), - max_constraints=self.config.policyengine_calibration_max_constraints, - max_constraints_per_household=( - self.config.policyengine_calibration_max_constraints_per_household - ), - min_active_households=( - 
self.config.policyengine_calibration_min_active_households - ), - ) - return ( - tables, - bindings, - canonical_targets, - compiled_targets, - unsupported_targets, - compiled_constraints, - supported_targets, - constraints, - feasibility_filter_summary, - materialized_variables, - materialization_failures, - fixed_spine_residualization_summary, - ) - - def _has_policyengine_calibration_targets(self) -> bool: - if self.config.calibration_target_source == "arch": - return self.config.arch_targets_db is not None - return self.config.policyengine_targets_db is not None - - def _resolve_calibration_target_provider(self): - if self.config.calibration_target_source == "arch": - if self.config.arch_targets_db is None: - raise ValueError( - "arch_targets_db is required when calibration_target_source='arch'" - ) - return ( - resolve_arch_sqlite_target_provider(self.config.arch_targets_db), - "arch", - ) - if self.config.policyengine_targets_db is None: - raise ValueError( - "policyengine_targets_db is required for PolicyEngine DB calibration" - ) - return ( - PolicyEngineUSDBTargetProvider(self.config.policyengine_targets_db), - "policyengine", - ) - - def _load_policyengine_target_set( - self, - provider: Any, - *, - bindings: dict[str, PolicyEngineUSVariableBinding], - period: int, - for_calibration: bool, - ): - return provider.load_target_set( - self._build_policyengine_target_query( - bindings, - period=period, - for_calibration=for_calibration, - ) - ) - - def _policyengine_target_scope( - self, - *, - for_calibration: bool, - ) -> tuple[tuple[str, ...], tuple[str, ...], tuple[str, ...]]: - variables = ( - self.config.policyengine_calibration_target_variables - if for_calibration and self.config.policyengine_calibration_target_variables - else self.config.policyengine_target_variables - ) - domain_variables = ( - self.config.policyengine_calibration_target_domains - if for_calibration and self.config.policyengine_calibration_target_domains - else 
self.config.policyengine_target_domains - ) - geo_levels = ( - self.config.policyengine_calibration_target_geo_levels - if for_calibration - and self.config.policyengine_calibration_target_geo_levels - else self.config.policyengine_target_geo_levels - ) - return variables, domain_variables, geo_levels - - def _policyengine_target_profile( - self, - *, - for_calibration: bool, - ) -> str | None: - return ( - self.config.policyengine_calibration_target_profile - if for_calibration and self.config.policyengine_calibration_target_profile - else self.config.policyengine_target_profile - ) - - def _policyengine_target_cells( - self, - *, - for_calibration: bool, - ) -> tuple[PolicyEngineUSTargetCell, ...]: - profile_name = self._policyengine_target_profile( - for_calibration=for_calibration - ) - if profile_name is None: - return () - return resolve_policyengine_us_target_profile(profile_name) - - def _policyengine_calibration_scope_includes_ssi(self) -> bool: - variables, domain_variables, _geo_levels = self._policyengine_target_scope( - for_calibration=True - ) - if "ssi" in variables: - return True - if any("ssi" in str(domain).split(",") for domain in domain_variables): - return True - for cell in self._policyengine_target_cells(for_calibration=True): - cell_domains = tuple( - item.strip() - for item in str(cell.domain_variable or "").split(",") - if item.strip() - ) - if cell.variable == "ssi" or "ssi" in cell_domains: - return True - return False - - def _calibrate_policyengine_ssi_takeup_from_reported_amounts( - self, - tables: PolicyEngineUSEntityTableBundle, - *, - target_period: int, - ) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, Any]]: - if not self.config.policyengine_calibrate_ssi_takeup: - return tables, {"enabled": False, "reason": "disabled_by_config"} - if not self._policyengine_calibration_scope_includes_ssi(): - return tables, {"enabled": False, "reason": "target_scope_excludes_ssi"} - if tables.persons is None or tables.persons.empty: - 
return tables, {"enabled": False, "reason": "missing_person_table"} - persons = tables.persons.copy() - required_columns = {"person_id", "age", "weight", "ssi"} - missing_columns = sorted(required_columns - set(persons.columns)) - if missing_columns: - return tables, { - "enabled": False, - "reason": "missing_required_columns", - "missing_columns": missing_columns, - } - reported_ssi = ( - pd.to_numeric(persons["ssi"], errors="coerce").fillna(0.0).clip(lower=0.0) - ) - if not reported_ssi.gt(0.0).any(): - persons["takes_up_ssi_if_eligible"] = False - return ( - PolicyEngineUSEntityTableBundle( - households=tables.households, - persons=persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - families=tables.families, - marital_units=tables.marital_units, - ), - { - "enabled": True, - "method": "reported_ssi_amount_by_age_group", - "reason": "no_reported_positive_ssi", - "selected_recipients": 0.0, - "selected_amount": 0.0, - }, - ) - - full_takeup_persons = persons.copy() - full_takeup_persons["takes_up_ssi_if_eligible"] = True - full_takeup_tables = PolicyEngineUSEntityTableBundle( - households=tables.households, - persons=full_takeup_persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - families=tables.families, - marital_units=tables.marital_units, - ) - materialization_result = materialize_policyengine_us_variables_safely( - full_takeup_tables, - variables=("ssi",), - period=target_period, - dataset_year=self.config.policyengine_dataset_year or target_period, - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=self.config.policyengine_direct_override_variables, - batch_size=self.config.policyengine_materialize_batch_size, - ) - materialized_persons = materialization_result.tables.persons - if ( - materialized_persons is None - or "ssi" not in materialized_persons.columns - or "ssi" not in materialization_result.bindings - ): - return tables, { - "enabled": False, - "reason": 
"full_takeup_ssi_materialization_failed", - "materialization_failures": dict( - materialization_result.failed_variables - ), - } - - selected, selection_summary = _select_ssi_takeup_by_age_amount( - person_ids=persons["person_id"], - ages=persons["age"], - weights=persons["weight"], - reported_ssi=reported_ssi, - full_takeup_ssi=materialized_persons["ssi"], - ) - persons["takes_up_ssi_if_eligible"] = selected - updated_tables = PolicyEngineUSEntityTableBundle( - households=tables.households, - persons=persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - families=tables.families, - marital_units=tables.marital_units, - ) - return updated_tables, selection_summary - - def _build_policyengine_target_query( - self, - bindings: dict[str, PolicyEngineUSVariableBinding], - *, - period: int, - for_calibration: bool = False, - ) -> TargetQuery: - variables, domain_variables, geo_levels = self._policyengine_target_scope( - for_calibration=for_calibration - ) - profile_name = self._policyengine_target_profile( - for_calibration=for_calibration - ) - target_cells = self._policyengine_target_cells(for_calibration=for_calibration) - return TargetQuery( - period=period, - provider_filters={ - "variables": list(variables) if variables else None, - "domain_variables": ( - list(domain_variables) if domain_variables else None - ), - "geo_levels": list(geo_levels) if geo_levels else None, - "target_profile": profile_name, - "target_cells": ( - [cell.to_provider_filter() for cell in target_cells] - if target_cells - else None - ), - "reform_id": self.config.policyengine_target_reform_id, - "entity_overrides": { - variable: binding.entity for variable, binding in bindings.items() - }, - }, - ) - - def build_policyengine_entity_tables( - self, - population: pd.DataFrame, - ) -> PolicyEngineUSEntityTableBundle: - """Build a PolicyEngine-oriented multientity bundle from person rows.""" - persons = population.copy().reset_index(drop=True) - if "person_id" not in 
persons.columns: - persons["person_id"] = np.arange(len(persons), dtype=np.int64) - if "household_id" not in persons.columns: - persons["household_id"] = np.arange(len(persons), dtype=np.int64) - if "weight" not in persons.columns: - persons["weight"] = 1.0 - if "income" not in persons.columns: - persons["income"] = 0.0 - if "age" not in persons.columns: - persons["age"] = 0 - - persons["person_id"] = persons["person_id"].astype(np.int64) - persons["household_id"] = persons["household_id"].astype(np.int64) - persons["weight"] = pd.to_numeric(persons["weight"], errors="coerce").fillna( - 0.0 - ) - persons["income"] = pd.to_numeric(persons["income"], errors="coerce").fillna( - 0.0 - ) - persons["age"] = ( - pd.to_numeric(persons["age"], errors="coerce").fillna(0).astype(int) - ) - household_ids = persons["household_id"] - for column, threshold in (("count_under_18", 18), ("count_under_6", 6)): - under_threshold = persons["age"].lt(threshold).astype(np.int64) - persons[column] = under_threshold.groupby( - household_ids, sort=False - ).transform("sum") - persons = self._augment_policyengine_person_inputs(persons) - persons["relationship_to_head"] = self._normalize_relationship_to_head(persons) - persons = self._assign_policyengine_household_head_flag(persons) - persons = self._attach_policyengine_person_takeup_inputs(persons) - - households = self._build_policyengine_households(persons) - tax_units, persons = self._build_policyengine_tax_units(persons) - tax_units = self._attach_policyengine_tax_unit_source_inputs(tax_units) - tax_units = self._attach_policyengine_tax_unit_takeup_inputs(tax_units) - persons = self._construct_aotc_eligibility_inputs(persons) - persons = self._assign_family_and_spm_units(persons) - persons = self._attach_policyengine_wic_inputs(persons) - families = self._collapse_group_table(persons, "family_id") - spm_units = self._collapse_group_table(persons, "spm_unit_id") - spm_units = self._attach_spm_unit_source_columns(persons, spm_units) - if 
"tenure_type" in persons.columns: - spm_tenure = ( - persons.groupby("spm_unit_id", as_index=False)["tenure_type"] - .first() - .rename(columns={"tenure_type": "spm_unit_tenure_type"}) - ) - spm_units = spm_units.merge(spm_tenure, on="spm_unit_id", how="left") - persons = self._assign_marital_units(persons) - marital_units = self._collapse_group_table(persons, "marital_unit_id") - - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=tax_units, - spm_units=spm_units, - families=families, - marital_units=marital_units, - ) - return tables - - # AOTC eligibility-input columns populated by - # ``_construct_aotc_eligibility_inputs``, matching the per-student inputs - # written by the enhanced-CPS baseline ``_impute_aotc_eligibility_inputs`` - # (PolicyEngine/policyengine-us-data, unmerged branch - # ``codex/fix-aotc-eligibility``). - _AOTC_TRUE_FLAG_COLUMNS = ( - "is_pursuing_credential_for_american_opportunity_credit", - "attends_eligible_educational_institution_for_american_opportunity_credit", - "is_enrolled_at_least_half_time_for_american_opportunity_credit", - "has_american_opportunity_credit_1098_t_or_exception", - "has_american_opportunity_credit_institution_ein", - ) - _AOTC_FALSE_FLAG_COLUMNS = ( - "has_completed_first_four_years_of_postsecondary_education", - "has_felony_drug_conviction", - ) - _AOTC_PRIOR_YEARS_COLUMN = "american_opportunity_credit_claimed_prior_years" - - def _construct_aotc_eligibility_inputs( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - """Convert the PUF AOTC signal into person eligibility inputs. - - Mirrors the enhanced-CPS baseline - ``ExtendedCPS._impute_aotc_eligibility_inputs`` - (``PolicyEngine/policyengine-us-data``, unmerged branch - ``codex/fix-aotc-eligibility``). 
- - The enhanced CPS operates on a flat ``{variable: {period: array}}`` - payload keyed by ``person_tax_unit_id``; Microplex carries the same - signals (``american_opportunity_credit``, - ``qualified_tuition_expenses``, ``is_full_time_college_student``, - ``is_tax_unit_dependent``) as columns on the person table keyed by - ``tax_unit_id`` once ``_build_policyengine_tax_units`` has assigned - authoritative tax units, so the per-tax-unit back-solve is the same - algorithm applied to a single DataFrame. - - Driven by the PUF-imputed ``american_opportunity_credit`` (PUF - ``E87521``; see ``data_sources/puf.py`` / ``manifests/puf.json``). For - each tax unit with positive credit the enhanced-CPS rule applies: if - any member already reports positive qualified tuition, every such - member is marked an AOTC student and the reported tuition is left - unchanged; otherwise a single student is selected by priority - (full-time college student -> tax-unit dependent -> any member) and - that student's qualified tuition is back-solved to the minimum amount - reproducing the unit's credit under PolicyEngine-US. With no credit - signal it falls back to the enhanced-CPS - ``aotc_student = qualified_tuition_expenses > 0`` rule. The selected - students receive the five factual eligibility flags as ``True``, - ``has_completed_first_four_years_of_postsecondary_education`` and - ``has_felony_drug_conviction`` as ``False`` (constants the enhanced - CPS also hard-codes), and - ``american_opportunity_credit_claimed_prior_years`` clamped to a - maximum of 3. ``american_opportunity_credit`` is a PUF - calculated-tax output (see ``microdata_roles.py``) and is not itself - exported; PolicyEngine-US recomputes the credit from these inputs. 
- """ - if persons is None or persons.empty: - return persons - if "tax_unit_id" not in persons.columns: - return persons - - result = persons.copy() - n = len(result) - time_period = int(self.config.policyengine_dataset_year or 2024) - - person_tax_unit_ids = result["tax_unit_id"].to_numpy() - tuition = ( - pd.to_numeric( - result["qualified_tuition_expenses"], - errors="coerce", - ) - .fillna(0.0) - .to_numpy(dtype=float, copy=True) - if "qualified_tuition_expenses" in result.columns - else np.zeros(n, dtype=float) - ) - if "qualified_tuition_expenses" not in result.columns: - # No tuition signal and no credit-derived tuition can be - # back-solved, so there is no student population to mark. - credit_present = "american_opportunity_credit" in result.columns - if not credit_present: - return persons - - credit = ( - pd.to_numeric( - result["american_opportunity_credit"], - errors="coerce", - ) - .fillna(0.0) - .to_numpy(dtype=float) - if "american_opportunity_credit" in result.columns - else None - ) - full_time = ( - pd.to_numeric(result["is_full_time_college_student"], errors="coerce") - .fillna(0) - .astype(bool) - .to_numpy() - if "is_full_time_college_student" in result.columns - else np.zeros(n, dtype=bool) - ) - dependent = ( - pd.to_numeric(result["is_tax_unit_dependent"], errors="coerce") - .fillna(0) - .astype(bool) - .to_numpy() - if "is_tax_unit_dependent" in result.columns - else np.zeros(n, dtype=bool) - ) - - aotc_student = np.zeros(n, dtype=bool) - - if credit is not None: - positive_credit = credit > 0 - if not positive_credit.any(): - # No positive credit anywhere: nothing to construct. The - # enhanced CPS returns early here without writing inputs. - return persons - - # ``american_opportunity_credit`` rides on the person table as the - # per-tax-unit value repeated across members; collapse to one - # value per tax unit (the maximum guards against any per-member - # zero-fill on non-filer rows). 
- credit_by_tax_unit: dict[Any, float] = {} - for tax_unit_id, member_credit in zip(person_tax_unit_ids, credit): - prior = credit_by_tax_unit.get(tax_unit_id, 0.0) - if member_credit > prior: - credit_by_tax_unit[tax_unit_id] = float(member_credit) - - positive_credit_units = [ - tax_unit_id - for tax_unit_id, unit_credit in credit_by_tax_unit.items() - if unit_credit > 0 - ] - for tax_unit_id in positive_credit_units: - member_indices = np.flatnonzero(person_tax_unit_ids == tax_unit_id) - if member_indices.size == 0: - continue - - # eCPS rule: if any member already reports positive qualified - # tuition, every such member is an AOTC student and the reported - # tuition is left untouched (no back-solve, no rewrite). - tuition_indices = member_indices[tuition[member_indices] > 0] - if tuition_indices.size > 0: - aotc_student[tuition_indices] = True - continue - - # Otherwise select a single student by the eCPS priority - # (full-time college student -> tax-unit dependent -> any - # member) and back-solve the minimum qualified tuition that - # reproduces the unit's credit under PolicyEngine-US. - preferred = member_indices[full_time[member_indices]] - if preferred.size == 0: - preferred = member_indices[dependent[member_indices]] - if preferred.size == 0: - preferred = member_indices - selected = preferred[0] - aotc_student[selected] = True - tuition[selected] = max( - tuition[selected], - qualifying_expenses_from_american_opportunity_credit( - credit_by_tax_unit[tax_unit_id], - time_period, - ), - ) - else: - aotc_student = tuition > 0 - if not aotc_student.any(): - return persons - - # Five factual eligibility flags -> True for selected students. - for column in self._AOTC_TRUE_FLAG_COLUMNS: - values = ( - result[column].fillna(False).astype(bool).to_numpy().copy() - if column in result.columns - else np.zeros(n, dtype=bool) - ) - values[aotc_student] = True - result[column] = values - - # has_completed_first_four_years / has_felony_drug_conviction -> False. 
- for column in self._AOTC_FALSE_FLAG_COLUMNS: - values = ( - result[column].fillna(False).astype(bool).to_numpy().copy() - if column in result.columns - else np.zeros(n, dtype=bool) - ) - values[aotc_student] = False - result[column] = values - - # Prior-year claims clamped to the 4-year (max 3 prior) AOTC limit. - prior_years = ( - pd.to_numeric(result[self._AOTC_PRIOR_YEARS_COLUMN], errors="coerce") - .fillna(0) - .astype(np.int64) - .to_numpy() - .copy() - if self._AOTC_PRIOR_YEARS_COLUMN in result.columns - else np.zeros(n, dtype=np.int64) - ) - prior_years[aotc_student] = np.minimum(prior_years[aotc_student], 3) - result[self._AOTC_PRIOR_YEARS_COLUMN] = prior_years - - # Write the back-solved per-student tuition the credit implies, so the - # exported ``qualified_tuition_expenses`` reproduces the PUF credit - # under PolicyEngine-US (enhanced CPS does the same). - if "qualified_tuition_expenses" in result.columns: - result["qualified_tuition_expenses"] = tuition - - return result - - def export_policyengine_dataset( - self, - result: USMicroplexBuildResult, - path: str | Path, - *, - period: int | None = None, - direct_override_variables: tuple[str, ...] 
| None = None, - ) -> Path: - """Export a build result as a PolicyEngine-readable HDF5 dataset.""" - export_period = ( - period - or self.config.policyengine_dataset_year - or result.config.policyengine_dataset_year - or 2024 - ) - export_direct_override_variables = ( - direct_override_variables - if direct_override_variables is not None - else ( - self.config.policyengine_direct_override_variables - or result.config.policyengine_direct_override_variables - ) - ) - tables = result.policyengine_tables or self.build_policyengine_entity_tables( - result.calibrated_data - ) - tables = self._normalize_policyengine_tables_for_export(tables) - tables = self._attach_policyengine_marketplace_plan_benchmark_ratio( - tables, - target_period=export_period, - ) - tax_benefit_system = self._resolve_policyengine_tax_benefit_system() - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=tax_benefit_system, - direct_override_variables=export_direct_override_variables, - ) - excluded_variables = resolve_policyengine_excluded_export_variables( - tax_benefit_system, - sorted( - { - target - for variable_map in export_maps.values() - for target in variable_map.values() - } - ), - direct_override_variables=export_direct_override_variables, - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=export_period, - household_variable_map=export_maps["household"], - person_variable_map=export_maps["person"], - tax_unit_variable_map=export_maps["tax_unit"], - spm_unit_variable_map=export_maps["spm_unit"], - family_variable_map=export_maps["family"], - ) - return write_policyengine_us_time_period_dataset( - arrays, - path, - excluded_variables=excluded_variables, - tax_benefit_system=tax_benefit_system, - ) - - def _normalize_policyengine_tables_for_export( - self, - tables: PolicyEngineUSEntityTableBundle, - ) -> PolicyEngineUSEntityTableBundle: - if tables.persons is None: - return tables - return replace( - tables, - 
persons=self._augment_policyengine_person_inputs(tables.persons), - ) - - def _fit_synthesizer( - self, - seed_data: pd.DataFrame, - synthesis_variables: USMicroplexSynthesisVariables, - ) -> Synthesizer: - """Fit a microplex synthesizer on the seed data.""" - condition_vars = list(synthesis_variables.condition_vars) - target_vars = list(synthesis_variables.target_vars) - if not target_vars: - raise ValueError( - "USMicroplexPipeline requires at least one observed target variable" - ) - - synthesizer = Synthesizer( - target_vars=target_vars, - condition_vars=condition_vars, - n_layers=self.config.synthesizer_n_layers, - hidden_dim=self.config.synthesizer_hidden_dim, - ) - synthesizer.fit( - seed_data[condition_vars + target_vars + ["hh_weight"]].rename( - columns={"hh_weight": "weight"} - ), - weight_col="weight", - epochs=self.config.synthesizer_epochs, - batch_size=self.config.synthesizer_batch_size, - learning_rate=self.config.synthesizer_learning_rate, - verbose=False, - ) - return synthesizer - - def _build_donor_imputer( - self, - *, - condition_vars: list[str], - target_vars: tuple[str, ...], - ) -> Synthesizer | ColumnwiseQRFDonorImputer: - backend = self.config.donor_imputer_backend - if backend == "maf": - return Synthesizer( - target_vars=list(target_vars), - condition_vars=condition_vars, - n_layers=self.config.donor_imputer_n_layers, - hidden_dim=self.config.donor_imputer_hidden_dim, - ) - - support_families = { - variable: variable_semantic_spec_for(variable).support_family - for variable in target_vars - } - nonnegative_vars = { - variable - for variable, support_family in support_families.items() - if support_family is VariableSupportFamily.BOUNDED_SHARE - } - if backend == "regime_aware": - return RegimeAwareDonorImputer( - condition_vars=condition_vars, - target_vars=list(target_vars), - n_estimators=self.config.donor_imputer_qrf_n_estimators, - max_train_samples=self.config.donor_imputer_qrf_max_train_samples, - seed=self.config.random_seed, - ) 
- zero_inflated_vars = ( - { - variable - for variable, support_family in support_families.items() - if support_family - in { - VariableSupportFamily.SUPPORT_SENSITIVE, - } - } - if backend == "zi_qrf" - else set() - ) - return ColumnwiseQRFDonorImputer( - condition_vars=condition_vars, - target_vars=list(target_vars), - n_estimators=self.config.donor_imputer_qrf_n_estimators, - zero_inflated_vars=zero_inflated_vars, - nonnegative_vars=nonnegative_vars, - zero_threshold=self.config.donor_imputer_qrf_zero_threshold, - ) - - def _resolve_synthesis_variables( - self, - source_input: USMicroplexSourceInput, - *, - fusion_plan: FusionPlan | None = None, - include_all_observed_targets: bool = False, - available_columns: set[str] | None = None, - observed_frame: pd.DataFrame | None = None, - ) -> USMicroplexSynthesisVariables: - """Select the observed variables to feed into synthesis.""" - active_plan = fusion_plan or source_input.fusion_plan - available_variables = prune_redundant_variables( - active_plan.variables_for(EntityType.HOUSEHOLD) - | active_plan.variables_for(EntityType.PERSON) - ) - if available_columns is not None: - available_variables = available_variables & available_columns - condition_vars = self._resolve_synthesis_condition_vars( - available_variables, - observed_frame=observed_frame, - ) - configured_targets = [ - variable - for variable in self.config.synthesizer_target_vars - if variable in available_variables and variable not in condition_vars - ] - configured_targets.extend( - variable - for variable in STATE_PROGRAM_SUPPORT_PROXY_VARIABLES - if variable in available_variables - and variable not in condition_vars - and variable not in configured_targets - ) - extra_targets: list[str] = [] - if include_all_observed_targets: - excluded = { - "person_id", - "household_id", - "hh_weight", - "weight", - "state", - "age_group", - "income_bracket", - } - extra_targets = sorted( - variable - for variable in available_variables - if variable not in 
excluded - and variable not in condition_vars - and variable not in configured_targets - ) - return USMicroplexSynthesisVariables( - condition_vars=condition_vars, - target_vars=tuple(configured_targets + extra_targets), - ) - - def _resolve_synthesis_condition_vars( - self, - available_columns: Iterable[str], - *, - observed_frame: pd.DataFrame | None = None, - ) -> tuple[str, ...]: - available = set(available_columns) - ordered = list(self.config.synthesizer_condition_vars) - for variable in STATE_PROGRAM_AUTO_CONDITION_VARIABLES: - if ( - variable in available - and variable not in ordered - and ( - observed_frame is None - or self._is_informative_state_program_proxy( - observed_frame, - variable, - ) - ) - ): - ordered.append(variable) - return tuple(variable for variable in ordered if variable in available) - - def _is_informative_state_program_proxy( - self, - frame: pd.DataFrame, - variable: str, - ) -> bool: - if variable not in frame.columns: - return False - series = pd.to_numeric(frame[variable], errors="coerce").replace( - [np.inf, -np.inf], - np.nan, - ) - series = series.dropna() - if series.empty: - return False - return bool(series.nunique(dropna=True) > 1) - - def _select_scaffold_source( - self, - source_inputs: list[USMicroplexSourceInput], - ) -> USMicroplexSourceInput: - candidates = [ - source - for source in source_inputs - if source.household_observation is not None - and source.household_person_relationship is not None - ] - if not candidates: - raise ValueError( - "USMicroplexPipeline requires at least one structured source with household and person observations" - ) - - def score(source: USMicroplexSourceInput) -> tuple[int, int, int, int]: - public_score = int(source.frame.source.shareability == Shareability.PUBLIC) - geography_score = self._household_geography_coverage(source) - observed_variables = source.fusion_plan.variables_for( - EntityType.HOUSEHOLD - ) | source.fusion_plan.variables_for(EntityType.PERSON) - support_proxy_score = 
sum( - variable in observed_variables - for variable in STATE_PROGRAM_SUPPORT_PROXY_VARIABLES - ) - observed_vars = len(observed_variables) - household_rows = ( - len(source.households) if source.households is not None else 0 - ) - return ( - public_score, - geography_score, - support_proxy_score, - observed_vars, - household_rows, - ) - - if self.config.puf_support_clone_enabled: - cps_candidates = [ - source - for source in candidates - if self._is_cps_asec_scaffold_source(source.frame.source.name) - ] - if cps_candidates: - return max(cps_candidates, key=score) - - return max(candidates, key=score) - - def _household_geography_coverage( - self, - source: USMicroplexSourceInput, - ) -> int: - households = source.households - if households is None or "state_fips" not in households.columns: - return 0 - state_fips = pd.to_numeric(households["state_fips"], errors="coerce").fillna(0) - return int((state_fips > 0).sum()) - - def _is_puf_support_clone_source(self, source_name: str) -> bool: - return any( - source_name.startswith(prefix) - for prefix in self.config.puf_support_clone_source_prefixes - ) - - def _is_cps_asec_scaffold_source(self, source_name: str) -> bool: - return source_name.startswith(("cps", "cps_asec")) - - def _ordered_donor_inputs_for_puf_support_clone( - self, - *, - scaffold_input: USMicroplexSourceInput, - donor_inputs: list[USMicroplexSourceInput], - ) -> tuple[list[USMicroplexSourceInput], list[str], list[str]]: - """Return PUF-first donor inputs and clone source order for clone mode.""" - input_order = [donor.frame.source.name for donor in donor_inputs] - if not self.config.puf_support_clone_enabled: - return donor_inputs, input_order, [] - - scaffold_name = scaffold_input.frame.source.name - if self._is_puf_support_clone_source(scaffold_name): - raise ValueError( - "puf_support_clone_enabled requires the PUF source to be a donor, " - f"but selected scaffold source is {scaffold_name!r}" - ) - if not 
self._is_cps_asec_scaffold_source(scaffold_name): - raise ValueError( - "puf_support_clone_enabled requires a CPS/ASEC-shaped scaffold; " - f"selected scaffold source is {scaffold_name!r}" - ) - - puf_donors = [ - donor - for donor in donor_inputs - if self._is_puf_support_clone_source(donor.frame.source.name) - ] - if not puf_donors: - raise ValueError( - "puf_support_clone_enabled requires exactly one PUF donor source, " - "but none matched puf_support_clone_source_prefixes" - ) - if len(puf_donors) > 1: - raise ValueError( - "puf_support_clone_enabled requires an unambiguous PUF donor source; " - "matched " + ", ".join(donor.frame.source.name for donor in puf_donors) - ) - - non_puf_donors = [ - donor - for donor in donor_inputs - if not self._is_puf_support_clone_source(donor.frame.source.name) - ] - ordered = [*puf_donors, *non_puf_donors] - return ( - ordered, - [donor.frame.source.name for donor in ordered], - [donor.frame.source.name for donor in puf_donors], - ) - - def _prepare_puf_support_clone_frame(self, original: pd.DataFrame) -> pd.DataFrame: - """Create a zero-stored-weight PUF clone frame from CPS support rows.""" - clone = original.copy() - clone[PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN] = np.arange( - len(clone), - dtype=np.int64, - ) - structural_id_columns = {"person_id", *ENTITY_ID_COLUMNS.values()} - for column in sorted(structural_id_columns & set(clone.columns)): - series = clone[column] - if pd.api.types.is_numeric_dtype(series): - numeric = pd.to_numeric(series, errors="coerce") - finite = numeric[np.isfinite(numeric)] - offset = int(finite.max()) + 1 if not finite.empty else len(clone) - clone[column] = numeric.fillna(-1).astype(np.int64) + int(offset) - else: - clone[column] = series.astype(str) + "__puf_clone" - if self.config.puf_support_clone_zero_initial_weight: - for column in clone.columns: - if column == "weight" or "_weight" in column: - clone[column] = 0.0 - clone[self.config.puf_support_clone_flag_column] = 1.0 - return clone - 
- def _refresh_puf_support_clone_cps_only_fields( - self, - *, - original: pd.DataFrame, - clone: pd.DataFrame, - integrated_variables: Iterable[str], - preclone_columns: set[str], - ) -> tuple[pd.DataFrame, dict[str, Any]]: - """Refresh copied CPS-only clone fields after PUF income is grafted on. - - PUF support clones start as literal CPS copies, then receive PUF tax and - income fields. Any remaining copied CPS-only fields can become - incoherent with the clone's new income surface. Re-match those fields - from CPS donors using demographic predictors plus PUF-imputed income. - """ - summary: dict[str, Any] = { - "enabled": bool(self.config.puf_support_clone_refresh_cps_only_fields), - "condition_variables": [], - "refreshed_variables": [], - "social_security_reconciled_variables": [], - "matched_source_row_count": 0, - } - if not self.config.puf_support_clone_refresh_cps_only_fields: - return clone, summary - if original.empty or clone.empty: - return clone, summary - - integrated_set = set(integrated_variables) - condition_vars = [ - variable - for variable in self.config.puf_support_clone_cps_refresh_condition_variables - if variable in original.columns - and variable in clone.columns - and pd.api.types.is_numeric_dtype(original[variable]) - and pd.api.types.is_numeric_dtype(clone[variable]) - and self._is_compatible_donor_condition( - clone[variable], - original[variable], - ) - ] - if not condition_vars: - return clone, summary - - refresh_variables = [ - variable - for variable in self.config.puf_support_clone_cps_refresh_variables - if variable in preclone_columns - and variable not in integrated_set - and variable in original.columns - and variable in clone.columns - ] - if not refresh_variables: - return clone, summary - - train = original.loc[:, condition_vars].apply( - lambda series: pd.to_numeric(series, errors="coerce").fillna(0.0) - ) - test = clone.loc[:, condition_vars].apply( - lambda series: pd.to_numeric(series, errors="coerce").fillna(0.0) - ) 
- for variable in ( - set(condition_vars) & PUF_SUPPORT_CLONE_CPS_REFRESH_INCOME_VARIABLES - ): - train[variable] = np.arcsinh(train[variable]) - test[variable] = np.arcsinh(test[variable]) - scale = train.std(ddof=0).replace(0.0, 1.0) - center = train.mean() - train_values = ((train - center) / scale).to_numpy(dtype=float) - test_values = ((test - center) / scale).to_numpy(dtype=float) - - from sklearn.neighbors import NearestNeighbors - - matcher = NearestNeighbors(n_neighbors=1) - matcher.fit(train_values) - matched = matcher.kneighbors(test_values, return_distance=False).reshape(-1) - - refreshed = clone.copy() - for variable in refresh_variables: - refreshed[variable] = original[variable].to_numpy(copy=True)[matched] - - reconciled_variables = self._reconcile_puf_support_clone_social_security( - refreshed - ) - summary["condition_variables"] = condition_vars - summary["refreshed_variables"] = refresh_variables - summary["social_security_reconciled_variables"] = reconciled_variables - summary["matched_source_row_count"] = int(np.unique(matched).size) - return refreshed, summary - - def _reconcile_puf_support_clone_social_security( - self, - clone: pd.DataFrame, - ) -> list[str]: - """Scale cloned Social Security components to the PUF-imputed total.""" - if "social_security" not in clone.columns: - return [] - subcomponents = [ - variable - for variable in ( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - ) - if variable in clone.columns - ] - if not subcomponents: - return [] - - total = pd.to_numeric(clone["social_security"], errors="coerce").fillna(0.0) - sub_values = { - variable: pd.to_numeric(clone[variable], errors="coerce").fillna(0.0) - for variable in subcomponents - } - sub_sum = sum(sub_values.values()) - positive_total = total.gt(0.0) - positive_sub_sum = sub_sum.gt(0.0) - scale_mask = positive_total & positive_sub_sum - zero_mask = ~positive_total - - for variable, 
values in sub_values.items(): - adjusted = values.copy() - adjusted.loc[zero_mask] = 0.0 - adjusted.loc[scale_mask] = ( - values.loc[scale_mask] * total.loc[scale_mask] / sub_sum.loc[scale_mask] - ) - clone[variable] = adjusted - - fallback_mask = positive_total & ~positive_sub_sum - if fallback_mask.any(): - age = pd.to_numeric(clone.get("age", 0.0), errors="coerce").fillna(0.0) - if "social_security_retirement" in subcomponents: - clone.loc[ - fallback_mask & age.ge(62), - "social_security_retirement", - ] = total.loc[fallback_mask & age.ge(62)] - if "social_security_disability" in subcomponents: - clone.loc[ - fallback_mask & age.lt(62), - "social_security_disability", - ] = total.loc[fallback_mask & age.lt(62)] - return subcomponents - - def _preserve_cps_measured_puf_clone_totals( - self, - *, - original: pd.DataFrame, - clone: pd.DataFrame, - integrated_variables: Iterable[str], - preclone_columns: set[str], - ) -> tuple[pd.DataFrame, dict[str, Any]]: - """Optionally anchor copied PUF leaves, then keep leaf identities intact.""" - integrated_set = set(integrated_variables) - result = clone.copy() - passthrough_variables: list[str] = [] - dividend_scaled = False - scaling_enabled = bool( - self.config.puf_support_clone_scale_tax_details_to_cps_totals - ) - - if not scaling_enabled: - identity_variables = ( - self._reconcile_puf_support_clone_tax_detail_identities( - original=original, - clone=result, - integrated_variables=integrated_set, - ) - ) - return result, { - "enabled": False, - "passthrough_variables": [], - "dividend_components_scaled_to_cps_total": False, - "identity_reconciled_variables": identity_variables, - } - - for target, aliases in PUF_SUPPORT_CLONE_CPS_DIRECT_PASSTHROUGH_ALIASES.items(): - if target not in result.columns or target not in integrated_set: - continue - source = next( - (alias for alias in aliases if alias in original.columns), None - ) - if source is None: - continue - result[target] = ( - pd.to_numeric(original[source], 
errors="coerce") - .fillna(0.0) - .to_numpy(copy=True) - ) - passthrough_variables.append(target) - - for ( - components, - total_alias, - fallback_first_share, - ) in PUF_SUPPORT_CLONE_CPS_SPLIT_TOTALS: - if total_alias not in original.columns: - continue - if not all(column in result.columns for column in components): - continue - if not any(column in integrated_set for column in components): - continue - cps_total = ( - pd.to_numeric(original[total_alias], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - first_component = ( - pd.to_numeric(result[components[0]], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - second_component = ( - pd.to_numeric(result[components[1]], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - component_total = first_component + second_component - first_share = pd.Series(fallback_first_share, index=result.index) - positive_component_total = component_total.gt(0.0) - first_share.loc[positive_component_total] = ( - first_component.loc[positive_component_total] - / component_total.loc[positive_component_total] - ).clip(lower=0.0, upper=1.0) - result[components[0]] = (cps_total * first_share).to_numpy(copy=True) - result[components[1]] = (cps_total * (1.0 - first_share)).to_numpy( - copy=True - ) - if total_alias == "pension_income": - if "taxable_private_pension_income" in result.columns: - result["taxable_private_pension_income"] = result[ - components[0] - ].to_numpy(copy=True) - if "tax_exempt_private_pension_income" in result.columns: - result["tax_exempt_private_pension_income"] = result[ - components[1] - ].to_numpy(copy=True) - passthrough_variables.extend(components) - - dividend_components = ( - "qualified_dividend_income", - "non_qualified_dividend_income", - ) - if ( - PUF_SUPPORT_CLONE_CPS_DIVIDEND_TOTAL_ALIAS in original.columns - and all(column in result.columns for column in dividend_components) - and any(column in integrated_set for column in 
dividend_components) - ): - cps_total = ( - pd.to_numeric( - original[PUF_SUPPORT_CLONE_CPS_DIVIDEND_TOTAL_ALIAS], - errors="coerce", - ) - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - qualified = ( - pd.to_numeric(result["qualified_dividend_income"], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - non_qualified = ( - pd.to_numeric(result["non_qualified_dividend_income"], errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - component_total = qualified + non_qualified - share = pd.Series(UNSPLIT_DIVIDEND_QUALIFIED_SHARE, index=result.index) - positive_component_total = component_total.gt(0.0) - share.loc[positive_component_total] = ( - qualified.loc[positive_component_total] - / component_total.loc[positive_component_total] - ).clip(lower=0.0, upper=1.0) - result["qualified_dividend_income"] = (cps_total * share).to_numpy( - copy=True - ) - result["non_qualified_dividend_income"] = ( - cps_total * (1.0 - share) - ).to_numpy(copy=True) - if "ordinary_dividend_income" in result.columns: - result["ordinary_dividend_income"] = cps_total.to_numpy(copy=True) - if "dividend_income" in result.columns: - result["dividend_income"] = cps_total.to_numpy(copy=True) - dividend_scaled = True - passthrough_variables.extend( - [ - "qualified_dividend_income", - "non_qualified_dividend_income", - ] - ) - - identity_variables = self._reconcile_puf_support_clone_tax_detail_identities( - original=original, - clone=result, - integrated_variables=integrated_set, - ) - return result, { - "enabled": True, - "passthrough_variables": sorted(set(passthrough_variables)), - "dividend_components_scaled_to_cps_total": dividend_scaled, - "identity_reconciled_variables": identity_variables, - } - - def _reconcile_puf_support_clone_tax_detail_identities( - self, - *, - original: pd.DataFrame, - clone: pd.DataFrame, - integrated_variables: set[str], - ) -> list[str]: - """Keep PUF tax leaves and their parent totals consistent before collapse.""" 
- reconciled: list[str] = [] - - def numeric(column: str) -> pd.Series: - return pd.to_numeric(clone[column], errors="coerce").fillna(0.0) - - def assign_if_available(column: str, values: pd.Series) -> None: - if column not in clone.columns and column not in original.columns: - return - clone[column] = values.to_numpy(copy=True) - reconciled.append(column) - - interest_components = ("taxable_interest_income", "tax_exempt_interest_income") - if set(interest_components) & integrated_variables and all( - column in clone.columns for column in interest_components - ): - assign_if_available( - "interest_income", - numeric("taxable_interest_income") - + numeric("tax_exempt_interest_income"), - ) - - dividend_components = ( - "qualified_dividend_income", - "non_qualified_dividend_income", - ) - if set(dividend_components) & integrated_variables and all( - column in clone.columns for column in dividend_components - ): - dividend_total = numeric("qualified_dividend_income") + numeric( - "non_qualified_dividend_income" - ) - assign_if_available("ordinary_dividend_income", dividend_total) - assign_if_available("dividend_income", dividend_total) - - pension_components = ("taxable_pension_income", "tax_exempt_pension_income") - if set(pension_components) & integrated_variables and all( - column in clone.columns for column in pension_components - ): - taxable_pension = numeric("taxable_pension_income") - tax_exempt_pension = numeric("tax_exempt_pension_income") - assign_if_available("taxable_private_pension_income", taxable_pension) - assign_if_available("tax_exempt_private_pension_income", tax_exempt_pension) - assign_if_available("pension_income", taxable_pension + tax_exempt_pension) - - if "taxable_unemployment_compensation" in integrated_variables: - if "taxable_unemployment_compensation" in clone.columns: - assign_if_available( - "unemployment_compensation", - numeric("taxable_unemployment_compensation"), - ) - - return sorted(set(reconciled)) - - def 
_finalize_puf_support_clone_frame( - self, - *, - original: pd.DataFrame, - imputed_clone: pd.DataFrame, - donor_source_name: str, - integrated_variables: list[str], - preclone_columns: set[str], - donor_seed_columns: set[str], - donor_observed: set[str], - ) -> tuple[pd.DataFrame, dict[str, Any]]: - """Finalize the PUF donor surface against the CPS scaffold.""" - flag_column = self.config.puf_support_clone_flag_column - original = original.copy() - clone = imputed_clone.copy() - original[flag_column] = 0.0 - clone[flag_column] = 1.0 - - integrated_set = set(integrated_variables) - both_halves_override = ( - integrated_set - & set(self.config.puf_support_clone_both_halves_override_variables) - & preclone_columns - ) - for variable in sorted(both_halves_override): - if variable in original.columns and variable in clone.columns: - original[variable] = clone[variable].to_numpy(copy=True) - - clone, cps_refresh_summary = self._refresh_puf_support_clone_cps_only_fields( - original=original, - clone=clone, - integrated_variables=integrated_variables, - preclone_columns=preclone_columns, - ) - clone, cps_passthrough_summary = self._preserve_cps_measured_puf_clone_totals( - original=original, - clone=clone, - integrated_variables=integrated_variables, - preclone_columns=preclone_columns, - ) - - generated_entity_id_columns = sorted( - set(ENTITY_ID_COLUMNS.values()) & (set(clone.columns) - preclone_columns) - ) - if generated_entity_id_columns: - clone = clone.drop(columns=generated_entity_id_columns) - - overlap_variables = sorted(integrated_set & preclone_columns) - donor_only_variables = sorted(integrated_set - preclone_columns) - ecps_surface = ( - set(PUF_SUPPORT_CLONE_IMPUTED_VARIABLES) - | set(PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES) - | set(PUF_SUPPORT_CLONE_SPECIAL_VARIABLES) - ) - included_surface = sorted(ecps_surface & integrated_set) - excluded_surface: dict[str, str] = {} - for variable in sorted(ecps_surface - set(included_surface)): - if variable not in 
donor_observed and variable not in donor_seed_columns: - reason = "missing_puf_source_column" - elif variable in self.config.donor_imputer_excluded_variables: - reason = "excluded_by_config" - elif variable not in preclone_columns: - reason = "not_present_before_clone" - else: - reason = "not_selected_for_imputation" - excluded_surface[variable] = reason - - clone_weight_sum = 0.0 - for column in ("household_weight", "hh_weight", "weight"): - if column in clone.columns: - clone_weight_sum = float( - pd.to_numeric(clone[column], errors="coerce").fillna(0.0).sum() - ) - break - - output_mode = self.config.puf_support_clone_output_mode - collapse_copy_variables: list[str] = [] - source_row_alignment: dict[str, Any] = { - "enabled": False, - "column": PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, - } - if output_mode == "collapse_to_scaffold": - if PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN not in clone.columns: - raise ValueError( - "PUF support clone collapse requires " - f"{PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN}" - ) - source_row_id = pd.to_numeric( - clone[PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN], - errors="coerce", - ) - if source_row_id.isna().any(): - raise ValueError( - "PUF support clone source-row ids must be complete before collapse" - ) - source_row_index = source_row_id.astype(np.int64) - if source_row_index.duplicated().any(): - raise ValueError( - "PUF support clone source-row ids must be unique before collapse" - ) - expected_index = pd.Index(range(len(original)), dtype=np.int64) - if not set(source_row_index.to_numpy()).issubset(set(expected_index)): - raise ValueError( - "PUF support clone source-row ids are outside the CPS scaffold" - ) - aligned_clone = clone.assign( - **{PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: source_row_index} - ).set_index(PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, drop=False) - aligned_clone = aligned_clone.reindex(expected_index) - if aligned_clone.isna().all(axis=1).any(): - raise ValueError( - "PUF support clone source-row ids do not 
cover the CPS scaffold" - ) - source_row_alignment = { - "enabled": True, - "column": PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, - "row_count": int(len(aligned_clone)), - "clone_was_reordered": bool( - not source_row_index.reset_index(drop=True).equals( - pd.Series(range(len(source_row_index)), dtype=np.int64) - ) - ), - } - passthrough_override = set( - cps_passthrough_summary.get("passthrough_variables", ()) - ) - identity_override = set( - cps_passthrough_summary.get("identity_reconciled_variables", ()) - ) - donor_only_collapse_variables = (integrated_set - preclone_columns) - set( - PUF_SUPPORT_CLONE_DONOR_ONLY_COLLAPSE_EXCLUDED_VARIABLES - ) - irs_detail_override = ( - integrated_set - & set(self.config.puf_support_clone_collapse_irs_detail_variables) - & preclone_columns - ) - overlap_collapse_override = ( - integrated_set - & set(self.config.puf_support_clone_collapse_overlap_variables) - & preclone_columns - ) - collapse_candidates = ( - donor_only_collapse_variables - | both_halves_override - | passthrough_override - | identity_override - | irs_detail_override - | overlap_collapse_override - ) - set(generated_entity_id_columns) - for variable in sorted(collapse_candidates): - if variable in aligned_clone.columns: - original[variable] = aligned_clone[variable].to_numpy(copy=True) - collapse_copy_variables.append(variable) - combined = original.reset_index(drop=True) - emitted_clone_row_count = 0 - else: - if PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN in clone.columns: - clone = clone.drop(columns=[PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN]) - for column in sorted(set(clone.columns) - set(original.columns)): - original[column] = 0.0 - for column in sorted(set(original.columns) - set(clone.columns)): - clone[column] = original[column].to_numpy(copy=True) - original = original.loc[:, clone.columns] - combined = pd.concat([original, clone], ignore_index=True, sort=False) - combined = combined.reset_index(drop=True) - emitted_clone_row_count = int(len(clone)) - - summary 
= { - "enabled": True, - "donor_source_name": donor_source_name, - "output_mode": output_mode, - "original_row_count": int(len(original)), - "clone_row_count": int(len(clone)), - "emitted_clone_row_count": emitted_clone_row_count, - "final_row_count": int(len(combined)), - "clone_initial_weight_sum": clone_weight_sum, - "integrated_variable_count": int(len(integrated_set)), - "clone_overlap_variable_count": int(len(overlap_variables)), - "clone_donor_only_variable_count": int(len(donor_only_variables)), - "overlap_variables": overlap_variables, - "donor_only_variables": donor_only_variables, - "donor_only_collapse_excluded_variables": sorted( - (integrated_set - preclone_columns) - & set(PUF_SUPPORT_CLONE_DONOR_ONLY_COLLAPSE_EXCLUDED_VARIABLES) - ) - if output_mode == "collapse_to_scaffold" - else [], - "both_halves_override_variables": sorted(both_halves_override), - "irs_detail_collapse_override_variables": sorted(irs_detail_override) - if output_mode == "collapse_to_scaffold" - else [], - "overlap_collapse_override_variables": sorted(overlap_collapse_override) - if output_mode == "collapse_to_scaffold" - else [], - "collapse_copy_variables": collapse_copy_variables, - "cps_only_refresh": cps_refresh_summary, - "cps_measured_total_passthrough": cps_passthrough_summary, - "source_row_alignment": source_row_alignment, - "dropped_generated_entity_id_columns": generated_entity_id_columns, - "variable_surface": { - "ecps_imputed_variables": list(PUF_SUPPORT_CLONE_IMPUTED_VARIABLES), - "ecps_overridden_variables": list( - PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES - ), - "ecps_special_variables": list(PUF_SUPPORT_CLONE_SPECIAL_VARIABLES), - "included_variables": included_surface, - "excluded_variables": excluded_surface, - }, - } - return combined, summary - - def _integrate_donor_sources( - self, - seed_data: pd.DataFrame, - *, - scaffold_input: USMicroplexSourceInput, - donor_inputs: list[USMicroplexSourceInput], - ) -> dict[str, Any]: - current = seed_data.copy() - 
integrated_variables: list[str] = [] - conditioning_diagnostics: list[dict[str, Any]] = [] - donor_inputs, processed_donor_source_order, puf_clone_source_order = ( - self._ordered_donor_inputs_for_puf_support_clone( - scaffold_input=scaffold_input, - donor_inputs=donor_inputs, - ) - ) - puf_support_clone_summary: dict[str, Any] | None = None - scaffold_observed = prune_redundant_variables( - scaffold_input.fusion_plan.variables_for(EntityType.HOUSEHOLD) - | scaffold_input.fusion_plan.variables_for(EntityType.PERSON) - ) - excluded = { - "person_id", - "household_id", - "hh_weight", - "weight", - "household_weight", - "tax_unit_id", - "family_id", - "spm_unit_id", - "marital_unit_id", - "state", - "age_group", - "income_bracket", - "is_head", - "is_spouse", - "is_dependent", - PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, - } - rng = np.random.default_rng(self.config.random_seed) - _emit_us_pipeline_progress( - "US microplex donor integration: start", - donor_sources=len(donor_inputs), - seed_rows=len(current), - condition_selection=self.config.donor_imputer_condition_selection, - puf_support_clone_enabled=self.config.puf_support_clone_enabled, - ) - - for donor_input in donor_inputs: - donor_source_name = donor_input.frame.source.name - is_puf_support_clone_source = ( - self.config.puf_support_clone_enabled - and self._is_puf_support_clone_source(donor_source_name) - ) - source_original_current: pd.DataFrame | None = None - source_preclone_columns: set[str] = set(current.columns) - source_integrated_variables: list[str] = [] - if is_puf_support_clone_source: - source_original_current = current.copy() - current = self._prepare_puf_support_clone_frame(source_original_current) - _emit_us_pipeline_progress( - "US microplex donor integration: puf support clone prepared", - donor_source=donor_source_name, - original_rows=len(source_original_current), - clone_rows=len(current), - ) - _emit_us_pipeline_progress( - "US microplex donor integration: source start", - 
donor_source=donor_source_name, - current_rows=len(current), - ) - donor_seed = self.prepare_seed_data_from_source(donor_input) - donor_observed = prune_redundant_variables( - donor_input.fusion_plan.variables_for(EntityType.HOUSEHOLD) - | donor_input.fusion_plan.variables_for(EntityType.PERSON) - ) - numeric_current = { - column - for column in current.columns - if pd.api.types.is_numeric_dtype(current[column]) - } - numeric_donor = { - column - for column in donor_seed.columns - if pd.api.types.is_numeric_dtype(donor_seed[column]) - } - shared_vars = sorted( - variable - for variable in scaffold_observed & donor_observed - if variable not in excluded - and variable in current.columns - and variable in donor_seed.columns - and variable in numeric_current - and variable in numeric_donor - and scaffold_input.frame.source.allows_conditioning_on(variable) - and donor_input.frame.source.allows_conditioning_on(variable) - and self._is_compatible_donor_condition( - current[variable], - donor_seed[variable], - ) - ) - raw_shared_var_set = set(shared_vars) - donor_only_vars = sorted( - variable - for variable in donor_observed - scaffold_observed - if variable not in excluded - and variable not in self.config.donor_imputer_excluded_variables - and variable in donor_seed.columns - and variable in numeric_donor - and donor_input.frame.source.is_authoritative_for(variable) - and self._should_integrate_donor_variable(current, variable) - and self._is_compatible_donor_target(donor_seed[variable]) - ) - donor_override_vars = sorted( - variable - for variable in scaffold_observed & donor_observed - if variable not in excluded - and variable not in self.config.donor_imputer_excluded_variables - and variable - in self.config.donor_imputer_authoritative_override_variables - and variable in current.columns - and variable in donor_seed.columns - and variable in numeric_current - and variable in numeric_donor - and donor_input.frame.source.is_authoritative_for(variable) - and 
self._is_compatible_donor_target(donor_seed[variable]) - ) - if is_puf_support_clone_source: - puf_clone_overlap_vars = sorted( - variable - for variable in set(self.config.puf_support_clone_overlap_variables) - if variable not in excluded - and variable not in self.config.donor_imputer_excluded_variables - and variable in scaffold_observed - and variable in donor_observed - and variable in current.columns - and variable in donor_seed.columns - and variable in numeric_current - and variable in numeric_donor - and donor_input.frame.source.is_authoritative_for(variable) - and self._is_compatible_donor_target(donor_seed[variable]) - ) - donor_override_vars = sorted( - set(donor_override_vars) | set(puf_clone_overlap_vars) - ) - donor_target_vars = sorted(set(donor_only_vars) | set(donor_override_vars)) - if not shared_vars or not donor_target_vars: - if is_puf_support_clone_source: - raise ValueError( - "PUF support clone donor produced no imputation targets; " - f"shared_vars={len(shared_vars)}, " - f"donor_target_vars={len(donor_target_vars)}" - ) - _emit_us_pipeline_progress( - "US microplex donor integration: source skipped", - donor_source=donor_source_name, - donor_rows=len(donor_seed), - shared_vars=len(shared_vars), - donor_target_vars=len(donor_target_vars), - ) - continue - - donor_block_specs = donor_imputation_block_specs(donor_target_vars) - _emit_us_pipeline_progress( - "US microplex donor integration: source ready", - donor_source=donor_source_name, - donor_rows=len(donor_seed), - shared_vars=len(shared_vars), - donor_target_vars=len(donor_target_vars), - blocks=len(donor_block_specs), - ) - required_entities = { - donor_block_spec.native_entity - for donor_block_spec in donor_block_specs - if donor_block_spec.native_entity is not EntityType.PERSON - } - if required_entities: - _emit_us_pipeline_progress( - "US microplex donor integration: entity ids required", - donor_source=donor_source_name, - entities=_format_progress_values( - sorted(entity.value 
for entity in required_entities) - ), - current_rows=len(current), - donor_rows=len(donor_seed), - ) - current = self._ensure_seed_entity_ids( - current, - entities=required_entities, - frame_role="current", - donor_source_name=donor_source_name, - ) - donor_seed = self._ensure_seed_entity_ids( - donor_seed, - entities=required_entities, - frame_role="donor", - donor_source_name=donor_source_name, - ) - - for donor_block_spec in donor_block_specs: - block_label = _format_progress_values( - donor_block_spec.model_variables, - limit=4, - ) - _emit_us_pipeline_progress( - "US microplex donor integration: block start", - donor_source=donor_source_name, - block=block_label, - restored=_format_progress_values( - donor_block_spec.restored_variables, - limit=4, - ), - ) - prepared_inputs = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_block_inputs( - donor_seed=donor_seed, - current_frame=current, - shared_vars=shared_vars, - donor_block_spec=donor_block_spec, - donor_source_name=donor_source_name, - prepare_pe_surface=(self._uses_pe_condition_surface()), - can_project_to_entity=self._can_project_donor_block_to_entity, - project_frame_to_entity=self._project_frame_to_entity, - entity_key_fn=self._entity_key_column, - ) - shared_vars_for_block = list(prepared_inputs.shared_vars_for_block) - donor_fit_source = prepared_inputs.donor_fit_source - current_generation_source = prepared_inputs.current_generation_source - entity_key = prepared_inputs.entity_key - donor_condition_source = donor_fit_source - current_condition_source = current_generation_source - requested_supplemental_vars = ( - self._resolve_requested_supplemental_shared_condition_vars( - donor_block_spec.model_variables - ) - ) - requested_challenger_vars = ( - self._resolve_requested_challenger_shared_condition_vars( - donor_block_spec.model_variables, - donor_source_name=donor_source_name, - ) - ) - if prepared_inputs.condition_surface is not None: - surface = prepared_inputs.condition_surface - if ( - 
self.config.donor_imputer_condition_selection - == "pe_plus_puf_native_challenger" - ): - donor_condition_source = surface.donor_frame.copy() - current_condition_source = surface.current_frame.copy() - challenger_condition_vars = ( - self._resolve_challenger_shared_condition_vars( - donor_frame=donor_fit_source, - current_frame=current_generation_source, - shared_vars=shared_vars_for_block, - donor_block=donor_block_spec.model_variables, - donor_source_name=donor_source_name, - ) - ) - for variable in challenger_condition_vars: - donor_condition_source[variable] = donor_fit_source[ - variable - ] - current_condition_source[variable] = ( - current_generation_source[variable] - ) - donor_condition_vars = list( - dict.fromkeys( - surface.compatible_predictors( - compatibility_fn=self._is_compatible_donor_condition, - ) - + challenger_condition_vars - ) - ) - _emit_us_pipeline_progress( - "US microplex donor integration: block run", - donor_source=donor_source_name, - block=block_label, - condition_vars=len(donor_condition_vars), - donor_rows=len(donor_fit_source), - current_rows=len(current_generation_source), - ) - result = PE_SOURCE_IMPUTE_BLOCK_ENGINE.run_conditioned_block( - request=PESourceImputeConditionedBlockRunRequest( - block_request=PESourceImputeBlockRunRequest( - donor_block_spec=donor_block_spec, - donor_fit_source=donor_fit_source, - current_generation_source=current_generation_source, - current_frame=current, - entity_key=entity_key, - ), - donor_condition_source=donor_condition_source, - current_condition_source=current_condition_source, - condition_vars=tuple(donor_condition_vars), - ), - build_imputer=self._build_donor_imputer, - rank_match=self._rank_match_donor_values, - fit_kwargs={ - "epochs": self.config.donor_imputer_epochs, - "batch_size": self.config.donor_imputer_batch_size, - "learning_rate": self.config.donor_imputer_learning_rate, - "verbose": False, - }, - seed=self.config.random_seed, - rng=rng, - ) - else: - donor_condition_source = 
surface.donor_frame - current_condition_source = surface.current_frame - compatible_predictors = surface.compatible_predictors( - compatibility_fn=self._is_compatible_donor_condition, - ) - _emit_us_pipeline_progress( - "US microplex donor integration: block run", - donor_source=donor_source_name, - block=block_label, - condition_vars=len(compatible_predictors), - donor_rows=len(donor_fit_source), - current_rows=len(current_generation_source), - ) - result = PE_SOURCE_IMPUTE_BLOCK_ENGINE.run_prepared_block( - surface=surface, - request=PESourceImputeBlockRunRequest( - donor_block_spec=donor_block_spec, - donor_fit_source=donor_fit_source, - current_generation_source=current_generation_source, - current_frame=current, - entity_key=entity_key, - ), - build_imputer=self._build_donor_imputer, - rank_match=self._rank_match_donor_values, - compatibility_fn=self._is_compatible_donor_condition, - fit_kwargs={ - "epochs": self.config.donor_imputer_epochs, - "batch_size": self.config.donor_imputer_batch_size, - "learning_rate": self.config.donor_imputer_learning_rate, - "verbose": False, - }, - seed=self.config.random_seed, - rng=rng, - ) - if result is not None: - selected_condition_vars = list(result.condition_vars) - conditioning_diagnostics.append( - { - "donor_source": donor_input.frame.source.name, - "model_variables": list( - donor_block_spec.model_variables - ), - "restored_variables": list( - donor_block_spec.restored_variables - ), - "condition_selection": ( - self.config.donor_imputer_condition_selection - ), - "used_condition_surface": True, - "raw_shared_vars": list( - prepared_inputs.raw_shared_vars - ), - "shared_vars_after_model_exclusion": list( - prepared_inputs.shared_vars_after_model_exclusion - ), - "projection_applied": ( - prepared_inputs.projection_applied - ), - "entity_compatible_shared_vars": list( - prepared_inputs.entity_compatible_shared_vars - ), - "shared_vars_for_block": list(shared_vars_for_block), - "selected_condition_vars": 
selected_condition_vars, - "dropped_shared_vars": [ - variable - for variable in shared_vars_for_block - if variable not in selected_condition_vars - ], - "requested_supplemental_shared_condition_vars": ( - requested_supplemental_vars - ), - "requested_challenger_shared_condition_vars": ( - requested_challenger_vars - ), - "raw_supplemental_shared_condition_var_status": ( - self._summarize_requested_raw_condition_var_status( - donor_frame=donor_seed, - current_frame=current, - scaffold_source=scaffold_input.frame.source, - donor_source=donor_input.frame.source, - numeric_current=numeric_current, - numeric_donor=numeric_donor, - shared_var_set=raw_shared_var_set, - excluded=excluded, - requested_vars=requested_supplemental_vars, - ) - ), - "raw_challenger_shared_condition_var_status": ( - self._summarize_requested_raw_condition_var_status( - donor_frame=donor_seed, - current_frame=current, - scaffold_source=scaffold_input.frame.source, - donor_source=donor_input.frame.source, - numeric_current=numeric_current, - numeric_donor=numeric_donor, - shared_var_set=raw_shared_var_set, - excluded=excluded, - requested_vars=requested_challenger_vars, - ) - ), - "supplemental_shared_condition_var_status": ( - self._summarize_requested_condition_var_status( - donor_frame=donor_condition_source, - current_frame=current_condition_source, - shared_vars=shared_vars_for_block, - selected_condition_vars=selected_condition_vars, - requested_vars=requested_supplemental_vars, - ) - ), - "challenger_shared_condition_var_status": ( - self._summarize_requested_condition_var_status( - donor_frame=donor_condition_source, - current_frame=current_condition_source, - shared_vars=shared_vars_for_block, - selected_condition_vars=selected_condition_vars, - requested_vars=requested_challenger_vars, - ) - ), - } - ) - current = result.updated_frame - integrated_variables.extend(result.integrated_variables) - source_integrated_variables.extend(result.integrated_variables) - 
_emit_us_pipeline_progress( - "US microplex donor integration: block complete", - donor_source=donor_source_name, - block=block_label, - integrated_vars=len(result.integrated_variables), - ) - continue - donor_condition_source = ( - self._augment_donor_condition_frame_for_targets( - donor_condition_source, - donor_block_spec.model_variables, - ) - ) - current_condition_source = ( - self._augment_donor_condition_frame_for_targets( - current_condition_source, - donor_block_spec.model_variables, - ) - ) - donor_condition_vars = self._select_donor_condition_vars( - donor_condition_source, - current_condition_source, - shared_vars_for_block, - donor_block_spec.model_variables, - donor_source_name=donor_source_name, - ) - if not donor_condition_vars: - _emit_us_pipeline_progress( - "US microplex donor integration: block skipped", - donor_source=donor_source_name, - block=block_label, - reason="no_condition_vars", - ) - continue - - _emit_us_pipeline_progress( - "US microplex donor integration: block run", - donor_source=donor_source_name, - block=block_label, - condition_vars=len(donor_condition_vars), - donor_rows=len(donor_fit_source), - current_rows=len(current_generation_source), - ) - result = PE_SOURCE_IMPUTE_BLOCK_ENGINE.run_conditioned_block( - request=PESourceImputeConditionedBlockRunRequest( - block_request=PESourceImputeBlockRunRequest( - donor_block_spec=donor_block_spec, - donor_fit_source=donor_fit_source, - current_generation_source=current_generation_source, - current_frame=current, - entity_key=entity_key, - ), - donor_condition_source=donor_condition_source, - current_condition_source=current_condition_source, - condition_vars=tuple(donor_condition_vars), - ), - build_imputer=self._build_donor_imputer, - rank_match=self._rank_match_donor_values, - fit_kwargs={ - "epochs": self.config.donor_imputer_epochs, - "batch_size": self.config.donor_imputer_batch_size, - "learning_rate": self.config.donor_imputer_learning_rate, - "verbose": False, - }, - 
seed=self.config.random_seed, - rng=rng, - ) - if result is not None: - selected_condition_vars = list(result.condition_vars) - conditioning_diagnostics.append( - { - "donor_source": donor_input.frame.source.name, - "model_variables": list(donor_block_spec.model_variables), - "restored_variables": list( - donor_block_spec.restored_variables - ), - "condition_selection": ( - self.config.donor_imputer_condition_selection - ), - "used_condition_surface": False, - "raw_shared_vars": list(prepared_inputs.raw_shared_vars), - "shared_vars_after_model_exclusion": list( - prepared_inputs.shared_vars_after_model_exclusion - ), - "projection_applied": prepared_inputs.projection_applied, - "entity_compatible_shared_vars": list( - prepared_inputs.entity_compatible_shared_vars - ), - "shared_vars_for_block": list(shared_vars_for_block), - "selected_condition_vars": selected_condition_vars, - "dropped_shared_vars": [ - variable - for variable in shared_vars_for_block - if variable not in selected_condition_vars - ], - "requested_supplemental_shared_condition_vars": ( - requested_supplemental_vars - ), - "requested_challenger_shared_condition_vars": ( - requested_challenger_vars - ), - "raw_supplemental_shared_condition_var_status": ( - self._summarize_requested_raw_condition_var_status( - donor_frame=donor_seed, - current_frame=current, - scaffold_source=scaffold_input.frame.source, - donor_source=donor_input.frame.source, - numeric_current=numeric_current, - numeric_donor=numeric_donor, - shared_var_set=raw_shared_var_set, - excluded=excluded, - requested_vars=requested_supplemental_vars, - ) - ), - "raw_challenger_shared_condition_var_status": ( - self._summarize_requested_raw_condition_var_status( - donor_frame=donor_seed, - current_frame=current, - scaffold_source=scaffold_input.frame.source, - donor_source=donor_input.frame.source, - numeric_current=numeric_current, - numeric_donor=numeric_donor, - shared_var_set=raw_shared_var_set, - excluded=excluded, - 
requested_vars=requested_challenger_vars, - ) - ), - "supplemental_shared_condition_var_status": ( - self._summarize_requested_condition_var_status( - donor_frame=donor_condition_source, - current_frame=current_condition_source, - shared_vars=shared_vars_for_block, - selected_condition_vars=selected_condition_vars, - requested_vars=requested_supplemental_vars, - ) - ), - "challenger_shared_condition_var_status": ( - self._summarize_requested_condition_var_status( - donor_frame=donor_condition_source, - current_frame=current_condition_source, - shared_vars=shared_vars_for_block, - selected_condition_vars=selected_condition_vars, - requested_vars=requested_challenger_vars, - ) - ), - } - ) - current = result.updated_frame - integrated_variables.extend(result.integrated_variables) - source_integrated_variables.extend(result.integrated_variables) - _emit_us_pipeline_progress( - "US microplex donor integration: block complete", - donor_source=donor_source_name, - block=block_label, - integrated_vars=len(result.integrated_variables), - ) - - if is_puf_support_clone_source: - if source_original_current is None: - raise AssertionError("PUF support clone original frame missing") - current, puf_support_clone_summary = ( - self._finalize_puf_support_clone_frame( - original=source_original_current, - imputed_clone=current, - donor_source_name=donor_source_name, - integrated_variables=source_integrated_variables, - preclone_columns=source_preclone_columns, - donor_seed_columns=set(donor_seed.columns), - donor_observed=donor_observed, - ) - ) - _emit_us_pipeline_progress( - "US microplex donor integration: puf support clone complete", - donor_source=donor_source_name, - rows=len(current), - integrated_vars=len(source_integrated_variables), - ) - - return { - "seed_data": current, - "integrated_variables": sorted(set(integrated_variables)), - "conditioning_diagnostics": conditioning_diagnostics, - "processed_donor_source_order": processed_donor_source_order, - 
"puf_clone_source_order": puf_clone_source_order, - "puf_support_clone_summary": puf_support_clone_summary, - } - - def _apply_dependent_tax_leaf_soft_caps( - self, - seed_data: pd.DataFrame, - ) -> pd.DataFrame: - multiplier = self.config.dependent_tax_leaf_soft_cap_multiplier - if multiplier is None: - return seed_data - if "is_tax_unit_dependent" in seed_data.columns: - dependent = ( - pd.to_numeric( - seed_data["is_tax_unit_dependent"], errors="coerce" - ).fillna(0.0) - > 0 - ) - elif "is_dependent" in seed_data.columns: - dependent = ( - pd.to_numeric(seed_data["is_dependent"], errors="coerce").fillna(0.0) - > 0 - ) - else: - return seed_data - base_vars = [ - var - for var in self.config.dependent_tax_leaf_soft_cap_base_variables - if var in seed_data.columns - ] - if not base_vars: - return seed_data - base = ( - pd.to_numeric(seed_data[base_vars].sum(axis=1), errors="coerce") - .fillna(0.0) - .clip(lower=0.0) - ) - cap = base * float(multiplier) - for variable in self.config.dependent_tax_leaf_soft_cap_variables: - if variable not in seed_data.columns: - continue - series = pd.to_numeric(seed_data[variable], errors="coerce").fillna(0.0) - adjusted = series.where(~dependent, other=series.clip(upper=cap)) - seed_data[variable] = adjusted - return seed_data - - def _uses_pe_condition_surface(self) -> bool: - return self.config.donor_imputer_condition_selection in { - "pe_prespecified", - "pe_plus_puf_native_challenger", - } - - def _select_donor_condition_vars( - self, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - shared_vars: list[str], - donor_block: tuple[str, ...], - donor_source_name: str | None = None, - ) -> list[str]: - condition_vars = [ - variable for variable in shared_vars if variable in donor_frame.columns - ] - if len(condition_vars) <= 1: - return condition_vars - - max_condition_vars = self.config.donor_imputer_max_condition_vars - if self.config.donor_imputer_condition_selection in { - "pe_prespecified", - 
"pe_plus_puf_native_challenger", - }: - preferred_condition_vars = self._resolve_preferred_donor_condition_vars( - donor_frame=donor_frame, - current_frame=current_frame, - shared_vars=shared_vars, - donor_block=donor_block, - ) - if ( - self.config.donor_imputer_condition_selection - == "pe_plus_puf_native_challenger" - ): - for variable in self._resolve_challenger_shared_condition_vars( - donor_frame=donor_frame, - current_frame=current_frame, - shared_vars=shared_vars, - donor_block=donor_block, - donor_source_name=donor_source_name, - ): - if variable not in preferred_condition_vars: - preferred_condition_vars.append(variable) - if preferred_condition_vars: - return preferred_condition_vars - if ( - self.config.donor_imputer_condition_selection == "all_shared" - or max_condition_vars is None - or len(condition_vars) <= max_condition_vars - ): - return condition_vars - - scored_conditions = [ - ( - score_donor_condition_var( - donor_frame[variable], - [ - donor_frame[target] - for target in donor_block - if target in donor_frame.columns - ], - score_modes={ - variable_semantic_spec_for(target).condition_score_mode - for target in donor_block - }, - ), - variable, - ) - for variable in condition_vars - ] - scored_conditions = [ - (score, variable) for score, variable in scored_conditions if score > 0.0 - ] - if not scored_conditions: - return condition_vars[:max_condition_vars] - - scored_conditions.sort(key=lambda item: (-item[0], item[1])) - return [variable for _, variable in scored_conditions[:max_condition_vars]] - - def _resolve_preferred_donor_condition_vars( - self, - *, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - shared_vars: list[str] | None = None, - donor_block: tuple[str, ...], - ) -> list[str]: - semantic_specs = tuple( - variable_semantic_spec_for(target_variable) - for target_variable in donor_block - ) - preferred_condition_vars = tuple( - dict.fromkeys( - variable - for spec in semantic_specs - for variable in 
spec.preferred_condition_vars - ) - ) - if not preferred_condition_vars: - return [] - resolved: list[str] = [] - for variable in preferred_condition_vars: - if ( - variable not in donor_frame.columns - or variable not in current_frame.columns - ): - continue - if not pd.api.types.is_numeric_dtype(donor_frame[variable]): - continue - if not pd.api.types.is_numeric_dtype(current_frame[variable]): - continue - if not self._is_compatible_donor_condition( - current_frame[variable], - donor_frame[variable], - ): - continue - resolved.append(variable) - shared_var_set = set(shared_vars or ()) - supplemental_shared_condition_vars = tuple( - dict.fromkeys( - variable - for spec in semantic_specs - for variable in spec.supplemental_shared_condition_vars - ) - ) - for variable in supplemental_shared_condition_vars: - if variable in resolved or variable not in shared_var_set: - continue - resolved.append(variable) - return resolved - - def _resolve_requested_supplemental_shared_condition_vars( - self, - donor_block: tuple[str, ...], - ) -> list[str]: - return list( - dict.fromkeys( - variable - for target_variable in donor_block - for variable in variable_semantic_spec_for( - target_variable - ).supplemental_shared_condition_vars - ) - ) - - def _resolve_requested_challenger_shared_condition_vars( - self, - donor_block: tuple[str, ...], - *, - donor_source_name: str | None, - ) -> list[str]: - if ( - self.config.donor_imputer_condition_selection - != "pe_plus_puf_native_challenger" - or donor_source_name is None - or not donor_source_name.startswith("irs_soi_puf") - ): - return [] - return list( - dict.fromkeys( - variable - for target_variable in donor_block - for variable in variable_semantic_spec_for( - target_variable - ).challenger_shared_condition_vars - ) - ) - - def _resolve_challenger_shared_condition_vars( - self, - *, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - shared_vars: list[str] | None = None, - donor_block: tuple[str, ...], - 
donor_source_name: str | None, - ) -> list[str]: - requested_vars = self._resolve_requested_challenger_shared_condition_vars( - donor_block, - donor_source_name=donor_source_name, - ) - if not requested_vars: - return [] - shared_var_set = set(shared_vars or ()) - resolved: list[str] = [] - for variable in requested_vars: - if ( - variable not in shared_var_set - or variable not in donor_frame.columns - or variable not in current_frame.columns - or not pd.api.types.is_numeric_dtype(donor_frame[variable]) - or not pd.api.types.is_numeric_dtype(current_frame[variable]) - or not self._is_compatible_donor_condition( - current_frame[variable], - donor_frame[variable], - ) - ): - continue - resolved.append(variable) - return resolved - - def _summarize_requested_condition_var_status( - self, - *, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - shared_vars: list[str], - selected_condition_vars: list[str], - requested_vars: list[str], - ) -> list[dict[str, Any]]: - shared_var_set = set(shared_vars) - selected_var_set = set(selected_condition_vars) - statuses: list[dict[str, Any]] = [] - for variable in requested_vars: - status = { - "variable": variable, - "selected": variable in selected_var_set, - "in_shared_overlap": variable in shared_var_set, - } - if variable in selected_var_set: - status["reason"] = "selected" - elif variable in shared_var_set: - status["reason"] = "available_but_not_selected" - elif variable not in donor_frame.columns: - status["reason"] = "missing_donor_column" - elif variable not in current_frame.columns: - status["reason"] = "missing_current_column" - elif not pd.api.types.is_numeric_dtype(donor_frame[variable]): - status["reason"] = "non_numeric_donor_column" - elif not pd.api.types.is_numeric_dtype(current_frame[variable]): - status["reason"] = "non_numeric_current_column" - elif not self._is_compatible_donor_condition( - current_frame[variable], - donor_frame[variable], - ): - status["reason"] = "incompatible_condition_support" 
- else: - status["reason"] = "excluded_from_block_shared_overlap" - statuses.append(status) - return statuses - - def _summarize_requested_raw_condition_var_status( - self, - *, - donor_frame: pd.DataFrame, - current_frame: pd.DataFrame, - scaffold_source: SourceDescriptor, - donor_source: SourceDescriptor, - numeric_current: set[str], - numeric_donor: set[str], - shared_var_set: set[str], - excluded: set[str], - requested_vars: list[str], - ) -> list[dict[str, Any]]: - statuses: list[dict[str, Any]] = [] - for variable in requested_vars: - status = { - "variable": variable, - "selected": variable in shared_var_set, - "in_shared_overlap": variable in shared_var_set, - } - if variable in shared_var_set: - status["reason"] = "selected" - elif variable in excluded: - status["reason"] = "excluded_variable" - elif variable not in current_frame.columns: - status["reason"] = "missing_current_column" - elif variable not in donor_frame.columns: - status["reason"] = "missing_donor_column" - elif variable not in numeric_current: - status["reason"] = "non_numeric_current_column" - elif variable not in numeric_donor: - status["reason"] = "non_numeric_donor_column" - elif not scaffold_source.allows_conditioning_on(variable): - status["reason"] = "scaffold_source_disallows_conditioning" - elif not donor_source.allows_conditioning_on(variable): - status["reason"] = "donor_source_disallows_conditioning" - elif not self._is_compatible_donor_condition( - current_frame[variable], - donor_frame[variable], - ): - status["reason"] = "incompatible_condition_support" - else: - status["reason"] = "excluded_from_shared_overlap" - statuses.append(status) - return statuses - - def _augment_donor_condition_frame_for_targets( - self, - frame: pd.DataFrame, - target_variables: tuple[str, ...], - ) -> pd.DataFrame: - preferred_condition_vars = [ - variable - for target_variable in target_variables - for variable in variable_semantic_spec_for( - target_variable - ).preferred_condition_vars - ] - if 
not preferred_condition_vars: - return frame - if not set(PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS) & set( - preferred_condition_vars - ): - return frame - predictor_frame = self._build_pe_style_puf_irs_condition_frame(frame) - if predictor_frame.empty: - return frame - result = frame.copy() - for column in predictor_frame.columns: - result[column] = predictor_frame[column] - return result - - def _build_pe_style_puf_irs_condition_frame( - self, - frame: pd.DataFrame, - ) -> pd.DataFrame: - result = pd.DataFrame(index=frame.index) - sex = ( - pd.to_numeric(frame["sex"], errors="coerce") - if "sex" in frame.columns - else pd.Series(np.nan, index=frame.index, dtype=float) - ) - if "age" in frame.columns: - result["age"] = pd.to_numeric(frame["age"], errors="coerce").astype(float) - if "sex" in frame.columns: - result["is_male"] = pd.Series( - np.where(sex == 1, 1.0, np.where(sex == 2, 0.0, np.nan)), - index=frame.index, - dtype=float, - ) - elif "is_male" in frame.columns: - result["is_male"] = pd.to_numeric(frame["is_male"], errors="coerce").astype( - float - ) - if "tax_unit_id" not in frame.columns: - return result - - relationship = ( - self._normalize_relationship_to_head(frame) - if "relationship_to_head" not in frame.columns - else pd.to_numeric(frame["relationship_to_head"], errors="coerce") - .fillna(3) - .astype(int) - ) - result["tax_unit_is_joint"] = 0.0 - result["tax_unit_count_dependents"] = 0.0 - result["is_tax_unit_head"] = 0.0 - result["is_tax_unit_spouse"] = 0.0 - result["is_tax_unit_dependent"] = 0.0 - - ages = ( - pd.to_numeric(frame["age"], errors="coerce").fillna(0.0) - if "age" in frame.columns - else pd.Series(0.0, index=frame.index, dtype=float) - ) - spouse_person_number = ( - pd.to_numeric(frame.get("spouse_person_number"), errors="coerce") - .fillna(0) - .astype(int) - if "spouse_person_number" in frame.columns - else pd.Series(0, index=frame.index, dtype=int) - ) - person_number = ( - pd.to_numeric(frame.get("person_number"), 
errors="coerce") - .fillna(0) - .astype(int) - if "person_number" in frame.columns - else pd.Series(0, index=frame.index, dtype=int) - ) - - tax_unit_ids = frame["tax_unit_id"] - valid_tax_unit_ids = tax_unit_ids.notna() & tax_unit_ids.astype( - str - ).str.strip().ne("") - for _, unit_persons in frame.loc[valid_tax_unit_ids].groupby( - "tax_unit_id", - sort=False, - ): - member_index = unit_persons.index - unit_relationship = relationship.loc[member_index] - dependent_index = unit_relationship[unit_relationship.eq(2)].index.tolist() - - spouse_index: list[int] = [] - by_number = { - int(number): idx - for idx, number in person_number.loc[member_index].items() - if int(number) > 0 - } - for idx in member_index: - spouse_number = int(spouse_person_number.loc[idx]) - current_number = int(person_number.loc[idx]) - if spouse_number <= 0 or current_number <= 0: - continue - spouse_idx = by_number.get(spouse_number) - if spouse_idx is None: - continue - if int(spouse_person_number.loc[spouse_idx]) != current_number: - continue - spouse_index.extend([int(idx), int(spouse_idx)]) - if not spouse_index: - spouse_index = ( - unit_relationship[unit_relationship.eq(1)] - .index.astype(int) - .tolist() - ) - spouse_index = [ - idx for idx in dict.fromkeys(spouse_index) if idx not in dependent_index - ] - - head_index: int | None = None - head_candidates = [ - int(idx) - for idx in unit_relationship[unit_relationship.eq(0)].index.tolist() - if int(idx) not in spouse_index - ] - if head_candidates: - head_index = head_candidates[0] - else: - nondependent_candidates = [ - int(idx) - for idx in member_index.tolist() - if int(idx) not in spouse_index and int(idx) not in dependent_index - ] - if nondependent_candidates: - head_index = max( - nondependent_candidates, - key=lambda idx: (float(ages.loc[idx]), -int(idx)), - ) - elif spouse_index: - head_index = spouse_index[0] - spouse_index = [idx for idx in spouse_index if idx != head_index] - else: - head_index = int(member_index[0]) - 
- spouse_index = [idx for idx in spouse_index if idx != head_index] - if len(spouse_index) > 1: - spouse_index = [ - max( - spouse_index, - key=lambda idx: (float(ages.loc[idx]), -int(idx)), - ) - ] - - result.loc[member_index, "tax_unit_is_joint"] = float(bool(spouse_index)) - result.loc[member_index, "tax_unit_count_dependents"] = float( - len(dependent_index) - ) - result.loc[dependent_index, "is_tax_unit_dependent"] = 1.0 - if head_index is not None: - result.loc[head_index, "is_tax_unit_head"] = 1.0 - result.loc[spouse_index, "is_tax_unit_spouse"] = 1.0 - - return result - - def _entity_key_column(self, entity: EntityType) -> str | None: - return ENTITY_ID_COLUMNS.get(entity) - - def _ensure_seed_entity_ids( - self, - frame: pd.DataFrame, - *, - entities: set[EntityType], - frame_role: str | None = None, - donor_source_name: str | None = None, - ) -> pd.DataFrame: - missing_columns = [ - self._entity_key_column(entity) - for entity in entities - if entity is not EntityType.PERSON - and self._entity_key_column(entity) not in frame.columns - ] - if not missing_columns: - _emit_us_pipeline_progress( - "US microplex donor integration: entity ids ready", - donor_source=donor_source_name, - frame=frame_role, - rows=len(frame), - status="already_present", - columns=_format_progress_values( - sorted( - self._entity_key_column(entity) or "" - for entity in entities - if entity is not EntityType.PERSON - ) - ), - ) - return frame - started_at = time.perf_counter() - missing_column_set = set(missing_columns) - can_use_group_only_path = missing_column_set <= {"family_id", "spm_unit_id"} - method = ( - "family_spm_only" - if can_use_group_only_path - else "policyengine_entity_bundle" - ) - _emit_us_pipeline_progress( - "US microplex donor integration: entity ids start", - donor_source=donor_source_name, - frame=frame_role, - rows=len(frame), - missing_columns=_format_progress_values(missing_columns), - method=method, - ) - working = frame.copy() - original_person_ids = 
working["person_id"].copy() - working["person_id"] = np.arange(len(working), dtype=np.int64) - if "household_id" in working.columns: - working["household_id"] = pd.factorize(working["household_id"])[0].astype( - np.int64 - ) - else: - working["household_id"] = np.arange(len(working), dtype=np.int64) - if "age" not in working.columns: - working["age"] = 0 - if can_use_group_only_path: - working["relationship_to_head"] = self._normalize_relationship_to_head( - working - ) - persons = self._assign_family_and_spm_units(working).copy() - else: - persons = self.build_policyengine_entity_tables(working).persons.copy() - persons["source_person_id"] = original_person_ids.to_numpy() - mapping = persons[["source_person_id", *missing_columns]] - if mapping["source_person_id"].duplicated().any(): - raise ValueError( - "PolicyEngine entity table build produced duplicate person mappings" - ) - result = frame.merge( - mapping, - left_on="person_id", - right_on="source_person_id", - how="left", - ).drop(columns=["source_person_id"]) - _emit_us_pipeline_progress( - "US microplex donor integration: entity ids complete", - donor_source=donor_source_name, - frame=frame_role, - rows=len(result), - added_columns=_format_progress_values(missing_columns), - method=method, - elapsed_seconds=f"{time.perf_counter() - started_at:.3f}", - ) - return result - - def _strip_generated_entity_ids( - self, - frame: pd.DataFrame, - *, - scaffold_input: USMicroplexSourceInput, - ) -> pd.DataFrame: - scaffold_person_columns = set(scaffold_input.persons.columns) - ephemeral_entity_ids = [ - column - for column in ("tax_unit_id", "family_id", "spm_unit_id", "marital_unit_id") - if column in frame.columns and column not in scaffold_person_columns - ] - if not ephemeral_entity_ids: - return frame - return frame.drop(columns=ephemeral_entity_ids) - - def _can_project_donor_block_to_entity( - self, - current_frame: pd.DataFrame, - donor_frame: pd.DataFrame, - entity: EntityType, - ) -> bool: - if entity is 
EntityType.PERSON: - return False - entity_key = self._entity_key_column(entity) - return bool( - entity_key - and entity_key in current_frame.columns - and entity_key in donor_frame.columns - and current_frame[entity_key].notna().all() - and donor_frame[entity_key].notna().all() - ) - - def _project_frame_to_entity( - self, - frame: pd.DataFrame, - *, - entity: EntityType, - variables: set[str], - ) -> pd.DataFrame: - entity_key = self._entity_key_column(entity) - if entity_key is None: - raise ValueError(f"Unsupported donor projection entity: {entity}") - columns = [ - entity_key, - *[ - variable - for variable in sorted(variables) - if variable != entity_key and variable in frame.columns - ], - ] - projected = frame[columns].copy() - if entity is EntityType.PERSON: - return projected - - sort_columns = [ - column - for column in (entity_key, "household_id", "person_id") - if column in projected.columns - ] - if sort_columns: - projected = projected.sort_values(sort_columns, kind="mergesort") - aggregations = { - column: self._projection_aggregation_for(column) - for column in projected.columns - if column != entity_key - } - return projected.groupby(entity_key, as_index=False).agg(aggregations) - - def _projection_aggregation_for(self, column: str) -> str: - if column in {"hh_weight", "household_id", "person_id", "year"}: - return "first" - return variable_semantic_spec_for(column).projection_aggregation.value - - def _should_integrate_donor_variable( - self, - current: pd.DataFrame, - variable: str, - ) -> bool: - if variable not in current.columns: - return True - current_values = pd.to_numeric( - current[variable], - errors="coerce", - ).replace([np.inf, -np.inf], np.nan) - informative = current_values.dropna() - if informative.empty: - return True - if (informative != 0).any(): - return False - return informative.nunique() <= 1 - - def _is_compatible_donor_condition( - self, - current_series: pd.Series, - donor_series: pd.Series, - ) -> bool: - 
current_values = ( - pd.to_numeric(current_series, errors="coerce") - .replace([np.inf, -np.inf], np.nan) - .dropna() - ) - donor_values = ( - pd.to_numeric(donor_series, errors="coerce") - .replace([np.inf, -np.inf], np.nan) - .dropna() - ) - if current_values.empty or donor_values.empty: - return False - if current_values.nunique() <= 1: - return False - if donor_values.nunique() <= 1: - return False - return True - - def _is_compatible_donor_target(self, series: pd.Series) -> bool: - values = pd.to_numeric(series, errors="coerce").replace( - [np.inf, -np.inf], np.nan - ) - values = values.dropna() - if values.empty: - return False - if values.nunique() <= 1: - return False - return bool((values > 0).any()) - - def _rank_match_donor_values( - self, - scores: pd.Series, - *, - donor_values: pd.Series, - donor_weights: pd.Series | None, - rng: np.random.Generator, - strategy: DonorMatchStrategy = DonorMatchStrategy.RANK, - ) -> pd.Series: - """Assign donor values by rank, preserving the donor marginal distribution.""" - if donor_values.empty: - return pd.Series(0.0, index=scores.index, dtype=float) - - donor_array = donor_values.to_numpy(dtype=float) - donor_weight_array = None - if donor_weights is not None and not donor_weights.empty: - donor_weight_array = donor_weights.to_numpy(dtype=float) - donor_weight_array = np.clip(donor_weight_array, a_min=0.0, a_max=None) - - if ( - strategy is DonorMatchStrategy.RANK - and self._is_zero_inflated_positive_distribution(donor_array) - ): - return self._rank_match_zero_inflated_positive_values( - scores, - donor_values=donor_array, - donor_weights=donor_weight_array, - rng=rng, - ) - - sampled_values = self._sample_donor_array( - donor_array, - size=len(scores), - donor_weights=donor_weight_array, - rng=rng, - ) - - sampled_values = np.sort(sampled_values.astype(float)) - order = np.argsort(scores.to_numpy(dtype=float), kind="mergesort") - matched = np.empty(len(scores), dtype=float) - matched[order] = sampled_values - 
return pd.Series(matched, index=scores.index, dtype=float) - - def _rank_match_zero_inflated_positive_values( - self, - scores: pd.Series, - *, - donor_values: np.ndarray, - donor_weights: np.ndarray | None, - rng: np.random.Generator, - ) -> pd.Series: - matched = np.zeros(len(scores), dtype=float) - positive_mask = donor_values > 0.0 - positive_values = donor_values[positive_mask] - if len(positive_values) == 0: - return pd.Series(matched, index=scores.index, dtype=float) - - positive_rate = self._weighted_positive_rate( - donor_values, - donor_weights=donor_weights, - ) - n_positive = int(round(positive_rate * len(scores))) - n_positive = min(max(n_positive, 0), len(scores)) - if n_positive == 0: - return pd.Series(matched, index=scores.index, dtype=float) - - positive_weights = ( - donor_weights[positive_mask] if donor_weights is not None else None - ) - sampled_positive = self._sample_donor_array( - positive_values, - size=n_positive, - donor_weights=positive_weights, - rng=rng, - ) - sampled_positive = np.sort(sampled_positive.astype(float)) - order = np.argsort(scores.to_numpy(dtype=float), kind="mergesort") - matched[order[-n_positive:]] = sampled_positive - return pd.Series(matched, index=scores.index, dtype=float) - - def _sample_donor_array( - self, - donor_values: np.ndarray, - *, - size: int, - donor_weights: np.ndarray | None, - rng: np.random.Generator, - ) -> np.ndarray: - if len(donor_values) == size: - return donor_values.copy() - - probabilities = None - if donor_weights is not None and len(donor_weights) == len(donor_values): - weight_sum = float(donor_weights.sum()) - if weight_sum > 0.0: - probabilities = donor_weights / weight_sum - return rng.choice( - donor_values, - size=size, - replace=True, - p=probabilities, - ) - - def _weighted_positive_rate( - self, - donor_values: np.ndarray, - *, - donor_weights: np.ndarray | None, - ) -> float: - positive_mask = donor_values > 0.0 - if donor_weights is None or len(donor_weights) != 
len(donor_values): - return float(np.mean(positive_mask)) - weight_sum = float(donor_weights.sum()) - if weight_sum <= 0.0: - return float(np.mean(positive_mask)) - return float(donor_weights[positive_mask].sum() / weight_sum) - - def _is_zero_inflated_positive_distribution(self, donor_values: np.ndarray) -> bool: - return bool( - len(donor_values) > 0 - and np.all(donor_values >= 0.0) - and np.any(donor_values == 0.0) - and np.any(donor_values > 0.0) - ) - - def _synthesize_bootstrap( - self, - seed_data: pd.DataFrame, - initial_weight: float, - *, - strata_columns: tuple[str, ...] = (), - ) -> pd.DataFrame: - """Generate synthetic households via weighted bootstrap resampling.""" - rng = np.random.default_rng(self.config.random_seed) - households = ( - seed_data.groupby("household_id", as_index=False) - .agg( - { - "hh_weight": "first", - **{ - column: "first" - for column in strata_columns - if column in seed_data.columns - }, - } - ) - .rename(columns={"hh_weight": "household_weight"}) - ) - sampled_households = self._sample_bootstrap_household_ids( - households, - rng=rng, - strata_columns=strata_columns, - ) - - cloned_households: list[pd.DataFrame] = [] - for new_household_id, source_household_id in enumerate(sampled_households): - household_persons = seed_data[ - seed_data["household_id"] == source_household_id - ].copy() - household_persons["household_id"] = new_household_id - cloned_households.append(household_persons) - - synthetic = pd.concat(cloned_households, ignore_index=True) - if "income" in synthetic.columns: - synthetic["income"] = synthetic["income"].astype(float) * rng.lognormal( - mean=0.0, - sigma=0.05, - size=len(synthetic), - ) - synthetic["income"] = synthetic["income"].clip(lower=0.0) - return self._finalize_synthetic_population( - synthetic, - initial_weight=initial_weight, - ) - - def _resolve_bootstrap_strata_columns( - self, - seed_data: pd.DataFrame, - ) -> tuple[str, ...]: - if self.config.bootstrap_strata_columns: - missing_columns 
= [ - column - for column in self.config.bootstrap_strata_columns - if column not in seed_data.columns - ] - if missing_columns: - raise ValueError( - "bootstrap_strata_columns are not available in seed data: " - f"{missing_columns}" - ) - return self.config.bootstrap_strata_columns - - requested_geo_levels: set[str] = set() - for scope in (False, True): - _, _, geo_levels = self._policyengine_target_scope(for_calibration=scope) - requested_geo_levels.update(geo_levels) - - inferred_columns: list[str] = [] - if { - "state", - "district", - "county", - } & requested_geo_levels and "state_fips" in seed_data.columns: - inferred_columns.append("state_fips") - if "county" in requested_geo_levels and "county_fips" in seed_data.columns: - inferred_columns.append("county_fips") - if ( - "district" in requested_geo_levels - and "congressional_district_geoid" in seed_data.columns - ): - inferred_columns.append("congressional_district_geoid") - return tuple(dict.fromkeys(inferred_columns)) - - def _sample_bootstrap_household_ids( - self, - households: pd.DataFrame, - *, - rng: np.random.Generator, - strata_columns: tuple[str, ...], - ) -> np.ndarray: - weights = households["household_weight"].astype(float).to_numpy() - household_ids = households["household_id"].to_numpy() - if ( - not strata_columns - or self.config.n_synthetic <= 0 - or len(household_ids) == 0 - ): - probabilities = weights / weights.sum() - return rng.choice( - household_ids, - size=self.config.n_synthetic, - replace=True, - p=probabilities, - ) - - stratum_frame = households.loc[:, list(strata_columns)].copy() - for column in stratum_frame.columns: - values = stratum_frame[column] - if pd.api.types.is_numeric_dtype(values): - stratum_frame[column] = values.fillna(-1) - else: - stratum_frame[column] = values.astype("string").fillna("__missing__") - stratum_keys = pd.MultiIndex.from_frame(stratum_frame) - weighted_households = households.assign(_bootstrap_stratum_key=stratum_keys) - stratum_weights = ( - 
weighted_households.groupby("_bootstrap_stratum_key", dropna=False)[ - "household_weight" - ] - .sum() - .astype(float) - ) - stratum_weights = stratum_weights[stratum_weights > 0] - if stratum_weights.empty: - probabilities = weights / weights.sum() - return rng.choice( - household_ids, - size=self.config.n_synthetic, - replace=True, - p=probabilities, - ) - - n_strata = len(stratum_weights) - base_counts = pd.Series(0, index=stratum_weights.index, dtype=int) - remaining = self.config.n_synthetic - if self.config.n_synthetic >= n_strata: - base_counts += 1 - remaining -= n_strata - - probabilities = (stratum_weights / stratum_weights.sum()).to_numpy(dtype=float) - extra_counts = ( - rng.multinomial(remaining, probabilities) - if remaining > 0 - else np.zeros(n_strata, dtype=int) - ) - - sampled_households: list[np.ndarray] = [] - for stratum_key, sample_count in zip( - stratum_weights.index, - base_counts.to_numpy(dtype=int) + extra_counts, - strict=False, - ): - if sample_count <= 0: - continue - candidates = weighted_households.loc[ - weighted_households["_bootstrap_stratum_key"] == stratum_key - ] - candidate_ids = candidates["household_id"].to_numpy() - candidate_weights = candidates["household_weight"].astype(float).to_numpy() - if candidate_weights.sum() <= 0: - candidate_probabilities = np.full( - len(candidate_ids), - 1.0 / max(len(candidate_ids), 1), - ) - else: - candidate_probabilities = candidate_weights / candidate_weights.sum() - sampled_households.append( - rng.choice( - candidate_ids, - size=int(sample_count), - replace=True, - p=candidate_probabilities, - ) - ) - - if not sampled_households: - probabilities = weights / weights.sum() - return rng.choice( - household_ids, - size=self.config.n_synthetic, - replace=True, - p=probabilities, - ) - - return rng.permutation(np.concatenate(sampled_households)) - - def _finalize_synthetic_population( - self, - synthetic: pd.DataFrame, - initial_weight: float, - ) -> pd.DataFrame: - """Add derived fields and 
canonical identifiers to synthetic output.""" - result = synthetic.copy().reset_index(drop=True) - for column, default in { - "state_fips": 0, - "county_fips": "00000", - "block_geoid": "", - "tract_geoid": "", - "congressional_district_geoid": 0, - "tenure": 0, - "age": 0, - "sex": 0, - "education": 0, - "employment_status": 0, - "income": 0.0, - }.items(): - if column not in result.columns: - result[column] = default - result["person_id"] = np.arange(len(result)) - if "household_id" in result.columns: - result["household_id"] = pd.factorize(result["household_id"])[0].astype( - np.int64 - ) - else: - result["household_id"] = np.arange(len(result), dtype=np.int64) - result["state"] = result["state_fips"].map(STATE_FIPS).fillna("UNK") - result["age_group"] = pd.cut( - result["age"], - bins=AGE_BINS, - labels=AGE_LABELS, - right=False, - ).astype(str) - result["income_bracket"] = pd.cut( - result["income"], - bins=INCOME_BINS, - labels=INCOME_LABELS, - ).astype(str) - if "weight" not in result.columns: - result["weight"] = float(initial_weight) - else: - result["weight"] = ( - pd.to_numeric(result["weight"], errors="coerce") - .fillna(float(initial_weight)) - .astype(float) - ) - return result - - def _build_policyengine_households(self, persons: pd.DataFrame) -> pd.DataFrame: - household_columns = [ - column - for column in ( - "state_fips", - "county_fips", - "block_geoid", - "tract_geoid", - "congressional_district_geoid", - "tenure", - "tenure_type", - "state", - "net_worth", - "auto_loan_balance", - "auto_loan_interest", - "household_vehicles_owned", - "household_vehicles_value", - ) - if column in persons.columns - ] - aggregations = {column: "first" for column in household_columns} - aggregations["weight"] = "mean" - households = ( - persons.groupby("household_id", as_index=False) - .agg(aggregations) - .rename(columns={"weight": "household_weight"}) - ) - return households - - def _build_policyengine_tax_units( - self, - persons: pd.DataFrame, - ) -> 
tuple[pd.DataFrame, pd.DataFrame]: - person_rows = persons.copy() - tax_unit_rows: list[dict[str, Any]] = [] - person_to_tax_unit: dict[int, int] = {} - next_tax_unit_id = 0 - preserved_households: set[Any] = set() - - role_based = self._build_policyengine_tax_units_from_role_flags( - persons, - start_tax_unit_id=next_tax_unit_id, - ) - if role_based is not None: - role_tax_units, role_person_rows, role_households = role_based - if len(role_households) == person_rows["household_id"].nunique(): - return role_tax_units, role_person_rows - if not role_tax_units.empty: - tax_unit_rows.extend(role_tax_units.to_dict(orient="records")) - person_to_tax_unit.update( - { - int(person_id): int(tax_unit_id) - for person_id, tax_unit_id in zip( - role_person_rows["person_id"].tolist(), - role_person_rows["tax_unit_id"].tolist(), - strict=True, - ) - } - ) - preserved_households.update(role_households) - next_tax_unit_id = ( - int( - pd.to_numeric( - role_tax_units["tax_unit_id"], - errors="coerce", - ).max() - ) - + 1 - ) - - if self.config.policyengine_prefer_existing_tax_unit_ids: - remaining_persons = persons.loc[ - ~persons["household_id"].isin(preserved_households) - ].copy() - preserved = self._build_policyengine_tax_units_from_existing_ids( - remaining_persons, - start_tax_unit_id=next_tax_unit_id, - ) - if preserved is not None: - preserved_tax_units, preserved_person_rows, existing_households = ( - preserved - ) - if ( - len(existing_households | preserved_households) - == person_rows["household_id"].nunique() - and not tax_unit_rows - ): - return preserved_tax_units, preserved_person_rows - if not preserved_tax_units.empty: - tax_unit_rows.extend(preserved_tax_units.to_dict(orient="records")) - person_to_tax_unit.update( - { - int(person_id): int(tax_unit_id) - for person_id, tax_unit_id in zip( - preserved_person_rows["person_id"].tolist(), - preserved_person_rows["tax_unit_id"].tolist(), - strict=True, - ) - } - ) - preserved_households.update(existing_households) - 
next_tax_unit_id = ( - int( - pd.to_numeric( - preserved_tax_units["tax_unit_id"], - errors="coerce", - ).max() - ) - + 1 - ) - - optimizer = TaxUnitOptimizer() - - for household_id in person_rows["household_id"].drop_duplicates().tolist(): - if household_id in preserved_households: - continue - hh_persons = person_rows[person_rows["household_id"] == household_id].copy() - if hh_persons.empty: - continue - optimized_units = optimizer.optimize_household( - int(household_id), hh_persons - ) - optimized_units = self._apply_tax_unit_filing_status_hints( - hh_persons, - optimized_units, - ) - if not optimized_units: - optimized_units = [ - { - "tax_unit_id": 0, - "household_id": int(household_id), - "filing_status": "single", - "filer_ids": [int(hh_persons.iloc[0]["person_id"])], - "dependent_ids": [], - "n_dependents": 0, - "total_income": float(hh_persons["income"].sum()), - "tax_liability": 0.0, - } - ] - - assigned_person_ids: set[int] = set() - for unit in optimized_units: - unit_person_ids = [ - int(person_id) - for person_id in list(unit.get("filer_ids", [])) - + list(unit.get("dependent_ids", [])) - ] - if not unit_person_ids: - continue - global_tax_unit_id = next_tax_unit_id - next_tax_unit_id += 1 - for person_id in unit_person_ids: - person_to_tax_unit[person_id] = global_tax_unit_id - assigned_person_ids.add(person_id) - unit_persons = hh_persons.loc[ - hh_persons["person_id"].astype(int).isin(unit_person_ids) - ].copy() - tax_unit_rows.append( - { - "tax_unit_id": global_tax_unit_id, - "household_id": int(household_id), - "filing_status": self._normalize_policyengine_filing_status( - unit.get("filing_status", "single") - ), - "n_dependents": int(unit.get("n_dependents", 0)), - "total_income": float(unit.get("total_income", 0.0)), - "tax_liability": float(unit.get("tax_liability", 0.0)), - **self._aggregate_policyengine_tax_unit_input_columns( - unit_persons - ), - } - ) - - unassigned = [ - int(person_id) - for person_id in hh_persons["person_id"].tolist() 
- if int(person_id) not in assigned_person_ids - ] - for person_id in unassigned: - global_tax_unit_id = next_tax_unit_id - next_tax_unit_id += 1 - person_to_tax_unit[person_id] = global_tax_unit_id - unit_persons = hh_persons.loc[ - hh_persons["person_id"].astype(int).eq(person_id) - ].copy() - tax_unit_rows.append( - { - "tax_unit_id": global_tax_unit_id, - "household_id": int(household_id), - "filing_status": "SINGLE", - "n_dependents": 0, - "total_income": float( - hh_persons.loc[ - hh_persons["person_id"] == person_id, "income" - ].iloc[0] - ), - "tax_liability": 0.0, - **self._aggregate_policyengine_tax_unit_input_columns( - unit_persons - ), - } - ) - - person_rows["tax_unit_id"] = person_rows["person_id"].map(person_to_tax_unit) - tax_units = pd.DataFrame(tax_unit_rows) - return tax_units, person_rows - - # Raw CPS ASEC columns that ``microunit.construct_tax_units`` consumes to - # reconstruct tax units. ``microunit`` is the standalone extraction of - # eCPS's tax-unit logic (issue #113); it is *source-agnostic* and expects - # this normalized CPS-like contract rather than microplex's collapsed - # ``relationship_to_head`` coding. We only delegate when the frame actually - # carries these columns, so the delegation is behavior-preserving on - # today's frames (which do not carry them) and only becomes active once an - # upstream change threads CPS columns through to entity construction. - _MICROUNIT_REQUIRED_CPS_COLUMNS = ( - "PH_SEQ", - "A_LINENO", - "A_AGE", - "A_MARITL", - "A_SPOUSE", - "PEPAR1", - "PEPAR2", - "A_EXPRRP", - ) - - def _build_policyengine_tax_units_via_microunit( - self, - persons: pd.DataFrame, - *, - start_tax_unit_id: int = 0, - allow_normalized_adapter: bool | None = None, - ) -> tuple[pd.DataFrame, pd.DataFrame, set[Any]] | None: - """Reconstruct tax units by delegating to ``microunit`` (issue #113). - - This is microplex's **default** tax-unit constructor for CPS-derived - frames. 
``microunit`` is the rules-based engine that *replaces* the - unreliable CPS-provided ``tax_unit_id`` (Census ``TAX_ID``): when the - frame carries the real CPS pointer fields, the high-fidelity adapter - (#115) builds microunit's CPS contract and microunit re-partitions each - household from scratch, intentionally overriding any incoming - ``tax_unit_id``. The ``policyengine_prefer_existing_tax_unit_ids`` / - :meth:`_build_policyengine_tax_units_from_existing_ids` path is a - **fallback** for the households this method does not construct -- it runs - *after* this one, on the remaining households -- not a parallel - authority. SPM/family/marital group IDs are preserved separately (#112) - and are not touched here, so "keep the source SPM units, replace the tax - units" holds. - - Delegation runs when ``persons`` carries the raw CPS columns in - :attr:`_MICROUNIT_REQUIRED_CPS_COLUMNS`, or can synthesize them: the - high-fidelity adapter (#115) is used by DEFAULT when the real - ``person_number``/``spouse_person_number``/``family_relationship`` fields - are present (the production candidate carries them); the coarse - ``relationship_to_head``-only heuristic stays opt-in. When neither the - raw columns nor the high-fidelity fields are available (and the coarse - heuristic is not enabled), we return ``None`` and let the caller fall - back to the legacy role-flag reconstruction. - - .. warning:: - ``microunit`` *is* eCPS's tax-unit construction. Routing microplex - through it makes microplex's constructed tax units **converge toward - eCPS's**. Any loss change from enabling this delegation is an - *entity-convergence* effect and must be interpreted as such, not as - a quality improvement. See issue #113. - - Returns the same ``(tax_units, person_rows, households)`` triple shape as - :meth:`_build_policyengine_tax_units_from_role_flags`, or ``None`` to - defer to the caller's fallback. 
- """ - if "person_id" not in persons.columns or "household_id" not in persons.columns: - return None - cps_frame = persons - if not set(self._MICROUNIT_REQUIRED_CPS_COLUMNS).issubset(persons.columns): - # microunit is microplex's required tax-unit engine (#113). When the - # raw CPS columns are absent, synthesize its CPS contract from the - # normalized frame. The high-fidelity path (real person_number / - # spouse_person_number / family_relationship, which the production - # candidate carries) is used by DEFAULT; the coarse - # relationship_to_head-only heuristic stays opt-in via the config flag - # so minimal frames don't silently get the lossy reconstruction. - has_high_fidelity_fields = { - "person_number", - "family_relationship", - }.issubset(persons.columns) - if allow_normalized_adapter is None: - allow_normalized_adapter = has_high_fidelity_fields or bool( - getattr(self.config, "microunit_construct_from_normalized", False) - ) - if not allow_normalized_adapter: - return None - cps_frame = self._microunit_cps_frame_from_normalized(persons) - if cps_frame is None: - return None - - # Imported lazily to match this module's optional-dependency convention: - # ``microunit`` ships in the ``policyengine`` extra, and the base test - # suite must import this module without that extra installed. - from microunit import POLICYENGINE_MODE, construct_tax_units - - # microunit keys its CPS-style frame on (PH_SEQ, A_LINENO); resetting the - # index keeps row order so the returned per-person TAX_ID and role align - # positionally back onto person_rows. - person_rows = cps_frame.reset_index(drop=True).copy() - try: - person_assignments, tax_unit = construct_tax_units( - person_rows.copy(), - year=self._microunit_reference_year(person_rows), - mode=POLICYENGINE_MODE, - ) - except Exception: - # microunit raises on households it cannot resolve (e.g. no valid - # reference person). 
Never let that crash materialization — fall back - # to the caller's legacy reconstruction for the whole frame. - LOGGER.warning( - "microunit tax-unit construction failed; falling back to " - "legacy reconstruction", - exc_info=True, - ) - return None - - tax_id = pd.to_numeric(person_assignments["TAX_ID"], errors="coerce") - person_rows["tax_unit_id"] = ( - tax_id.to_numpy() + int(start_tax_unit_id) - ).astype(np.int64) - # microunit emits an authoritative per-person HEAD/SPOUSE/DEPENDENT role; - # use it directly for the filer/dependent split rather than re-deriving - # from the (possibly absent) collapsed relationship_to_head coding. - person_rows["_microunit_role"] = [ - self._decode_microunit_bytes(role) - for role in person_assignments["tax_unit_role_input"].tolist() - ] - - # microunit emits the canonical filing-status vocabulary already, but - # normalize defensively so this path can never diverge from the legacy - # paths if microunit ever changes its spelling/casing. - filing_status_by_unit = { - int(row_tax_id) + int(start_tax_unit_id): ( - self._normalize_policyengine_filing_status( - self._decode_microunit_bytes(filing_value) - ) - ) - for row_tax_id, filing_value in zip( - tax_unit["TAX_ID"].tolist(), - tax_unit["filing_status_input"].tolist(), - strict=True, - ) - } - - tax_unit_rows: list[dict[str, Any]] = [] - for unit_id, unit_persons in person_rows.groupby("tax_unit_id", sort=False): - ordered = unit_persons.sort_values( - ["_microunit_role", "age", "person_id"], - ascending=[True, False, True], - ).reset_index(drop=True) - is_filer = ordered["_microunit_role"].isin(["HEAD", "SPOUSE"]) - filer_ids = [ - int(person_id) for person_id in ordered.loc[is_filer, "person_id"] - ] - dependent_ids = [ - int(person_id) for person_id in ordered.loc[~is_filer, "person_id"] - ] - if not filer_ids: - filer_ids = [int(ordered.iloc[0]["person_id"])] - dependent_ids = [ - int(person_id) - for person_id in ordered["person_id"].tolist() - if int(person_id) not in 
filer_ids - ] - tax_unit_rows.append( - { - "tax_unit_id": int(unit_id), - "household_id": int(ordered.iloc[0]["household_id"]), - "filing_status": filing_status_by_unit.get(int(unit_id), "SINGLE"), - "member_ids": [ - int(person_id) for person_id in ordered["person_id"] - ], - "filer_ids": filer_ids, - "dependent_ids": dependent_ids, - "n_dependents": len(dependent_ids), - "total_income": float( - pd.to_numeric(ordered.get("income", 0.0), errors="coerce") - .fillna(0.0) - .sum() - ), - "tax_liability": 0.0, - **self._aggregate_policyengine_tax_unit_input_columns(ordered), - } - ) - - if not tax_unit_rows: - return None - - households = set(person_rows["household_id"].drop_duplicates().tolist()) - person_rows = person_rows.drop(columns=["_microunit_role"], errors="ignore") - return pd.DataFrame(tax_unit_rows), person_rows, households - - def _microunit_cps_frame_from_cps_fields( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - """High-fidelity (#115) build of microunit's CPS contract from the real - CPS-derived fields microplex carries at materialization: ``person_number`` - (a 1-based within-household line number), ``spouse_person_number`` (a real - spouse line pointer), ``family_relationship`` (CPS A_FAMREL) and - ``marital_status``. The household reference person (``person_number == 1``) - always anchors a valid head, so microunit never lacks one. - - Only ``PEPAR1``/``PEPAR2`` remain heuristic: a child's parents are taken to - be the household reference person (line 1) and that person's spouse (#115). - """ - frame = persons.reset_index(drop=True).copy() - hh = pd.to_numeric(frame["household_id"], errors="coerce") - pernum = ( - pd.to_numeric(frame["person_number"], errors="coerce").fillna(0).astype(int) - ) - # ``family_relationship`` arrives in either CPS A_FAMREL 1-based coding - # (1=reference person, 2=spouse, 3=child, ...) 
or the optimizer's 0-based - # coding (0=head, 1=spouse, 2=child); the rest of the pipeline detects - # this per household (see ``_normalize_relationship_to_head`` and - # ``data_sources.cps``). The A_EXPRRP / parent-pointer mapping below - # expects the 1-based scheme, so shift any 0-based household up by one -- - # otherwise a 0-based frame silently mis-codes children as spouses and - # drops their parent pointers. - famrel_raw = pd.to_numeric(frame["family_relationship"], errors="coerce") - zero_based_hh = (famrel_raw == 0).groupby(hh).transform("any").fillna(False) - famrel = famrel_raw.add(zero_based_hh.astype(int)).fillna(0).astype(int) - spouse_num = ( - pd.to_numeric(frame.get("spouse_person_number", 0), errors="coerce") - .fillna(0) - .astype(int) - ) - - frame["PH_SEQ"] = hh.astype(np.int64) - frame["A_LINENO"] = pernum - frame["A_AGE"] = ( - pd.to_numeric(frame["age"], errors="coerce").fillna(0).astype(int) - ) - frame["A_SPOUSE"] = spouse_num - - is_ref = pernum == 1 - # A_EXPRRP (microunit.CPSRelationshipCode): reference person 1, spouse 3, - # own child 5, everyone else other-relative 10. - exprrp = pd.Series(10, index=frame.index, dtype=int) - exprrp[famrel == 2] = 3 - exprrp[famrel == 3] = 5 - exprrp[is_ref] = 1 - frame["A_EXPRRP"] = exprrp - - # A_MARITL: 1 married spouse present; 4 widowed; else 7 never-married. - marital = pd.Series(7, index=frame.index, dtype=int) - marital[spouse_num > 0] = 1 - if "is_surviving_spouse" in frame.columns: - surviving = ( - pd.to_numeric(frame["is_surviving_spouse"], errors="coerce").fillna(0) - > 0 - ) - marital[surviving & (spouse_num == 0)] = 4 - frame["A_MARITL"] = marital - - # PEPAR1/PEPAR2: a child's parents are heuristically the household - # reference person (line 1) and that reference person's spouse. 
- ref_spouse_line = frame.loc[is_ref].groupby(hh[is_ref])["A_SPOUSE"].first() - frame["PEPAR1"] = 0 - frame["PEPAR2"] = 0 - is_child = famrel == 3 - frame.loc[is_child, "PEPAR1"] = 1 - frame.loc[is_child, "PEPAR2"] = ( - hh[is_child].map(ref_spouse_line).fillna(0).astype(int) - ) - - return frame - - def _microunit_cps_frame_from_normalized( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame | None: - """PROTOTYPE (issue #115): synthesize microunit's CPS-like input contract - from microplex's normalized person columns. - - ``microunit.construct_tax_units`` needs raw CPS columns (PH_SEQ/A_LINENO/ - A_AGE/A_MARITL/A_SPOUSE/PEPAR1/PEPAR2/A_EXPRRP); at PolicyEngine - materialization microplex instead carries ``household_id``/``age``/ - ``relationship_to_head``. This builds the former from the latter, mirroring - the ACS->CPS mapping microunit documents as the consumer's responsibility. - - .. warning:: - HEURISTIC AND UNVALIDATED. The ``relationship_to_head`` -> ``A_EXPRRP`` - and married -> ``A_MARITL`` maps are approximate, and ``PEPAR1``/ - ``PEPAR2`` are inferred by assuming a child's parents are the household - head and spouse. The fidelity of these maps must be validated against - the legacy reconstruction before this is trusted (see #115); it is - gated OFF by default. - - Returns ``persons`` with the eight microunit CPS columns added, or ``None`` - if the prerequisite normalized columns are absent. - """ - # Prefer the high-fidelity path when microplex carries the real CPS-derived - # pointer fields (person_number is a 1-based within-household line number; - # spouse_person_number a real spouse line pointer). Otherwise fall back to - # the coarse relationship_to_head heuristic below. 
- if { - "person_id", - "person_number", - "family_relationship", - "household_id", - "age", - }.issubset(persons.columns): - return self._microunit_cps_frame_from_cps_fields(persons) - - required = {"person_id", "household_id", "age", "relationship_to_head"} - if not required.issubset(persons.columns): - return None - - # CPS A_EXPRRP recode (microunit.CPSRelationshipCode): 1 reference person, - # 3 husband, 5 own child, 10 other relative. - exprrp_by_rel = {0: 1, 1: 3, 2: 5, 3: 10} - - frame = persons.reset_index(drop=True).copy() - rel = ( - pd.to_numeric(frame["relationship_to_head"], errors="coerce") - .fillna(3) - .astype(int) - ) - age = pd.to_numeric(frame["age"], errors="coerce").fillna(0).astype(int) - - # Per-household line numbers (1-based, unique within household): head - # first, then spouse, then everyone else oldest-first. - frame = frame.assign(_rel=rel.to_numpy(), _age=age.to_numpy()) - frame = frame.sort_values( - ["household_id", "_rel", "_age", "person_id"], - ascending=[True, True, False, True], - ).reset_index(drop=True) - frame["A_LINENO"] = frame.groupby("household_id", sort=False).cumcount() + 1 - # Guarantee exactly one household head (microunit requires a single - # reference person per PH_SEQ, else it raises). After the head-first sort - # the line-1 member is the most head-like; make it the head and demote - # any other rows that mapped to head (multi-family / headless households). - is_line1 = frame["A_LINENO"] == 1 - frame.loc[is_line1, "_rel"] = 0 - frame.loc[~is_line1 & (frame["_rel"] == 0), "_rel"] = 3 - frame["PH_SEQ"] = pd.to_numeric(frame["household_id"], errors="coerce").astype( - np.int64 - ) - frame["A_AGE"] = frame["_age"] - frame["A_EXPRRP"] = frame["_rel"].map(exprrp_by_rel).fillna(10).astype(int) - - # Head/spouse line numbers per household, for spouse pointers + marital. 
- head_line = ( - frame.loc[frame["_rel"] == 0].groupby("household_id")["A_LINENO"].first() - ) - spouse_line = ( - frame.loc[frame["_rel"] == 1].groupby("household_id")["A_LINENO"].first() - ) - hh = frame["household_id"] - is_head = frame["_rel"] == 0 - is_spouse = frame["_rel"] == 1 - is_child = frame["_rel"] == 2 - has_spouse = hh.map(spouse_line).notna() - - frame["A_SPOUSE"] = 0 - frame.loc[is_head, "A_SPOUSE"] = ( - hh[is_head].map(spouse_line).fillna(0).astype(int) - ) - frame.loc[is_spouse, "A_SPOUSE"] = ( - hh[is_spouse].map(head_line).fillna(0).astype(int) - ) - - # A_MARITL: 1 = married, spouse present (head/spouse of a couple); else - # 7 = never married. microunit only needs the married-vs-not distinction. - frame["A_MARITL"] = 7 - frame.loc[(is_head | is_spouse) & has_spouse, "A_MARITL"] = 1 - - # PEPAR1/PEPAR2: assume a child's parents are the household head + spouse. - frame["PEPAR1"] = 0 - frame["PEPAR2"] = 0 - frame.loc[is_child, "PEPAR1"] = ( - hh[is_child].map(head_line).fillna(0).astype(int) - ) - frame.loc[is_child, "PEPAR2"] = ( - hh[is_child].map(spouse_line).fillna(0).astype(int) - ) - - return frame.drop(columns=["_rel", "_age"]) - - @staticmethod - def _decode_microunit_bytes(value: Any) -> str: - """Decode a ``microunit`` bytes-typed status/role into an upper string.""" - if isinstance(value, bytes): - return value.decode() - return str(value) - - def _microunit_reference_year(self, persons: pd.DataFrame) -> int: - """Year passed to ``microunit`` for its dependency income thresholds. - - Prefers an explicit ``year``/``tax_year`` column when the frame carries - one; otherwise falls back to the pipeline's configured reference year so - the only year-dependent behavior (the qualifying-relative gross income - limit) matches the rest of the pipeline. TODO(#113): thread the dataset - reference year through entity construction explicitly. 
- """ - for column in ("year", "tax_year"): - if column in persons.columns: - values = pd.to_numeric(persons[column], errors="coerce").dropna() - if not values.empty: - return int(values.iloc[0]) - configured = getattr(self.config, "reference_year", None) - if configured is not None: - return int(configured) - return 2024 - - def _build_policyengine_tax_units_from_role_flags( - self, - persons: pd.DataFrame, - *, - start_tax_unit_id: int = 0, - ) -> tuple[pd.DataFrame, pd.DataFrame, set[Any]] | None: - # Issue #113: when the frame carries microunit's CPS-style input - # columns, delegate the reconstruction to microunit. Otherwise fall - # through to the legacy role-flag reconstruction below (the current - # production path, since these columns are not yet threaded through). - microunit_result = self._build_policyengine_tax_units_via_microunit( - persons, - start_tax_unit_id=start_tax_unit_id, - ) - if microunit_result is not None: - return microunit_result - - role_columns = { - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - } - if ( - not role_columns.issubset(persons.columns) - or "person_id" not in persons.columns - ): - return None - - person_rows = persons.copy() - raw_head_flag = self._role_flag_series( - person_rows, - "is_tax_unit_head", - ) - raw_spouse_flag = self._role_flag_series( - person_rows, - "is_tax_unit_spouse", - ) - raw_dependent_flag = self._role_flag_series( - person_rows, - "is_tax_unit_dependent", - ) - ( - person_rows["_is_tax_unit_head_flag"], - person_rows["_is_tax_unit_spouse_flag"], - person_rows["_is_tax_unit_dependent_flag"], - ) = self._resolve_tax_unit_role_flags( - person_rows, - head_flag=raw_head_flag, - spouse_flag=raw_spouse_flag, - dependent_flag=raw_dependent_flag, - ) - - tax_unit_rows: list[dict[str, Any]] = [] - person_to_tax_unit: dict[int, int] = {} - role_households: set[Any] = set() - next_tax_unit_id = int(start_tax_unit_id) - - for household_id, household_persons in person_rows.groupby( - 
"household_id", - sort=False, - ): - ordered = household_persons.sort_values( - ["relationship_to_head", "age", "person_id"], - ascending=[True, False, True], - ).copy() - ordered = self._cohere_tax_unit_role_flags_for_household(ordered) - head_rows = ordered.loc[ordered["_is_tax_unit_head_flag"]] - if head_rows.empty: - continue - - head_ids = [int(person_id) for person_id in head_rows["person_id"].tolist()] - head_to_spouses = self._assign_role_flag_spouses(ordered, head_ids) - head_to_dependents = self._assign_role_flag_dependents(ordered, head_ids) - assigned_person_ids: set[int] = set() - - for head_id in head_ids: - spouse_ids = head_to_spouses.get(head_id, []) - dependent_ids = head_to_dependents.get(head_id, []) - unit_person_ids = list( - dict.fromkeys([head_id, *spouse_ids, *dependent_ids]) - ) - unit_persons = ordered.loc[ - ordered["person_id"].astype(int).isin(unit_person_ids) - ].copy() - if unit_persons.empty: - continue - global_tax_unit_id = next_tax_unit_id - next_tax_unit_id += 1 - for person_id in unit_person_ids: - person_to_tax_unit[int(person_id)] = global_tax_unit_id - assigned_person_ids.add(int(person_id)) - - filing_status = self._infer_role_flag_tax_unit_filing_status( - unit_persons, - head_id=head_id, - spouse_ids=spouse_ids, - dependent_ids=dependent_ids, - ) - tax_unit_rows.append( - { - "tax_unit_id": global_tax_unit_id, - "household_id": int(household_id), - "filing_status": filing_status, - "member_ids": [int(person_id) for person_id in unit_person_ids], - "filer_ids": [head_id, *spouse_ids], - "dependent_ids": dependent_ids, - "n_dependents": len(dependent_ids), - "total_income": float( - pd.to_numeric( - unit_persons.get("income", 0.0), - errors="coerce", - ) - .fillna(0.0) - .sum() - ), - "tax_liability": 0.0, - **self._aggregate_policyengine_tax_unit_input_columns( - unit_persons - ), - } - ) - - unassigned = [ - int(person_id) - for person_id in ordered["person_id"].tolist() - if int(person_id) not in assigned_person_ids - ] 
- for person_id in unassigned: - unit_persons = ordered.loc[ - ordered["person_id"].astype(int).eq(person_id) - ].copy() - global_tax_unit_id = next_tax_unit_id - next_tax_unit_id += 1 - person_to_tax_unit[person_id] = global_tax_unit_id - tax_unit_rows.append( - { - "tax_unit_id": global_tax_unit_id, - "household_id": int(household_id), - "filing_status": "SINGLE", - "member_ids": [person_id], - "filer_ids": [person_id], - "dependent_ids": [], - "n_dependents": 0, - "total_income": float( - pd.to_numeric( - unit_persons.get("income", 0.0), - errors="coerce", - ) - .fillna(0.0) - .sum() - ), - "tax_liability": 0.0, - **self._aggregate_policyengine_tax_unit_input_columns( - unit_persons - ), - } - ) - - role_households.add(household_id) - - if not tax_unit_rows: - return None - - result_persons = person_rows.loc[ - person_rows["household_id"].isin(role_households) - ].copy() - result_persons["tax_unit_id"] = result_persons["person_id"].map( - person_to_tax_unit - ) - result_persons = result_persons.drop( - columns=[ - "_is_tax_unit_head_flag", - "_is_tax_unit_spouse_flag", - "_is_tax_unit_dependent_flag", - ], - errors="ignore", - ) - return pd.DataFrame(tax_unit_rows), result_persons, role_households - - def _build_policyengine_tax_units_from_existing_ids( - self, - persons: pd.DataFrame, - *, - start_tax_unit_id: int = 0, - ) -> tuple[pd.DataFrame, pd.DataFrame, set[Any]] | None: - if "tax_unit_id" not in persons.columns or "person_id" not in persons.columns: - return None - - raw_tax_unit_id = pd.to_numeric(persons["tax_unit_id"], errors="coerce") - if raw_tax_unit_id.isna().all(): - return None - - person_rows = persons.copy() - household_has_complete_tax_unit_ids = ( - raw_tax_unit_id.notna() - .groupby(person_rows["household_id"]) - .transform("all") - ) - if not bool(household_has_complete_tax_unit_ids.any()): - return None - - person_rows = person_rows.loc[household_has_complete_tax_unit_ids].copy() - raw_tax_unit_id = raw_tax_unit_id.loc[person_rows.index] 
- preserved_households = set( - person_rows["household_id"].drop_duplicates().tolist() - ) - tax_unit_key = pd.DataFrame( - { - "household_id": person_rows["household_id"], - "tax_unit_id": raw_tax_unit_id, - } - ) - - households_per_tax_unit = ( - tax_unit_key.assign(_household_id=person_rows["household_id"]) - .groupby("tax_unit_id")["_household_id"] - .nunique() - ) - if bool((households_per_tax_unit > 1).any()): - normalized_tax_unit_id = pd.factorize( - pd.MultiIndex.from_frame(tax_unit_key), sort=False - )[0].astype(np.int64) + int(start_tax_unit_id) - person_rows["tax_unit_id"] = normalized_tax_unit_id - else: - raw_tax_unit_id = raw_tax_unit_id.astype(np.int64) - if int(start_tax_unit_id) == 0: - person_rows["tax_unit_id"] = raw_tax_unit_id - else: - raw_min = int(raw_tax_unit_id.min()) if len(raw_tax_unit_id) else 0 - person_rows["tax_unit_id"] = ( - raw_tax_unit_id - raw_min + int(start_tax_unit_id) - ).astype(np.int64) - - tax_unit_rows: list[dict[str, Any]] = [] - for tax_unit_id, unit_persons in person_rows.groupby("tax_unit_id", sort=False): - ordered = unit_persons.sort_values( - ["relationship_to_head", "age", "person_id"], - ascending=[True, False, True], - ).reset_index(drop=True) - filer_ids, dependent_ids = self._split_preserved_tax_unit_members(ordered) - if not filer_ids: - filer_ids = [int(ordered.iloc[0]["person_id"])] - dependent_ids = [ - int(person_id) - for person_id in ordered["person_id"].tolist() - if int(person_id) not in filer_ids - ] - filing_status = self._infer_preserved_tax_unit_filing_status( - ordered, - filer_ids=filer_ids, - dependent_ids=dependent_ids, - ) - tax_unit_rows.append( - { - "tax_unit_id": int(tax_unit_id), - "household_id": int(ordered.iloc[0]["household_id"]), - "filing_status": filing_status, - "member_ids": [ - int(person_id) for person_id in ordered["person_id"] - ], - "filer_ids": filer_ids, - "dependent_ids": dependent_ids, - "n_dependents": len(dependent_ids), - "total_income": float( - 
pd.to_numeric(ordered.get("income", 0.0), errors="coerce") - .fillna(0.0) - .sum() - ), - "tax_liability": 0.0, - **self._aggregate_policyengine_tax_unit_input_columns(ordered), - } - ) - - return pd.DataFrame(tax_unit_rows), person_rows, preserved_households - - def _role_flag_series(self, frame: pd.DataFrame, column: str) -> pd.Series: - if column not in frame.columns: - return pd.Series(False, index=frame.index, dtype=bool) - return pd.to_numeric(frame[column], errors="coerce").fillna(0.0).gt(0.5) - - def _resolve_tax_unit_role_flags( - self, - frame: pd.DataFrame, - *, - head_flag: pd.Series, - spouse_flag: pd.Series, - dependent_flag: pd.Series, - ) -> tuple[pd.Series, pd.Series, pd.Series]: - relationship = ( - pd.to_numeric(frame["relationship_to_head"], errors="coerce") - .fillna(-1) - .astype(int) - if "relationship_to_head" in frame.columns - else pd.Series(-1, index=frame.index, dtype=int) - ) - family_relationship = ( - pd.to_numeric(frame["family_relationship"], errors="coerce") - .fillna(-1) - .astype(int) - if "family_relationship" in frame.columns - else pd.Series(-1, index=frame.index, dtype=int) - ) - head_hint = relationship.eq(0) | family_relationship.isin([0, 1]) - spouse_hint = relationship.eq(1) | family_relationship.eq(2) - dependent_hint = relationship.isin([2, 3]) | family_relationship.isin([3, 4]) - - resolved_dependent = ( - dependent_flag - & (~spouse_flag | dependent_hint | ~spouse_hint) - & (~head_flag | dependent_hint | ~head_hint) - ) - resolved_spouse = ( - spouse_flag & ~resolved_dependent & (~head_flag | spouse_hint | ~head_hint) - ) - resolved_head = head_flag & ~resolved_spouse & ~resolved_dependent - return resolved_head, resolved_spouse, resolved_dependent - - def _cohere_tax_unit_role_flags_for_household( - self, - household_persons: pd.DataFrame, - ) -> pd.DataFrame: - if household_persons.empty: - return household_persons - - result = household_persons.copy() - relationship = ( - 
pd.to_numeric(result["relationship_to_head"], errors="coerce") - .fillna(-1) - .astype(int) - if "relationship_to_head" in result.columns - else pd.Series(-1, index=result.index, dtype=int) - ) - family_relationship = ( - pd.to_numeric(result["family_relationship"], errors="coerce") - .fillna(-1) - .astype(int) - if "family_relationship" in result.columns - else pd.Series(-1, index=result.index, dtype=int) - ) - age = ( - pd.to_numeric(result["age"], errors="coerce").fillna(0.0) - if "age" in result.columns - else pd.Series(0.0, index=result.index, dtype=float) - ) - income = ( - pd.to_numeric(result["income"], errors="coerce").fillna(0.0) - if "income" in result.columns - else pd.Series(0.0, index=result.index, dtype=float) - ) - head_hint = relationship.eq(0) | family_relationship.isin([0, 1]) - spouse_hint = relationship.eq(1) | family_relationship.eq(2) - dependent_hint = relationship.isin([2, 3]) | family_relationship.isin([3, 4]) - - head_flag = result["_is_tax_unit_head_flag"].astype(bool) - spouse_flag = result["_is_tax_unit_spouse_flag"].astype(bool) - dependent_flag = result["_is_tax_unit_dependent_flag"].astype(bool) - - rank = pd.Series(4, index=result.index, dtype=int) - rank.loc[age.ge(18)] = 3 - rank.loc[head_flag] = 2 - rank.loc[head_hint] = 1 - rank.loc[head_flag & head_hint] = 0 - primary_index = ( - pd.DataFrame( - { - "rank": rank, - "relationship": relationship.where(relationship.ge(0), 99), - "age": -age, - "person_id": pd.to_numeric( - result["person_id"], - errors="coerce", - ).fillna(0), - }, - index=result.index, - ) - .sort_values(["rank", "relationship", "age", "person_id"]) - .index[0] - ) - - coherent_head = pd.Series(False, index=result.index, dtype=bool) - coherent_spouse = pd.Series(False, index=result.index, dtype=bool) - coherent_dependent = pd.Series(False, index=result.index, dtype=bool) - coherent_head.loc[primary_index] = True - - primary_person_number = self._household_role_person_number( - result, - primary_index, - ) - 
primary_spouse_number = self._household_role_spouse_number( - result, - primary_index, - ) - spouse_candidates = result.index[ - (result.index != primary_index) & ~dependent_flag - ] - spouse_index: Any | None = None - if primary_spouse_number > 0: - spouse_index = self._find_household_role_person_number_index( - result, - spouse_candidates, - primary_spouse_number, - ) - if ( - spouse_index is not None - and primary_person_number > 0 - and self._household_role_spouse_number(result, spouse_index) - not in {0, primary_person_number} - ): - spouse_index = None - if spouse_index is None: - spouse_pool = spouse_candidates[ - ( - spouse_flag.loc[spouse_candidates] - | ( - spouse_hint.loc[spouse_candidates] - & self._role_flag_series(result, "tax_unit_is_joint").loc[ - spouse_candidates - ] - ) - ) - ] - if len(spouse_pool): - spouse_index = ( - pd.DataFrame( - { - "source_spouse": ~spouse_flag.loc[spouse_pool], - "relationship": ~spouse_hint.loc[spouse_pool], - "age": -age.loc[spouse_pool], - "person_id": pd.to_numeric( - result.loc[spouse_pool, "person_id"], - errors="coerce", - ).fillna(0), - }, - index=spouse_pool, - ) - .sort_values(["source_spouse", "relationship", "age", "person_id"]) - .index[0] - ) - if spouse_index is not None: - coherent_spouse.loc[spouse_index] = True - - available = ~(coherent_head | coherent_spouse) - coherent_dependent.loc[ - available - & ( - dependent_flag - | (dependent_hint & (age.lt(24) | income.le(0.0))) - | (spouse_hint & income.le(0.0)) - ) - ] = True - - available = ~(coherent_head | coherent_spouse | coherent_dependent) - coherent_head.loc[available & age.ge(18) & (head_flag | income.gt(0.0))] = True - - coherent_dependent.loc[ - ~(coherent_head | coherent_spouse | coherent_dependent) - & (age.lt(18) | dependent_hint | income.le(0.0)) - ] = True - coherent_head.loc[~(coherent_head | coherent_spouse | coherent_dependent)] = ( - True - ) - - result["_is_tax_unit_head_flag"] = coherent_head - result["_is_tax_unit_spouse_flag"] = 
coherent_spouse - result["_is_tax_unit_dependent_flag"] = coherent_dependent - return result - - def _household_role_person_number( - self, - household_persons: pd.DataFrame, - index: Any, - ) -> int: - if "person_number" not in household_persons.columns: - return 0 - value = pd.to_numeric( - pd.Series([household_persons.loc[index, "person_number"]]), - errors="coerce", - ).fillna(0) - return int(value.iloc[0]) - - def _household_role_spouse_number( - self, - household_persons: pd.DataFrame, - index: Any, - ) -> int: - if "spouse_person_number" not in household_persons.columns: - return 0 - value = pd.to_numeric( - pd.Series([household_persons.loc[index, "spouse_person_number"]]), - errors="coerce", - ).fillna(0) - return int(value.iloc[0]) - - def _find_household_role_person_number_index( - self, - household_persons: pd.DataFrame, - candidate_indices: pd.Index, - person_number: int, - ) -> Any | None: - if "person_number" not in household_persons.columns: - return None - person_numbers = pd.to_numeric( - household_persons.loc[candidate_indices, "person_number"], - errors="coerce", - ).fillna(0) - matches = person_numbers.index[person_numbers.astype(int).eq(person_number)] - return matches[0] if len(matches) else None - - def _assign_role_flag_spouses( - self, - household_persons: pd.DataFrame, - head_ids: list[int], - ) -> dict[int, list[int]]: - head_set = set(head_ids) - assignments: dict[int, list[int]] = {head_id: [] for head_id in head_ids} - spouse_rows = household_persons.loc[ - household_persons["_is_tax_unit_spouse_flag"] - ] - if spouse_rows.empty: - return assignments - - person_number = ( - pd.to_numeric( - household_persons.get("person_number"), - errors="coerce", - ) - .fillna(0) - .astype(int) - if "person_number" in household_persons.columns - else pd.Series(0, index=household_persons.index, dtype=int) - ) - spouse_number = ( - pd.to_numeric( - household_persons.get("spouse_person_number"), - errors="coerce", - ) - .fillna(0) - .astype(int) - if 
"spouse_person_number" in household_persons.columns - else pd.Series(0, index=household_persons.index, dtype=int) - ) - head_by_person_number = { - int(person_number.loc[index]): int(row["person_id"]) - for index, row in household_persons.iterrows() - if int(row["person_id"]) in head_set and int(person_number.loc[index]) > 0 - } - row_by_person_id = { - int(row["person_id"]): index for index, row in household_persons.iterrows() - } - assigned_spouses: set[int] = set() - - for index, row in spouse_rows.iterrows(): - spouse_id = int(row["person_id"]) - pointed_head_id = head_by_person_number.get(int(spouse_number.loc[index])) - if pointed_head_id is None: - spouse_person_number = int(person_number.loc[index]) - for head_id in head_ids: - head_index = row_by_person_id.get(head_id) - if head_index is None: - continue - if int(spouse_number.loc[head_index]) == spouse_person_number: - pointed_head_id = head_id - break - if pointed_head_id is None: - continue - if assignments[pointed_head_id]: - continue - assignments[pointed_head_id].append(spouse_id) - assigned_spouses.add(spouse_id) - - unassigned_spouse_ids = [ - int(person_id) - for person_id in spouse_rows["person_id"].tolist() - if int(person_id) not in assigned_spouses - ] - heads_without_spouse = [ - head_id for head_id in head_ids if not assignments[head_id] - ] - for head_id, spouse_id in zip( - heads_without_spouse, - unassigned_spouse_ids, - strict=False, - ): - assignments[head_id].append(spouse_id) - - return assignments - - def _assign_role_flag_dependents( - self, - household_persons: pd.DataFrame, - head_ids: list[int], - ) -> dict[int, list[int]]: - assignments: dict[int, list[int]] = {head_id: [] for head_id in head_ids} - dependent_rows = household_persons.loc[ - household_persons["_is_tax_unit_dependent_flag"] - ].sort_values(["age", "person_id"], ascending=[True, True]) - if dependent_rows.empty: - return assignments - - target_counts: dict[int, int] = {} - if "tax_unit_count_dependents" in 
household_persons.columns: - count_series = pd.to_numeric( - household_persons["tax_unit_count_dependents"], - errors="coerce", - ).fillna(0) - for head_id in head_ids: - head_mask = household_persons["person_id"].astype(int).eq(head_id) - if not bool(head_mask.any()): - target_counts[head_id] = 0 - continue - target_counts[head_id] = max( - 0, - int(round(float(count_series.loc[head_mask].iloc[0]))), - ) - else: - target_counts = {head_id: 0 for head_id in head_ids} - - for _, dependent in dependent_rows.iterrows(): - dependent_id = int(dependent["person_id"]) - candidates = [ - head_id - for head_id in head_ids - if len(assignments[head_id]) < target_counts.get(head_id, 0) - ] - head_id = candidates[0] if candidates else head_ids[0] - assignments[head_id].append(dependent_id) - - return assignments - - def _infer_role_flag_tax_unit_filing_status( - self, - unit_persons: pd.DataFrame, - *, - head_id: int, - spouse_ids: list[int], - dependent_ids: list[int], - ) -> str: - if spouse_ids: - return "JOINT" - - head_rows = unit_persons.loc[unit_persons["person_id"].astype(int).eq(head_id)] - if head_rows.empty: - return "SINGLE" - hinted_status = self._infer_single_filer_filing_status( - head_rows.iloc[0], - has_dependents=bool(dependent_ids), - ) - if hinted_status is not None: - return hinted_status - return "SINGLE" - - def _aggregate_policyengine_tax_unit_input_columns( - self, - unit_persons: pd.DataFrame, - ) -> dict[str, Any]: - columns = ( - "domestic_production_ald", - "health_savings_account_ald", - "recapture_of_investment_credit", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "unreported_payroll_tax", - ) - aggregated: dict[str, Any] = {} - for column in columns: - if column not in unit_persons.columns: - continue - values = pd.to_numeric(unit_persons[column], errors="coerce").fillna(0.0) - nonzero_values = values.loc[~np.isclose(values.to_numpy(dtype=float), 0.0)] - if 
len(nonzero_values) > 1 and nonzero_values.nunique(dropna=True) == 1: - aggregated[column] = float(nonzero_values.iloc[0]) - continue - aggregated[column] = float(values.sum()) - for column in ("health_insurance_premiums_without_medicare_part_b",): - if column not in unit_persons.columns: - continue - values = pd.to_numeric(unit_persons[column], errors="coerce").fillna(0.0) - aggregated[column] = float(values.clip(lower=0.0).sum()) - for child_count_column in ("eitc_children", "eitc_child_count"): - if child_count_column not in unit_persons.columns: - continue - values = pd.to_numeric( - unit_persons[child_count_column], errors="coerce" - ).fillna(0.0) - aggregated[EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN] = float(values.max()) - break - employment_income = pd.to_numeric( - unit_persons.get("employment_income", 0.0), errors="coerce" - ) - if isinstance(employment_income, pd.Series): - aggregated[VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN] = float( - employment_income.fillna(0.0).clip(lower=0.0).sum() - ) - age = pd.to_numeric(unit_persons.get("age", 0.0), errors="coerce").fillna(0.0) - head_mask = self._normal_bool_series( - unit_persons.get("is_tax_unit_head", False), - index=unit_persons.index, - ) - if not bool(head_mask.any()) and "relationship_to_head" in unit_persons.columns: - head_mask = ( - pd.to_numeric(unit_persons["relationship_to_head"], errors="coerce") - .fillna(-1) - .eq(0) - ) - head_age = age.loc[head_mask].iloc[0] if bool(head_mask.any()) else age.iloc[0] - aggregated[VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN] = float(head_age) - for column in ( - "interest_deduction", - "deductible_mortgage_interest", - "mortgage_interest_paid", - "first_home_mortgage_interest", - "second_home_mortgage_interest", - ): - if column not in unit_persons.columns: - continue - values = pd.to_numeric(unit_persons[column], errors="coerce").fillna(0.0) - aggregated[column] = float(values.clip(lower=0.0).sum()) - for column in ( - "first_home_mortgage_balance", - 
"second_home_mortgage_balance", - "scf_mortgage_debt", - "imputed_first_home_mortgage_balance_hint", - "imputed_second_home_mortgage_balance_hint", - ): - if column not in unit_persons.columns: - continue - values = pd.to_numeric(unit_persons[column], errors="coerce").fillna(0.0) - aggregated[column] = float(values.clip(lower=0.0).max()) - for column in ( - "first_home_mortgage_origination_year", - "second_home_mortgage_origination_year", - ): - if column not in unit_persons.columns: - continue - values = pd.to_numeric(unit_persons[column], errors="coerce").fillna(0.0) - positive = values.loc[values.gt(0.0)] - if not positive.empty: - aggregated[column] = int(positive.iloc[0]) - for boolean_column in ( - "takes_up_aca_if_eligible", - "takes_up_dc_ptc", - "takes_up_eitc", - "would_file_taxes_voluntarily", - ): - value = self._infer_policyengine_bool_for_group( - unit_persons, boolean_column - ) - if value is not None: - aggregated[boolean_column] = value - return aggregated - - def _attach_policyengine_tax_unit_source_inputs( - self, - tax_units: pd.DataFrame, - ) -> pd.DataFrame: - """Attach structural tax-unit inputs derived from source columns.""" - result = tax_units.copy() - zero = pd.Series(0.0, index=result.index, dtype=float) - - def first_nonzero_or_present(*columns: str) -> pd.Series: - values = zero.copy() - found = False - for column in columns: - if column not in result.columns: - continue - candidate = ( - pd.to_numeric(result[column], errors="coerce") - .fillna(0.0) - .astype(float) - ) - if not found: - values = candidate.copy() - found = True - continue - values = values.where(values.ne(0.0), candidate) - return values if found else zero.copy() - - mortgage_interest = first_nonzero_or_present( - "first_home_mortgage_interest", - "deductible_mortgage_interest", - "mortgage_interest_paid", - ).clip(lower=0.0) - if ( - "first_home_mortgage_interest" in result.columns - or "deductible_mortgage_interest" in result.columns - or "mortgage_interest_paid" in 
result.columns - ): - result["first_home_mortgage_interest"] = mortgage_interest - - if "interest_deduction" in result.columns: - interest_deduction = first_nonzero_or_present( - "interest_deduction", - "first_home_mortgage_interest", - ).clip(lower=0.0) - result["interest_deduction"] = np.maximum( - interest_deduction, - mortgage_interest, - ) - - balance_hint = first_nonzero_or_present( - "first_home_mortgage_balance", - "imputed_first_home_mortgage_balance_hint", - "scf_mortgage_debt", - ).clip(lower=0.0) - if ( - "first_home_mortgage_balance" in result.columns - or "imputed_first_home_mortgage_balance_hint" in result.columns - or "scf_mortgage_debt" in result.columns - or bool(mortgage_interest.gt(0.0).any()) - ): - interest_implied_balance = mortgage_interest / 0.06 - result["first_home_mortgage_balance"] = np.maximum( - balance_hint, - interest_implied_balance, - ).where(mortgage_interest.gt(0.0) | balance_hint.gt(0.0), 0.0) - - origination_year = first_nonzero_or_present( - "first_home_mortgage_origination_year", - ) - if "first_home_mortgage_origination_year" in result.columns or bool( - mortgage_interest.gt(0.0).any() - ): - target_year = int( - self.config.policyengine_dataset_year - or self.config.policyengine_target_period - or 2024 - ) - fallback_year = max(1988, target_year - 10) - result["first_home_mortgage_origination_year"] = ( - origination_year.where(origination_year.gt(0.0), fallback_year) - .where(mortgage_interest.gt(0.0), 0.0) - .astype(int) - ) - return result - - def _infer_policyengine_bool_for_group( - self, - group_rows: pd.DataFrame, - column: str, - ) -> bool | None: - if column in group_rows.columns: - return bool( - self._normal_bool_series( - group_rows[column], index=group_rows.index - ).any() - ) - return None - - def _attach_policyengine_aca_takeup( - self, - tax_units: pd.DataFrame, - ) -> pd.DataFrame: - """Attach eCPS-style ACA take-up input before PE materialization.""" - result = tax_units.copy() - column = 
"takes_up_aca_if_eligible" - if column in result.columns: - result[column] = ( - pd.to_numeric(result[column], errors="coerce") - .fillna(0.0) - .ne(0.0) - .astype(bool) - ) - return result - - year = int( - self.config.policyengine_dataset_year - or self.config.policyengine_target_period - or 2024 - ) - rate = _load_microplex_takeup_rate("aca", year) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < rate - return result - - def _attach_policyengine_tax_unit_takeup_inputs( - self, - tax_units: pd.DataFrame, - ) -> pd.DataFrame: - """Attach eCPS-style tax-unit stochastic inputs before materialization.""" - result = self._attach_policyengine_aca_takeup(tax_units) - result = self._attach_policyengine_simple_tax_unit_takeup( - result, - column="takes_up_dc_ptc", - rate_key="dc_ptc", - ) - result = self._attach_policyengine_eitc_takeup(result) - return self._attach_policyengine_voluntary_filing(result) - - def _attach_policyengine_marketplace_plan_benchmark_ratio( - self, - tables: PolicyEngineUSEntityTableBundle, - *, - target_period: int, - ) -> PolicyEngineUSEntityTableBundle: - """Derive eCPS's persisted selected Marketplace plan ratio input.""" - tax_units = tables.tax_units - if tax_units is None or tax_units.empty: - return tables - if not { - "health_insurance_premiums_without_medicare_part_b", - "takes_up_aca_if_eligible", - }.issubset(tax_units.columns): - return tables - - missing_intermediates = { - column for column in ("aca_ptc", "slcsp") if column not in tax_units.columns - } - materialized_tables = tables - if missing_intermediates: - materialization_result = materialize_policyengine_us_variables_safely( - tables, - variables=tuple(sorted(missing_intermediates)), - period=target_period, - dataset_year=self.config.policyengine_dataset_year or target_period, - simulation_cls=self.config.policyengine_simulation_cls, - direct_override_variables=self.config.policyengine_direct_override_variables, - 
batch_size=self.config.policyengine_materialize_batch_size, - ) - materialized_tables = materialization_result.tables - tax_units = materialized_tables.tax_units - if tax_units is None: - return tables - still_missing = sorted(missing_intermediates - set(tax_units.columns)) - if still_missing: - LOGGER.warning( - "Could not derive selected Marketplace plan benchmark ratio; " - "missing PE intermediate(s): %s", - ", ".join(still_missing), - ) - return materialized_tables - - tax_units = tax_units.copy() - tax_units["selected_marketplace_plan_benchmark_ratio"] = ( - compute_marketplace_plan_benchmark_ratio( - reported_premium=pd.to_numeric( - tax_units["health_insurance_premiums_without_medicare_part_b"], - errors="coerce", - ).fillna(0.0), - aca_ptc=pd.to_numeric(tax_units["aca_ptc"], errors="coerce").fillna( - 0.0 - ), - slcsp=pd.to_numeric(tax_units["slcsp"], errors="coerce").fillna(0.0), - takes_up_aca=self._normal_bool_series( - tax_units["takes_up_aca_if_eligible"], - index=tax_units.index, - ), - ) - ) - return PolicyEngineUSEntityTableBundle( - households=materialized_tables.households, - persons=materialized_tables.persons, - tax_units=tax_units, - spm_units=materialized_tables.spm_units, - families=materialized_tables.families, - marital_units=materialized_tables.marital_units, - ) - - def _attach_policyengine_simple_tax_unit_takeup( - self, - tax_units: pd.DataFrame, - *, - column: str, - rate_key: str, - ) -> pd.DataFrame: - result = tax_units.copy() - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - year = self._policyengine_takeup_year() - rate = _load_microplex_takeup_rate(rate_key, year) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < rate - return result - - def _attach_policyengine_eitc_takeup( - self, - tax_units: pd.DataFrame, - ) -> pd.DataFrame: - result = tax_units.copy() - column = "takes_up_eitc" - if column in 
result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - year = self._policyengine_takeup_year() - rates = _load_microplex_eitc_takeup_rates(year) - child_count_column = ( - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN - if EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN in result.columns - else "n_dependents" - ) - raw_dependent_count = ( - result[child_count_column] - if child_count_column in result.columns - else pd.Series(0, index=result.index) - ) - dependent_count = ( - pd.to_numeric(raw_dependent_count, errors="coerce") - .fillna(0) - .clip(lower=0, upper=3) - .astype(int) - ) - takeup_rate = dependent_count.map(lambda count: rates.get(int(count), 0.85)) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < takeup_rate.to_numpy(dtype=float) - return result - - def _attach_policyengine_voluntary_filing( - self, - tax_units: pd.DataFrame, - ) -> pd.DataFrame: - result = tax_units.copy() - column = "would_file_taxes_voluntarily" - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result.drop( - columns=[ - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN, - VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - ], - errors="ignore", - ) - - year = self._policyengine_takeup_year() - rates = _load_microplex_voluntary_filing_rates(year) - takes_up_eitc = self._normal_bool_series( - result.get("takes_up_eitc", False), - index=result.index, - ) - child_count = self._tax_unit_child_count_for_takeup(result) - wage_income = pd.to_numeric( - result.get( - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - pd.Series(0.0, index=result.index), - ), - errors="coerce", - ).fillna(0.0) - age_head = pd.to_numeric( - result.get( - VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - pd.Series(0.0, index=result.index), - ), - errors="coerce", - ).fillna(0.0) - takeup_rate = self._voluntary_filing_rate_by_tax_unit( - rates, 
- child_count=child_count, - wage_income=wage_income, - age_head=age_head, - ) - rng = _microplex_seeded_rng(column) - result[column] = (~takes_up_eitc.to_numpy(dtype=bool)) & ( - rng.random(len(result)) < takeup_rate.to_numpy(dtype=float) - ) - result = result.drop( - columns=[ - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN, - VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - ], - errors="ignore", - ) - return result - - def _tax_unit_child_count_for_takeup(self, tax_units: pd.DataFrame) -> pd.Series: - child_count_column = ( - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN - if EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN in tax_units.columns - else "n_dependents" - ) - raw_child_count = ( - tax_units[child_count_column] - if child_count_column in tax_units.columns - else pd.Series(0, index=tax_units.index) - ) - return ( - pd.to_numeric(raw_child_count, errors="coerce") - .fillna(0) - .clip(lower=0, upper=3) - .astype(int) - ) - - @staticmethod - def _voluntary_filing_rate_by_tax_unit( - rates: dict, - *, - child_count: pd.Series, - wage_income: pd.Series, - age_head: pd.Series, - ) -> pd.Series: - children_bin = np.where( - child_count.to_numpy(dtype=int) > 0, "with_children", "no_children" - ) - wage_values = wage_income.to_numpy(dtype=float) - wage_bin = np.select( - [wage_values <= 0.0, wage_values < 15_000.0, wage_values < 30_000.0], - ["zero", "low", "medium"], - default="high", - ) - age_bin = np.where( - age_head.to_numpy(dtype=float) >= 65.0, "age_65_plus", "under_65" - ) - values = [ - rates.get(children, {}) - .get(wage, {}) - .get(age, DEFAULT_VOLUNTARY_FILING_RATE) - for children, wage, age in zip(children_bin, wage_bin, age_bin, strict=True) - ] - return pd.Series(values, index=child_count.index, dtype=float) - - def _attach_policyengine_person_takeup_inputs( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - """Attach eCPS-style person stochastic inputs before materialization.""" - result = 
self._attach_policyengine_medicaid_takeup(persons) - result = self._attach_policyengine_pregnancy_inputs(result) - for column, rate_key in ( - ("takes_up_head_start_if_eligible", "head_start"), - ("takes_up_early_head_start_if_eligible", "early_head_start"), - ): - result = self._attach_policyengine_simple_person_takeup( - result, - column=column, - rate_key=rate_key, - ) - return result - - def _attach_policyengine_simple_person_takeup( - self, - persons: pd.DataFrame, - *, - column: str, - rate_key: str, - ) -> pd.DataFrame: - result = persons.copy() - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - year = self._policyengine_takeup_year() - rate = _load_microplex_takeup_rate(rate_key, year) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < rate - return result - - def _attach_policyengine_medicaid_takeup( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - result = persons.copy() - column = "takes_up_medicaid_if_eligible" - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - year = self._policyengine_takeup_year() - rates = _load_microplex_medicaid_takeup_rates(year) - states = self._person_state_abbreviation(result) - takeup_rate = states.map( - lambda state: rates.get(state, DEFAULT_MEDICAID_TAKEUP_RATE) - ) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < takeup_rate.to_numpy(dtype=float) - return result - - def _attach_policyengine_pregnancy_inputs( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - result = persons.copy() - column = "is_pregnant" - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - index = result.index - age = pd.to_numeric( - result.get("age", pd.Series(0.0, index=index)), - errors="coerce", - ).fillna(0.0) - if 
"is_female" in result.columns: - female = self._normal_bool_series(result["is_female"], index=index) - elif "sex" in result.columns: - female = ( - pd.to_numeric(result["sex"], errors="coerce") - .fillna(0) - .astype(int) - .eq(2) - ) - else: - female = pd.Series(False, index=index) - - year = self._policyengine_takeup_year() - rates = _load_microplex_pregnancy_rates(year) - states = self._person_state_abbreviation(result) - pregnancy_rate = states.map( - lambda state: rates.get(str(state).upper(), DEFAULT_PREGNANCY_RATE) - ).fillna(DEFAULT_PREGNANCY_RATE) - eligible = female & age.ge(15.0) & age.le(44.0) - rng = _microplex_seeded_rng(column) - result[column] = eligible.to_numpy(dtype=bool) & ( - rng.random(len(result)) < pregnancy_rate.to_numpy(dtype=float) - ) - return result - - def _attach_policyengine_wic_inputs( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - result = persons.copy() - category = self._policyengine_wic_category_for_takeup(result) - year = self._policyengine_takeup_year() - - claim_column = "would_claim_wic" - if claim_column in result.columns: - result[claim_column] = self._normal_bool_series( - result[claim_column], - index=result.index, - ) - else: - claim_rates = _load_microplex_wic_takeup_rates(year) - claim_rate = category.map( - lambda value: claim_rates.get(str(value), 0.0) - ).fillna(0.0) - rng = _microplex_seeded_rng(claim_column) - result[claim_column] = rng.random(len(result)) < claim_rate.to_numpy( - dtype=float - ) - - risk_column = "is_wic_at_nutritional_risk" - if risk_column in result.columns: - result[risk_column] = self._normal_bool_series( - result[risk_column], - index=result.index, - ) - else: - risk_rates = _load_microplex_wic_nutritional_risk_rates(year) - risk_rate = category.map( - lambda value: risk_rates.get(str(value), 0.0) - ).fillna(0.0) - receives_wic = self._normal_bool_series( - result.get("receives_wic", False), - index=result.index, - ) - rng = _microplex_seeded_rng(risk_column) - result[risk_column] 
= receives_wic | ( - rng.random(len(result)) < risk_rate.to_numpy(dtype=float) - ) - return result - - def _policyengine_wic_category_for_takeup( - self, - persons: pd.DataFrame, - ) -> pd.Series: - index = persons.index - age = pd.to_numeric( - persons.get("age", pd.Series(0.0, index=index)), - errors="coerce", - ).fillna(0.0) - pregnant = self._normal_bool_series( - persons.get("is_pregnant", False), - index=index, - ) - breastfeeding = self._normal_bool_series( - persons.get("is_breastfeeding", False), - index=index, - ) - if "is_female" in persons.columns: - female = self._normal_bool_series(persons["is_female"], index=index) - elif "sex" in persons.columns: - female = ( - pd.to_numeric(persons["sex"], errors="coerce") - .fillna(0) - .astype(int) - .eq(2) - ) - else: - female = pd.Series(False, index=index) - - own_children = pd.to_numeric( - persons.get("own_children_in_household", pd.Series(0, index=index)), - errors="coerce", - ).fillna(0.0) - mother = breastfeeding | (female & own_children.gt(0)) - - group_column = next( - ( - column - for column in ("family_id", "spm_unit_id", "household_id") - if column in persons.columns - ), - None, - ) - if group_column is None: - min_age_group = age - else: - group_keys = persons[group_column].where( - persons[group_column].notna(), - pd.Series(np.arange(len(persons)), index=index), - ) - min_age_group = age.groupby(group_keys, sort=False).transform("min") - - category = np.select( - [ - pregnant.to_numpy(dtype=bool), - ( - mother.to_numpy(dtype=bool) - & breastfeeding.to_numpy(dtype=bool) - & min_age_group.lt(1.0).to_numpy(dtype=bool) - ), - ( - mother.to_numpy(dtype=bool) - & min_age_group.lt(0.5).to_numpy(dtype=bool) - ), - age.lt(1.0).to_numpy(dtype=bool), - age.lt(5.0).to_numpy(dtype=bool), - ], - [ - WIC_TAKEUP_CATEGORY_PREGNANT, - WIC_TAKEUP_CATEGORY_BREASTFEEDING, - WIC_TAKEUP_CATEGORY_POSTPARTUM, - WIC_TAKEUP_CATEGORY_INFANT, - WIC_TAKEUP_CATEGORY_CHILD, - ], - default=WIC_TAKEUP_CATEGORY_NONE, - ) - return 
pd.Series(category, index=index, dtype="string") - - def _person_state_abbreviation(self, persons: pd.DataFrame) -> pd.Series: - if "state" in persons.columns: - state = persons["state"].astype("string").str.upper() - known = set(STATE_FIPS.values()) - return state.where(state.isin(known), "CA").fillna("CA") - if "state_code_str" in persons.columns: - state = persons["state_code_str"].astype("string").str.upper() - known = set(STATE_FIPS.values()) - return state.where(state.isin(known), "CA").fillna("CA") - if "state_fips" in persons.columns: - state_fips = ( - pd.to_numeric(persons["state_fips"], errors="coerce") - .fillna(6) - .astype(int) - ) - return state_fips.map(lambda value: STATE_FIPS.get(int(value), "CA")) - return pd.Series("CA", index=persons.index, dtype="string") - - def _attach_policyengine_spm_takeup_inputs( - self, - spm_units: pd.DataFrame, - ) -> pd.DataFrame: - result = self._attach_policyengine_snap_takeup(spm_units) - return self._attach_policyengine_tanf_takeup(result) - - def _attach_policyengine_tanf_takeup( - self, - spm_units: pd.DataFrame, - ) -> pd.DataFrame: - result = spm_units.copy() - column = "takes_up_tanf_if_eligible" - if column in result.columns: - result[column] = self._normal_bool_series( - result[column], index=result.index - ) - return result - - year = self._policyengine_takeup_year() - rate = _load_microplex_takeup_rate("tanf", year) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < rate - return result - - def _policyengine_takeup_year(self) -> int: - return int( - self.config.policyengine_dataset_year - or self.config.policyengine_target_period - or 2024 - ) - - @staticmethod - def _normal_bool_series(value: Any, *, index: pd.Index) -> pd.Series: - if isinstance(value, pd.Series): - series = value.reindex(index) - else: - series = pd.Series(value, index=index) - return pd.to_numeric(series, errors="coerce").fillna(0.0).ne(0.0).astype(bool) - - def _split_preserved_tax_unit_members( - 
self, - unit_persons: pd.DataFrame, - ) -> tuple[list[int], list[int]]: - relationship = pd.to_numeric( - unit_persons.get("relationship_to_head"), - errors="coerce", - ).fillna(3) - head_mask = relationship.eq(0) - spouse_mask = relationship.eq(1) - dependent_mask = relationship.eq(2) - - filer_ids: list[int] = [] - spouse_pair_ids = self._find_preserved_tax_unit_spouse_pair(unit_persons) - if head_mask.any(): - head_id = int(unit_persons.loc[head_mask, "person_id"].iloc[0]) - filer_ids.append(head_id) - if head_id in spouse_pair_ids: - filer_ids.extend( - [ - int(person_id) - for person_id in spouse_pair_ids - if int(person_id) != head_id - ] - ) - elif ( - spouse_mask.any() and "spouse_person_number" not in unit_persons.columns - ): - filer_ids.append( - int(unit_persons.loc[spouse_mask, "person_id"].iloc[0]) - ) - elif spouse_pair_ids: - pair_rows = unit_persons.loc[ - unit_persons["person_id"].astype(int).isin(spouse_pair_ids) - ].copy() - pair_rows["age"] = pd.to_numeric( - pair_rows.get("age"), errors="coerce" - ).fillna(0.0) - filer_ids.extend( - pair_rows.sort_values(["age", "person_id"], ascending=[False, True])[ - "person_id" - ] - .astype(int) - .tolist()[:2] - ) - elif spouse_mask.any() and "spouse_person_number" not in unit_persons.columns: - filer_ids.append(int(unit_persons.loc[spouse_mask, "person_id"].iloc[0])) - if not filer_ids: - adult_mask = ( - pd.to_numeric( - unit_persons.get("age"), - errors="coerce", - ) - .fillna(0) - .ge(18) - ) - if adult_mask.any(): - filer_ids.append(int(unit_persons.loc[adult_mask, "person_id"].iloc[0])) - else: - filer_ids.append(int(unit_persons.iloc[0]["person_id"])) - - dependent_ids = [ - int(person_id) - for person_id in unit_persons.loc[dependent_mask, "person_id"].tolist() - if int(person_id) not in filer_ids - ] - if not dependent_ids: - dependent_ids = [ - int(person_id) - for person_id in unit_persons["person_id"].tolist() - if int(person_id) not in filer_ids - ] - return filer_ids, dependent_ids - - def 
_find_preserved_tax_unit_spouse_pair( - self, - unit_persons: pd.DataFrame, - ) -> list[int]: - required_columns = {"person_number", "spouse_person_number", "person_id"} - if not required_columns.issubset(unit_persons.columns): - return [] - pairs: set[tuple[int, int]] = set() - by_number = { - int(person_number): { - "person_id": int(person_id), - "spouse_person_number": int(spouse_person_number), - "age": float(age), - } - for person_number, spouse_person_number, person_id, age in unit_persons[ - ["person_number", "spouse_person_number", "person_id", "age"] - ] - .assign( - age=lambda frame: pd.to_numeric(frame["age"], errors="coerce").fillna( - 0.0 - ), - spouse_person_number=lambda frame: pd.to_numeric( - frame["spouse_person_number"], errors="coerce" - ).fillna(0), - person_number=lambda frame: pd.to_numeric( - frame["person_number"], errors="coerce" - ).fillna(0), - ) - .itertuples(index=False, name=None) - } - for person_number, data in by_number.items(): - spouse_number = data["spouse_person_number"] - if spouse_number <= 0: - continue - spouse = by_number.get(spouse_number) - if spouse is None or spouse["spouse_person_number"] != person_number: - continue - pair = tuple(sorted((data["person_id"], spouse["person_id"]))) - pairs.add(pair) - if not pairs: - return [] - if len(pairs) == 1: - return list(next(iter(pairs))) - - head_candidates = unit_persons.loc[ - pd.to_numeric(unit_persons.get("relationship_to_head"), errors="coerce") - .fillna(3) - .eq(0), - "person_id", - ].astype(int) - if not head_candidates.empty: - head_id = int(head_candidates.iloc[0]) - for pair in sorted(pairs): - if head_id in pair: - return list(pair) - best_pair = max( - pairs, - key=lambda pair: sum( - by_number[number]["age"] - for number in by_number - if by_number[number]["person_id"] in pair - ), - ) - return list(best_pair) - - def _infer_preserved_tax_unit_filing_status( - self, - unit_persons: pd.DataFrame, - *, - filer_ids: list[int], - dependent_ids: list[int], - ) -> 
str: - if "filing_status" in unit_persons.columns: - filing_status_values = ( - unit_persons["filing_status"].dropna().astype(str).str.strip() - ) - filing_status_values = filing_status_values[filing_status_values != ""] - if not filing_status_values.empty: - return self._normalize_policyengine_filing_status( - filing_status_values.iloc[0] - ) - - if len(filer_ids) >= 2: - return "JOINT" - - filer_row = unit_persons.loc[unit_persons["person_id"] == filer_ids[0]].iloc[0] - hinted_status = self._infer_single_filer_filing_status( - filer_row, - has_dependents=bool(dependent_ids), - ) - return hinted_status or "SINGLE" - - def _apply_tax_unit_filing_status_hints( - self, - household_persons: pd.DataFrame, - optimized_units: list[dict[str, Any]], - ) -> list[dict[str, Any]]: - if not optimized_units or "person_id" not in household_persons.columns: - return optimized_units - - person_lookup = household_persons.set_index("person_id", drop=False) - updated_units: list[dict[str, Any]] = [] - for unit in optimized_units: - unit_copy = dict(unit) - filer_ids = [int(person_id) for person_id in unit_copy.get("filer_ids", [])] - dependent_ids = [ - int(person_id) for person_id in unit_copy.get("dependent_ids", []) - ] - if len(filer_ids) == 2: - separated_split = self._split_joint_tax_unit_for_separated_filers( - person_lookup, - filer_ids=filer_ids, - dependent_ids=dependent_ids, - ) - if separated_split is not None: - updated_units.extend(separated_split) - continue - if len(filer_ids) != 1: - updated_units.append(unit_copy) - continue - filer_id = filer_ids[0] - if filer_id not in person_lookup.index: - updated_units.append(unit_copy) - continue - filer_row = person_lookup.loc[filer_id] - hinted_status = self._infer_single_filer_filing_status( - filer_row, - has_dependents=bool(dependent_ids), - ) - if hinted_status is not None: - unit_copy["filing_status"] = hinted_status - elif self._normalize_policyengine_filing_status( - unit_copy.get("filing_status", "single") - ) in 
{"HEAD_OF_HOUSEHOLD", "SEPARATE"}: - unit_copy["filing_status"] = "SINGLE" - updated_units.append(unit_copy) - return updated_units - - def _split_joint_tax_unit_for_separated_filers( - self, - person_lookup: pd.DataFrame, - *, - filer_ids: list[int], - dependent_ids: list[int], - ) -> list[dict[str, Any]] | None: - if len(filer_ids) != 2: - return None - if not all(filer_id in person_lookup.index for filer_id in filer_ids): - return None - - filer_rows = person_lookup.loc[filer_ids] - if isinstance(filer_rows, pd.Series): - filer_rows = filer_rows.to_frame().T - separated_mask = filer_rows.apply( - lambda row: self._has_explicit_separation_evidence(row), axis=1 - ) - if not bool( - separated_mask.any() - ) and self._has_marriage_compatible_joint_evidence(filer_rows): - return None - - primary_filer_id = self._select_primary_tax_unit_filer( - filer_rows, - fallback_id=filer_ids[0], - ) - secondary_filer_id = next( - filer_id for filer_id in filer_ids if filer_id != primary_filer_id - ) - split_units: list[dict[str, Any]] = [] - for filer_id, unit_dependent_ids in ( - (primary_filer_id, dependent_ids), - (secondary_filer_id, []), - ): - filer_row = person_lookup.loc[filer_id] - total_income = float( - pd.to_numeric(filer_row.get("income", 0.0), errors="coerce") or 0.0 - ) - if unit_dependent_ids: - dependent_income = pd.to_numeric( - person_lookup.loc[unit_dependent_ids, "income"], - errors="coerce", - ).fillna(0.0) - total_income += float(dependent_income.sum()) - hinted_status = self._infer_single_filer_filing_status( - filer_row, - has_dependents=bool(unit_dependent_ids), - ) - split_units.append( - { - "filer_ids": [int(filer_id)], - "dependent_ids": [ - int(person_id) for person_id in unit_dependent_ids - ], - "n_dependents": int(len(unit_dependent_ids)), - "total_income": total_income, - "tax_liability": 0.0, - "filing_status": hinted_status or "SINGLE", - } - ) - return split_units - - def _has_marriage_compatible_joint_evidence( - self, - filer_rows: 
pd.DataFrame, - ) -> bool: - if "marital_status" not in filer_rows.columns: - return True - marital_status = pd.to_numeric( - pd.Series(filer_rows["marital_status"]), - errors="coerce", - ) - observed = marital_status.dropna().astype(int) - if observed.empty: - return True - # CPS spouse-present statuses are the only strong evidence that a - # spouse-coded pair should survive as one joint PE tax unit. - return bool(observed.isin({1, 2}).all()) - - def _has_explicit_separation_evidence(self, filer_row: pd.Series) -> bool: - if bool(filer_row.get("is_separated", False)): - return True - filing_status_code = self._coerce_policyengine_status_code( - filer_row.get("filing_status_code") - ) - if filing_status_code == 3: - return True - marital_status = self._coerce_policyengine_status_code( - filer_row.get("marital_status") - ) - return marital_status == 6 - - def _select_primary_tax_unit_filer( - self, - filer_rows: pd.DataFrame, - *, - fallback_id: int, - ) -> int: - relationship = pd.to_numeric( - filer_rows.get("relationship_to_head"), - errors="coerce", - ) - if relationship is not None: - head_candidates = filer_rows.loc[relationship.eq(0)] - if not head_candidates.empty: - return int(head_candidates.iloc[0]["person_id"]) - is_head = pd.to_numeric( - filer_rows.get("is_head"), - errors="coerce", - ) - if is_head is not None: - head_candidates = filer_rows.loc[is_head.fillna(0).astype(float) > 0.0] - if not head_candidates.empty: - return int(head_candidates.iloc[0]["person_id"]) - if fallback_id in filer_rows["person_id"].astype(int).tolist(): - return int(fallback_id) - return int(filer_rows.iloc[0]["person_id"]) - - def _infer_single_filer_filing_status( - self, - filer_row: pd.Series, - *, - has_dependents: bool, - ) -> str | None: - filing_status_code = self._coerce_policyengine_status_code( - filer_row.get("filing_status_code") - ) - if filing_status_code == 3: - return "SEPARATE" - if filing_status_code == 4: - return "HEAD_OF_HOUSEHOLD" - if 
filing_status_code == 5: - return "SURVIVING_SPOUSE" - - marital_status = self._coerce_policyengine_status_code( - filer_row.get("marital_status") - ) - if marital_status == 6: - return "SEPARATE" - if marital_status == 4 and has_dependents: - return "SURVIVING_SPOUSE" - return None - - def _coerce_policyengine_status_code(self, value: Any) -> int | None: - numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] - if pd.isna(numeric): - return None - return int(numeric) - - def _assign_family_and_spm_units(self, persons: pd.DataFrame) -> pd.DataFrame: - """Assign family and SPM units, preserving authoritative IDs when present. - - NOT delegated to ``microunit`` in this pass (issue #113). At the pinned - commit ``microunit.units.spm.assign_spm_partition`` is documented as "a - conservative adapter, not yet the full Census-parity constructor" and is - not exported from microunit's public API, and microunit has no - family-unit constructor. The authoritative-ID fast path is preserved - here. TODO(#113): delegate once microunit grows a Census-parity - SPM/family constructor. - """ - result = persons.copy() - preserved_family_ids = self._normalized_complete_existing_group_ids( - result, - "family_id", - ) - # SPM unit ids from the source are trustworthy and must survive synthesis - # even when partially missing (a single missing id must not collapse the - # whole frame to one SPM unit per household). Tax-unit ids, by contrast, - # are reconstructed, not preserved (see _build_policyengine_tax_units). 
- preserved_spm_unit_ids = self._preserve_present_group_ids( - result, - "spm_unit_id", - ) - if preserved_family_ids is not None and preserved_spm_unit_ids is not None: - result["family_id"] = preserved_family_ids - result["spm_unit_id"] = preserved_spm_unit_ids - return result - - family_ids: dict[int, int] = {} - spm_unit_ids: dict[int, int] = {} - next_family_id = 0 - next_spm_unit_id = 0 - - for _, household_persons in result.groupby("household_id", sort=False): - household_spm_id = next_spm_unit_id - next_spm_unit_id += 1 - primary_mask = self._primary_family_member_mask(household_persons) - if primary_mask.any(): - primary_family_id = next_family_id - next_family_id += 1 - else: - primary_family_id = None - - for _, row in household_persons.iterrows(): - spm_unit_ids[int(row.name)] = household_spm_id - if primary_family_id is not None and bool(primary_mask.loc[row.name]): - family_ids[int(row.name)] = primary_family_id - continue - - family_ids[int(row.name)] = next_family_id - next_family_id += 1 - - result["family_id"] = ( - preserved_family_ids - if preserved_family_ids is not None - else result.index.map(family_ids).astype(np.int64) - ) - result["spm_unit_id"] = ( - preserved_spm_unit_ids - if preserved_spm_unit_ids is not None - else result.index.map(spm_unit_ids).astype(np.int64) - ) - return result - - def _primary_family_member_mask( - self, - household_persons: pd.DataFrame, - ) -> pd.Series: - """Identify people who belong to the household's primary family.""" - - relationship_primary = household_persons["relationship_to_head"].isin({0, 1, 2}) - if "family_relationship" not in household_persons.columns: - return relationship_primary - - family_relationship = pd.to_numeric( - household_persons["family_relationship"], - errors="coerce", - ) - # CPS A_FAMREL is a family-membership code: 0 means not in a family; - # positive values are reference person, spouse, child, or other relative. 
- family_member = family_relationship.isin({1, 2, 3, 4}) - return relationship_primary | family_member - - def _assign_marital_units( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - """Assign marital units, preserving authoritative IDs when present. - - NOT delegated to ``microunit`` in this pass (issue #113): microunit does - not construct marital units at the pinned commit (filing status is its - only marital-related output; there is no ``construct_marital_units``). - The authoritative-ID fast path is preserved here. TODO(#113): revisit if - microunit grows marital-unit support. - """ - result = persons.copy() - preserved_marital_unit_ids = self._normalized_complete_existing_group_ids( - result, - "marital_unit_id", - ) - if preserved_marital_unit_ids is not None: - result["marital_unit_id"] = preserved_marital_unit_ids - return result - - marital_unit_by_person: dict[int, int] = {} - next_marital_unit_id = 0 - - for tax_unit_id, unit_persons in result.groupby("tax_unit_id", sort=False): - _ = tax_unit_id - filers = unit_persons[unit_persons["relationship_to_head"].isin({0, 1})] - if len(filers) >= 2: - marital_unit_id = next_marital_unit_id - next_marital_unit_id += 1 - for person_id in filers.head(2)["person_id"].tolist(): - marital_unit_by_person[int(person_id)] = marital_unit_id - elif len(filers) == 1: - marital_unit_by_person[int(filers.iloc[0]["person_id"])] = ( - next_marital_unit_id - ) - next_marital_unit_id += 1 - - for person_id in unit_persons["person_id"].tolist(): - if int(person_id) in marital_unit_by_person: - continue - marital_unit_by_person[int(person_id)] = next_marital_unit_id - next_marital_unit_id += 1 - - result["marital_unit_id"] = ( - result["person_id"].map(marital_unit_by_person).astype(np.int64) - ) - return result - - def _assign_policyengine_household_head_flag( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - result = persons.copy() - derived = ( - pd.to_numeric(result["relationship_to_head"], errors="coerce") - 
.fillna(-1) - .eq(0) - ) - if "is_household_head" not in result.columns: - result["is_household_head"] = derived - return result - - existing = pd.to_numeric(result["is_household_head"], errors="coerce") - result["is_household_head"] = existing.where(existing.notna(), derived).gt(0.5) - return result - - def _normalized_complete_existing_group_ids( - self, - persons: pd.DataFrame, - id_column: str, - ) -> pd.Series | None: - if id_column not in persons.columns: - return None - raw_ids = persons[id_column] - if raw_ids.isna().any(): - return None - - raw_key = raw_ids.astype("string") - key = pd.DataFrame( - { - "household_id": persons["household_id"], - id_column: raw_key, - }, - index=persons.index, - ) - raw_numeric = pd.to_numeric(raw_ids, errors="coerce") - households_per_raw_id = key.groupby(id_column, dropna=False)[ - "household_id" - ].nunique() - must_factorize = raw_numeric.isna().any() or bool( - households_per_raw_id.gt(1).any() - ) - if must_factorize: - return pd.Series( - pd.factorize(pd.MultiIndex.from_frame(key), sort=False)[0].astype( - np.int64 - ), - index=persons.index, - name=id_column, - ) - return raw_numeric.astype(np.int64).rename(id_column) - - def _preserve_present_group_ids( - self, - persons: pd.DataFrame, - id_column: str, - ) -> pd.Series | None: - """Preserve existing per-person unit ids where present, regenerating only - the rows that are missing one. - - Unlike :meth:`_normalized_complete_existing_group_ids` (which discards the - whole column if *any* id is missing), this keeps the authoritative - grouping for every row that carries an id and collapses rows with a - missing id into a single per-household fallback unit. Used for SPM units, - whose source ids are trustworthy and should survive synthesis even when - partially missing (otherwise a single missing id drops the whole frame to - one SPM unit per household). Returns ``None`` only when the column is - absent or entirely empty. 
- """ - if id_column not in persons.columns: - return None - raw_ids = persons[id_column] - present = raw_ids.notna() - if not present.any(): - return None - hh = persons["household_id"] - codes = pd.Series(-1, index=persons.index, dtype=np.int64) - # Present rows: stable unit code from factorizing (household_id, real id). - present_key = pd.MultiIndex.from_frame( - pd.DataFrame({"hh": hh[present], "id": raw_ids[present].astype("string")}) - ) - codes.loc[present] = pd.factorize(present_key, sort=False)[0] - if (~present).any(): - # Missing rows fold into their household's first present unit so they - # never fabricate a spurious unit; households with no present id at - # all get one fresh fallback unit each. - first_present = codes[present].groupby(hh[present]).first() - miss_hh = hh[~present] - fallback = miss_hh.map(first_present) - no_present = fallback.isna() - if no_present.any(): - fresh = pd.factorize(miss_hh[no_present], sort=False)[0] - fallback.loc[no_present] = fresh + (int(codes.max()) + 1) - codes.loc[~present] = fallback.astype(np.int64).to_numpy() - return codes.rename(id_column) - - def _collapse_group_table( - self, - persons: pd.DataFrame, - id_column: str, - ) -> pd.DataFrame: - return ( - persons.groupby(id_column, as_index=False) - .agg({"household_id": "first"}) - .astype({id_column: np.int64, "household_id": np.int64}) - ) - - def _attach_spm_unit_source_columns( - self, - persons: pd.DataFrame, - spm_units: pd.DataFrame, - ) -> pd.DataFrame: - """Attach observed SPM-unit inputs carried on CPS person rows.""" - if "spm_unit_id" not in persons.columns: - return self._attach_policyengine_spm_takeup_inputs(spm_units) - - aggregation_by_column = { - "receives_housing_assistance": "max", - "takes_up_housing_assistance_if_eligible": "max", - "takes_up_snap_if_eligible": "max", - "takes_up_tanf_if_eligible": "max", - "spm_unit_energy_subsidy": "first", - "spm_unit_capped_housing_subsidy_reported": "first", - 
"spm_unit_capped_work_childcare_expenses": "first", - "spm_unit_pre_subsidy_childcare_expenses": "first", - } - aggregations = { - column: aggregation - for column, aggregation in aggregation_by_column.items() - if column in persons.columns and column not in spm_units.columns - } - if not aggregations: - return self._attach_policyengine_spm_takeup_inputs(spm_units) - - source_values = persons.groupby("spm_unit_id", as_index=False).agg(aggregations) - merged = spm_units.merge(source_values, on="spm_unit_id", how="left") - return self._attach_policyengine_spm_takeup_inputs(merged) - - def _attach_policyengine_snap_takeup( - self, - spm_units: pd.DataFrame, - ) -> pd.DataFrame: - """Attach eCPS-style SNAP take-up input before PE materialization.""" - result = spm_units.copy() - column = "takes_up_snap_if_eligible" - if column in result.columns: - result[column] = ( - pd.to_numeric(result[column], errors="coerce") - .fillna(0.0) - .ne(0.0) - .astype(bool) - ) - return result - - year = int( - self.config.policyengine_dataset_year - or self.config.policyengine_target_period - or 2024 - ) - rate = _load_microplex_takeup_rate("snap", year) - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < rate - return result - - def _normalize_relationship_to_head(self, persons: pd.DataFrame) -> pd.Series: - family_normalized: pd.Series | None = None - if "family_relationship" in persons.columns: - family_relationship = ( - pd.to_numeric(persons["family_relationship"], errors="coerce") - .fillna(-1) - .astype(int) - ) - unique_values = set(family_relationship.unique().tolist()) - if unique_values.issubset({0, 1, 2, 3, 4}): - family_normalized = pd.Series(3, index=persons.index, dtype=int) - household_groups = ( - persons.groupby("household_id", sort=False).groups.values() - if "household_id" in persons.columns - else [persons.index] - ) - for member_index in household_groups: - member_index = list(member_index) - household_codes = set( - 
family_relationship.loc[member_index].tolist() - ) - if 0 in household_codes: - # Some sources already use the optimizer's 0-based coding. - mapped = family_relationship.loc[member_index].map( - {0: 0, 1: 1, 2: 2, 3: 3, 4: 3} - ) - else: - # CPS A_FAMREL is 1-based: 1=head, 2=spouse, 3=child, 4=other. - mapped = family_relationship.loc[member_index].map( - {1: 0, 2: 1, 3: 2, 4: 3} - ) - family_normalized.loc[member_index] = mapped.fillna(3).astype(int) - - if "relationship_to_head" not in persons.columns: - if family_normalized is not None: - return self._repair_relationship_to_head(persons, family_normalized) - if "is_spouse" in persons.columns or "is_dependent" in persons.columns: - order = persons.groupby("household_id").cumcount() - normalized = pd.Series(3, index=persons.index, dtype=int) - normalized.loc[order == 0] = 0 - if "is_spouse" in persons.columns: - spouse_mask = ( - pd.to_numeric(persons["is_spouse"], errors="coerce") - .fillna(0) - .astype(int) - > 0 - ) - normalized.loc[spouse_mask] = 1 - if "is_dependent" in persons.columns: - dependent_mask = ( - pd.to_numeric(persons["is_dependent"], errors="coerce") - .fillna(0) - .astype(int) - > 0 - ) - normalized.loc[dependent_mask & ~normalized.eq(1)] = 2 - return self._repair_relationship_to_head(persons, normalized) - order = persons.groupby("household_id").cumcount() - normalized = order.map(lambda idx: 0 if idx == 0 else 3).astype(int) - return self._repair_relationship_to_head(persons, normalized) - - relationship = ( - pd.to_numeric(persons["relationship_to_head"], errors="coerce") - .fillna(-1) - .astype(int) - ) - unique_values = set(relationship.unique().tolist()) - if unique_values.issubset({0, 1, 2, 3}): - if family_normalized is not None: - relationship_detail = set(relationship.unique().tolist()) & {1, 2} - family_detail = set(family_normalized.unique().tolist()) & {1, 2} - if len(family_detail) > len(relationship_detail): - return self._repair_relationship_to_head(persons, family_normalized) 
- return self._repair_relationship_to_head(persons, relationship) - - if unique_values.issubset({1, 2, 3, 4}): - normalized = ( - relationship.map({1: 0, 2: 1, 3: 3, 4: 2}).fillna(3).astype(int) - ) - return self._repair_relationship_to_head(persons, normalized) - - order = persons.groupby("household_id").cumcount() - normalized = pd.Series(3, index=persons.index, dtype=int) - normalized.loc[order == 0] = 0 - normalized.loc[(order == 1) & (persons["age"] >= 18)] = 1 - normalized.loc[persons["age"] < 18] = 2 - return self._repair_relationship_to_head(persons, normalized) - - def _repair_relationship_to_head( - self, - persons: pd.DataFrame, - relationship: pd.Series, - ) -> pd.Series: - """Repair household relationship patterns so tax-unit construction has one clear head.""" - normalized = relationship.astype(int).copy() - if "household_id" not in persons.columns: - return normalized - - ages = pd.to_numeric(persons.get("age", 0), errors="coerce").fillna(0.0) - grouped = persons.groupby("household_id", sort=False).groups - for member_index in grouped.values(): - member_index = list(member_index) - household_relationship = normalized.loc[member_index].copy() - household_ages = ages.loc[member_index] - - head_index = household_relationship[ - household_relationship.eq(0) - ].index.tolist() - if not head_index: - spouse_candidates = [ - index - for index in household_relationship[ - household_relationship.eq(1) - ].index.tolist() - if household_ages.loc[index] >= 18 - ] - adult_candidates = [ - index - for index in household_relationship.index.tolist() - if household_ages.loc[index] >= 18 - ] - candidate_pool = ( - spouse_candidates - or adult_candidates - or household_relationship.index.tolist() - ) - head_choice = max( - candidate_pool, key=lambda index: household_ages.loc[index] - ) - normalized.loc[head_choice] = 0 - head_index = [head_choice] - elif len(head_index) > 1: - keep_head = max(head_index, key=lambda index: household_ages.loc[index]) - for index in 
head_index: - if index == keep_head: - continue - normalized.loc[index] = 3 if household_ages.loc[index] >= 19 else 2 - head_index = [keep_head] - - spouse_index = normalized.loc[member_index][ - normalized.loc[member_index].eq(1) - ].index.tolist() - if len(spouse_index) > 1: - keep_spouse = max( - spouse_index, key=lambda index: household_ages.loc[index] - ) - for index in spouse_index: - if index == keep_spouse: - continue - normalized.loc[index] = 3 if household_ages.loc[index] >= 19 else 2 - - return normalized.astype(int) - - def _infer_policyengine_variable_bindings( - self, - tables: PolicyEngineUSEntityTableBundle, - ) -> dict[str, PolicyEngineUSVariableBinding]: - return infer_policyengine_us_variable_bindings(tables) - - def _filter_supported_policyengine_targets( - self, - targets: list[TargetSpec], - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - ) -> list[TargetSpec]: - return filter_supported_policyengine_us_targets(targets, tables, bindings) - - def _policyengine_variables_to_materialize( - self, - targets: list[TargetSpec], - bindings: dict[str, PolicyEngineUSVariableBinding], - ) -> set[str]: - return policyengine_us_variables_to_materialize(targets, bindings) - - def _has_policyengine_entity_table( - self, - entity: EntityType, - tables: PolicyEngineUSEntityTableBundle, - ) -> bool: - entity_tables = { - EntityType.HOUSEHOLD: tables.households, - EntityType.PERSON: tables.persons, - EntityType.TAX_UNIT: tables.tax_units, - EntityType.SPM_UNIT: tables.spm_units, - EntityType.FAMILY: tables.families, - } - table = entity_tables.get(entity) - return table is not None - - def _normalize_policyengine_filing_status(self, value: Any) -> str: - normalized = str(value).strip().lower() - mapping = { - "single": "SINGLE", - "married_joint": "JOINT", - "married_filing_jointly": "JOINT", - "joint": "JOINT", - "married_filing_separately": "SEPARATE", - "separate": "SEPARATE", - "head_of_household": 
"HEAD_OF_HOUSEHOLD", - "widow": "SURVIVING_SPOUSE", - "qualifying_widow": "SURVIVING_SPOUSE", - "surviving_spouse": "SURVIVING_SPOUSE", - } - return mapping.get(normalized, "SINGLE") - - def _augment_policyengine_person_inputs( - self, - persons: pd.DataFrame, - ) -> pd.DataFrame: - result = normalize_social_security_columns(normalize_dividend_columns(persons)) - zero = pd.Series(0.0, index=result.index, dtype=float) - - def first_present(*columns: str) -> pd.Series: - for column in columns: - if column in result.columns: - return ( - pd.to_numeric( - result[column], - errors="coerce", - ) - .fillna(0.0) - .astype(float) - ) - return zero.copy() - - def first_nonzero_or_present(*columns: str) -> pd.Series: - values = zero.copy() - found = False - for column in columns: - if column not in result.columns: - continue - candidate = ( - pd.to_numeric( - result[column], - errors="coerce", - ) - .fillna(0.0) - .astype(float) - ) - if not found: - values = candidate.copy() - found = True - continue - values = values.where(values.ne(0.0), candidate) - return values if found else zero.copy() - - def has_any(*columns: str) -> bool: - return any(column in result.columns for column in columns) - - def signed_rental_income() -> pd.Series: - if has_any("rental_income_positive", "rental_income_negative"): - return first_present("rental_income_positive") - first_present( - "rental_income_negative" - ) - return first_present("rental_income") - - def first_signed_or_present(*columns: str) -> pd.Series: - candidates: list[pd.Series] = [] - for column in columns: - if column not in result.columns: - continue - candidates.append(first_present(column)) - if not candidates: - return zero.copy() - signed = next( - (candidate for candidate in candidates if candidate.lt(0.0).any()), - candidates[0], - ) - values = signed.copy() - for candidate in candidates: - if candidate is signed: - continue - values = values.where(values.ne(0.0), candidate) - return values - - 
signed_self_employment_income = first_signed_or_present( - "self_employment_income_before_lsr", - "self_employment_income", - ) - tax_exempt_interest_income = first_present("tax_exempt_interest_income") - explicit_taxable_interest_income = first_present("taxable_interest_income") - taxable_interest_income = explicit_taxable_interest_income.where( - explicit_taxable_interest_income.ne(0.0) - | tax_exempt_interest_income.ne(0.0), - first_present("interest_income"), - ) - - if "is_female" in result.columns: - result["is_female"] = result["is_female"].fillna(False).astype(bool) - elif "sex" in result.columns: - sex = pd.to_numeric(result["sex"], errors="coerce").fillna(0).astype(int) - result["is_female"] = sex.eq(2) - - if "cps_race" in result.columns: - result["cps_race"] = ( - pd.to_numeric(result["cps_race"], errors="coerce").fillna(0).astype(int) - ) - elif "race" in result.columns: - result["cps_race"] = ( - pd.to_numeric(result["race"], errors="coerce").fillna(0).astype(int) - ) - - if "is_hispanic" in result.columns: - result["is_hispanic"] = result["is_hispanic"].fillna(False).astype(bool) - elif "hispanic" in result.columns: - hispanic = pd.to_numeric(result["hispanic"], errors="coerce") - observed_codes = set(hispanic.dropna().astype(int).unique().tolist()) - if observed_codes and observed_codes <= {1, 2}: - result["is_hispanic"] = hispanic.fillna(0).astype(int).eq(1) - else: - result["is_hispanic"] = hispanic.fillna(0).astype(int).ne(0) - - if has_any("pre_subsidy_rent", "rent"): - result["pre_subsidy_rent"] = first_nonzero_or_present( - "pre_subsidy_rent", - "rent", - ).clip(lower=0.0) - if has_any( - "weekly_hours_worked_before_lsr", - "hours_worked_last_week", - "hours_worked", - ): - result["weekly_hours_worked_before_lsr"] = first_nonzero_or_present( - "weekly_hours_worked_before_lsr", - "hours_worked_last_week", - "hours_worked", - ).clip(lower=0.0) - - retirement_desired_columns = { - "self_employed_pension_contributions": ( - 
"self_employed_pension_contributions_desired" - ), - "traditional_401k_contributions": "traditional_401k_contributions_desired", - "roth_401k_contributions": "roth_401k_contributions_desired", - "traditional_ira_contributions": "traditional_ira_contributions_desired", - "roth_ira_contributions": "roth_ira_contributions_desired", - } - if all( - column in result.columns for column in retirement_desired_columns.values() - ): - limit_year = max( - min( - self.config.policyengine_dataset_year or 2024, - max(RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR), - ), - min(RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR), - ) - limits = RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR[limit_year] - age = first_present("age") - catch_up_eligible = age.ge(RETIREMENT_CATCH_UP_AGE) - limit_401k = pd.Series( - float(limits["401k"]), - index=result.index, - dtype=float, - ) + catch_up_eligible.astype(float) * float(limits["401k_catch_up"]) - limit_ira = pd.Series( - float(limits["ira"]), - index=result.index, - dtype=float, - ) + catch_up_eligible.astype(float) * float(limits["ira_catch_up"]) - - def capped_at(values: pd.Series, caps: pd.Series) -> pd.Series: - return pd.Series( - np.minimum(values.to_numpy(), caps.to_numpy()), - index=result.index, - dtype=float, - ) - - self_employed_pension = first_present( - "self_employed_pension_contributions_desired" - ).clip(lower=0.0) - traditional_401k = capped_at( - first_present("traditional_401k_contributions_desired").clip(lower=0.0), - limit_401k, - ) - roth_401k = capped_at( - first_present("roth_401k_contributions_desired").clip(lower=0.0), - (limit_401k - traditional_401k).clip(lower=0.0), - ) - traditional_ira = capped_at( - first_present("traditional_ira_contributions_desired").clip(lower=0.0), - limit_ira, - ) - roth_ira = capped_at( - first_present("roth_ira_contributions_desired").clip(lower=0.0), - (limit_ira - traditional_ira).clip(lower=0.0), - ) - result["self_employed_pension_contributions"] = self_employed_pension - 
result["traditional_401k_contributions"] = traditional_401k - result["roth_401k_contributions"] = roth_401k - result["traditional_ira_contributions"] = traditional_ira - result["roth_ira_contributions"] = roth_ira - - marital_status = ( - pd.to_numeric(result["marital_status"], errors="coerce") - if "marital_status" in result.columns - else None - ) - filing_status_code = ( - pd.to_numeric(result["filing_status_code"], errors="coerce") - if "filing_status_code" in result.columns - else None - ) - filing_status_text = ( - result["filing_status"].astype(str).str.strip().str.upper() - if "filing_status" in result.columns - else None - ) - - if "is_separated" in result.columns: - result["is_separated"] = result["is_separated"].fillna(False).astype(bool) - elif marital_status is not None: - result["is_separated"] = marital_status.fillna(0).astype(int).eq(6) - elif filing_status_code is not None: - result["is_separated"] = filing_status_code.fillna(0).astype(int).eq(3) - elif filing_status_text is not None: - result["is_separated"] = filing_status_text.eq("SEPARATE") - - if "is_surviving_spouse" in result.columns: - result["is_surviving_spouse"] = ( - result["is_surviving_spouse"].fillna(False).astype(bool) - ) - elif marital_status is not None: - result["is_surviving_spouse"] = marital_status.fillna(0).astype(int).eq(4) - elif filing_status_code is not None: - result["is_surviving_spouse"] = ( - filing_status_code.fillna(0).astype(int).eq(5) - ) - elif filing_status_text is not None: - result["is_surviving_spouse"] = filing_status_text.eq("SURVIVING_SPOUSE") - - if "medicaid" in result.columns: - result["medicaid"] = ( - pd.to_numeric(result["medicaid"], errors="coerce") - .fillna(0.0) - .astype(float) - ) - if "medicaid_enrolled" in result.columns: - result["medicaid_enrolled"] = ( - result["medicaid_enrolled"].fillna(False).astype(bool) - ) - if "has_medicare" in result.columns: - result["has_medicare"] = ( - pd.to_numeric(result["has_medicare"], errors="coerce") - 
.fillna(0.0) - .astype(float) - .ne(0.0) - ) - if "is_blind" in result.columns: - result["is_blind"] = ( - pd.to_numeric(result["is_blind"], errors="coerce").fillna(0.0).ne(0.0) - ) - elif "difficulty_seeing" in result.columns: - result["is_blind"] = first_present("difficulty_seeing").gt(0.0) - if "medicare_part_b_premiums" in result.columns: - medicare_part_b_premiums = ( - pd.to_numeric( - result["medicare_part_b_premiums"], - errors="coerce", - ) - .fillna(0.0) - .clip(lower=0.0) - .astype(float) - ) - if "has_medicare" in result.columns: - medicare_part_b_premiums = medicare_part_b_premiums.where( - result["has_medicare"], - 0.0, - ) - result["medicare_part_b_premiums"] = medicare_part_b_premiums - - if "takes_up_ssi_if_eligible" in result.columns: - result["takes_up_ssi_if_eligible"] = ( - pd.to_numeric( - result["takes_up_ssi_if_eligible"], - errors="coerce", - ) - .fillna(0.0) - .ne(0.0) - ) - elif "ssi_reported" in result.columns: - result["takes_up_ssi_if_eligible"] = first_present("ssi_reported").gt(0.0) - elif "ssi" in result.columns: - result["takes_up_ssi_if_eligible"] = first_present("ssi").gt(0.0) - - known_nonemployment = ( - signed_self_employment_income - + first_nonzero_or_present("taxable_interest_income", "interest_income") - + first_nonzero_or_present("ordinary_dividend_income", "dividend_income") - + signed_rental_income() - + first_present("gross_social_security", "social_security") - + first_present("ssi") - + first_present("public_assistance") - + first_nonzero_or_present("taxable_pension_income", "pension_income") - + first_present("unemployment_compensation") - ) - fallback_employment_income = ( - pd.to_numeric(result.get("income", zero), errors="coerce") - .fillna(0.0) - .astype(float) - - known_nonemployment - ).clip(lower=0.0) - - if "employment_income_before_lsr" in result.columns: - result["employment_income_before_lsr"] = ( - first_nonzero_or_present( - "employment_income_before_lsr", - "wage_income", - ) - if "employment_income" 
not in result.columns - else first_present("employment_income_before_lsr") - ) - elif "employment_income" in result.columns: - result["employment_income_before_lsr"] = first_present("employment_income") - elif "wage_income" in result.columns: - result["employment_income_before_lsr"] = first_present("wage_income") - else: - result["employment_income_before_lsr"] = fallback_employment_income - result["self_employment_income_before_lsr"] = signed_self_employment_income - result["taxable_interest_income"] = taxable_interest_income - result["tax_exempt_interest_income"] = tax_exempt_interest_income - result["qualified_dividend_income"] = first_present( - "qualified_dividend_income", - ).clip(lower=0.0) - result["non_qualified_dividend_income"] = first_present( - "non_qualified_dividend_income", - ).clip(lower=0.0) - dividend_alias = first_nonzero_or_present( - "ordinary_dividend_income", - "dividend_income", - ).clip(lower=0.0) - result["ordinary_dividend_income"] = dividend_alias - if has_any("qualified_dividend_income", "non_qualified_dividend_income"): - dividend_total = ( - result["qualified_dividend_income"] - + result["non_qualified_dividend_income"] - ).clip(lower=0.0) - result["ordinary_dividend_income"] = dividend_total.where( - dividend_total.ne(0.0), - dividend_alias, - ) - result["dividend_income"] = result["ordinary_dividend_income"] - else: - result = normalize_dividend_columns(result) - - result["short_term_capital_gains"] = first_present("short_term_capital_gains") - result["non_sch_d_capital_gains"] = first_present( - "non_sch_d_capital_gains", - "capital_gains_distributions", - ) - result["long_term_capital_gains_before_response"] = ( - first_nonzero_or_present( - "long_term_capital_gains_before_response", - "long_term_capital_gains", - "capital_gains", - ) - if has_any( - "long_term_capital_gains_before_response", - "long_term_capital_gains", - "capital_gains", - ) - else zero.copy() - ) - result["partnership_s_corp_income"] = 
first_present("partnership_s_corp_income") - result["partnership_se_income"] = first_present("partnership_se_income") - result["estate_income"] = first_present("estate_income") - result["farm_income"] = first_signed_or_present( - "farm_income", - "farm_operations_income", - ) - result["farm_operations_income"] = first_present("farm_operations_income") - result["farm_rent_income"] = first_present("farm_rent_income") - result["rental_income"] = signed_rental_income() - result["w2_wages_from_qualified_business"] = first_present( - "w2_wages_from_qualified_business" - ).clip(lower=0.0) - result["unadjusted_basis_qualified_property"] = first_present( - "unadjusted_basis_qualified_property" - ).clip(lower=0.0) - result["qualified_reit_and_ptp_income"] = first_present( - "qualified_reit_and_ptp_income" - ).clip(lower=0.0) - result["qualified_bdc_income"] = first_present("qualified_bdc_income").clip( - lower=0.0 - ) - result["sstb_self_employment_income_before_lsr"] = first_nonzero_or_present( - "sstb_self_employment_income_before_lsr", - "sstb_self_employment_income", - ) - result["sstb_w2_wages_from_qualified_business"] = first_present( - "sstb_w2_wages_from_qualified_business" - ).clip(lower=0.0) - result["sstb_unadjusted_basis_qualified_property"] = first_present( - "sstb_unadjusted_basis_qualified_property" - ).clip(lower=0.0) - for qbi_bool_column in ( - "business_is_sstb", - "self_employment_income_would_be_qualified", - "sstb_self_employment_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", - "estate_income_would_be_qualified", - ): - if qbi_bool_column in result.columns: - result[qbi_bool_column] = self._normal_bool_series( - result[qbi_bool_column], - index=result.index, - ) - result["health_savings_account_ald"] = first_present( - "health_savings_account_ald" - ) - result["self_employed_health_insurance_ald"] = 
first_present( - "self_employed_health_insurance_ald" - ) - result["self_employed_pension_contribution_ald"] = first_present( - "self_employed_pension_contribution_ald" - ) - result["taxable_private_pension_income"] = first_present( - "taxable_private_pension_income", - "taxable_pension_income", - "pension_income", - ) - result["taxable_public_pension_income"] = first_present( - "taxable_public_pension_income" - ) - result["tax_exempt_private_pension_income"] = first_present( - "tax_exempt_private_pension_income", - "tax_exempt_pension_income", - ) - result["tax_exempt_public_pension_income"] = first_present( - "tax_exempt_public_pension_income" - ) - result["social_security_retirement"] = ( - social_security_retirement_compatible_amount(result) - ) - result["social_security_disability"] = first_present( - "social_security_disability" - ) - result["social_security_survivors"] = first_present("social_security_survivors") - result["social_security_dependents"] = first_present( - "social_security_dependents" - ) - result["unemployment_compensation"] = first_present("unemployment_compensation") - result["state_income_tax_reported"] = first_present( - "state_income_tax_reported", - "state_income_tax_paid", - ) - result["student_loan_interest"] = first_present("student_loan_interest") - result["home_mortgage_interest"] = first_nonzero_or_present( - "home_mortgage_interest", - "deductible_mortgage_interest", - "mortgage_interest_paid", - ).clip(lower=0.0) - result["investment_interest_expense"] = first_nonzero_or_present( - "investment_interest_expense", - "investment_income_elected_form_4952", - ).clip(lower=0.0) - result["other_health_insurance_premiums"] = first_nonzero_or_present( - "other_health_insurance_premiums", - "health_insurance_premiums_without_medicare_part_b", - ).clip(lower=0.0) - return result - - def _resolve_policyengine_tax_benefit_system(self) -> Any: - simulation_cls = self.config.policyengine_simulation_cls - if simulation_cls is None: - import 
policyengine_us - - return getattr(policyengine_us.system, "system", policyengine_us.system) - - tax_benefit_system = getattr(simulation_cls, "tax_benefit_system", None) - if tax_benefit_system is None: - tax_benefit_system = getattr(simulation_cls, "system", None) - if tax_benefit_system is not None: - return getattr(tax_benefit_system, "system", tax_benefit_system) - raise ValueError( - "policyengine_simulation_cls must expose a tax_benefit_system or system attribute" - ) - - -def build_us_microplex( - persons: pd.DataFrame, - households: pd.DataFrame, - config: USMicroplexBuildConfig | None = None, -) -> USMicroplexBuildResult: - """Convenience wrapper for the US microplex pipeline.""" - pipeline = USMicroplexPipeline(config) - return pipeline.build(persons, households) - - -@dataclass -class USMicroplexRecalibrateResult: - """Output of ``recalibrate_policyengine_us_from_checkpoint``. - - Narrower than ``USMicroplexBuildResult`` because synthesis state is - unavailable when resuming: no ``seed_data``, no ``synthesizer``, no - source frames. Only calibration output is populated. - """ - - config: USMicroplexBuildConfig - loaded_stage: str - checkpoint_path: Path - policyengine_tables: PolicyEngineUSEntityTableBundle - calibrated_data: pd.DataFrame - calibration_summary: dict[str, Any] - - -def recalibrate_policyengine_us_from_checkpoint( - config: USMicroplexBuildConfig, - checkpoint_path: str | Path, -) -> USMicroplexRecalibrateResult: - """Load a saved pipeline checkpoint and rerun calibration against it. - - Use for fast iteration on calibration config (backend, lambda - schedule, targets) without paying the ~11 h synthesis + donor - imputation cost that produced the bundle. 
Both - ``post_imputation`` and ``post_microsim`` checkpoints are - supported: the latter skips microsim too because - ``infer_policyengine_us_variable_bindings`` picks up the - materialized target vars as columns on the bundle, so - ``policyengine_us_variables_to_materialize`` returns an empty set - and ``_resolve_policyengine_calibration_targets`` short-circuits - past the materialization call. - """ - checkpoint_path = Path(checkpoint_path) - bundle, metadata = load_us_pipeline_checkpoint(checkpoint_path) - stage = metadata.get("stage") - if stage not in {"post_imputation", "post_microsim"}: - raise ValueError( - f"Cannot resume from checkpoint stage {stage!r}; expected " - "'post_imputation' or 'post_microsim'." - ) - - pipeline = USMicroplexPipeline(config) - policyengine_tables, calibrated_data, calibration_summary = ( - pipeline.calibrate_policyengine_tables(bundle) - ) - return USMicroplexRecalibrateResult( - config=config, - loaded_stage=stage, - checkpoint_path=checkpoint_path, - policyengine_tables=policyengine_tables, - calibrated_data=calibrated_data, - calibration_summary=calibration_summary, - ) diff --git a/src/microplex_us/pipelines/version_benchmark.py b/src/microplex_us/pipelines/version_benchmark.py deleted file mode 100644 index bde20b94..00000000 --- a/src/microplex_us/pipelines/version_benchmark.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Canonical US version-bump benchmark entrypoint.""" - -from __future__ import annotations - -import argparse -from dataclasses import dataclass -from pathlib import Path - -try: - from microplex_us.data_sources.cps import CPSASECParquetSourceProvider - from microplex_us.data_sources.psid import PSIDSourceProvider - from microplex_us.data_sources.puf import PUFSourceProvider - from microplex_us.pipelines.artifacts import ( - build_and_save_versioned_us_microplex_from_source_providers, - ) - from microplex_us.pipelines.us import USMicroplexBuildConfig -except ImportError: - CPSASECParquetSourceProvider = None - 
PSIDSourceProvider = None - PUFSourceProvider = None - build_and_save_versioned_us_microplex_from_source_providers = None - - @dataclass(frozen=True) - class USMicroplexBuildConfig: - """Lightweight import-time fallback for CLI argument tests.""" - - n_synthetic: int - random_seed: int - policyengine_baseline_dataset: str - policyengine_targets_db: str - policyengine_dataset_year: int - policyengine_target_period: int - policyengine_target_variables: tuple[str, ...] - policyengine_target_domains: tuple[str, ...] - policyengine_target_geo_levels: tuple[str, ...] - policyengine_target_profile: str - policyengine_calibration_target_profile: str - -from microplex_us.pipelines.site_snapshot import write_us_microplex_site_snapshot - - -def _resolve_site_snapshot_path( - *, - output_root: str | Path, - site_snapshot_path: str | Path | None, -) -> Path: - if site_snapshot_path is not None: - return Path(site_snapshot_path) - output_root_path = Path(output_root).resolve() - if output_root_path.name == "artifacts": - return output_root_path / "site_snapshot_us.json" - return output_root_path.parent / "site_snapshot_us.json" - - -def main(argv: list[str] | None = None) -> None: - if CPSASECParquetSourceProvider is None: - raise RuntimeError( - "version benchmark source providers are unavailable with this core build" - ) - if build_and_save_versioned_us_microplex_from_source_providers is None: - raise RuntimeError( - "version benchmark artifact builder is unavailable with this core build" - ) - - parser = argparse.ArgumentParser( - description="Run the canonical US version-bump benchmark build." 
- ) - parser.add_argument("--output-root", required=True) - parser.add_argument("--cps-parquet-dir", required=True) - parser.add_argument("--baseline-dataset", required=True) - parser.add_argument("--targets-db", required=True) - parser.add_argument("--policyengine-us-data-repo") - parser.add_argument("--version-id") - parser.add_argument("--site-snapshot-path") - parser.add_argument("--target-period", type=int, default=2024) - parser.add_argument("--n-synthetic", type=int, default=100_000) - parser.add_argument("--random-seed", type=int, default=42) - parser.add_argument("--target-profile", default="pe_native_broad") - parser.add_argument("--calibration-target-profile") - parser.add_argument("--puf-path") - parser.add_argument("--puf-demographics-path") - parser.add_argument("--psid-data-dir") - parser.add_argument("--target-variable", action="append", default=[]) - parser.add_argument("--target-domain", action="append", default=[]) - parser.add_argument("--target-geo-level", action="append", default=[]) - parser.add_argument( - "--require-beat-pe-native-loss", - action="store_true", - help="Fail if the saved run does not beat the PE baseline on PE's native enhanced-CPS loss.", - ) - args = parser.parse_args(argv) - - providers = [CPSASECParquetSourceProvider(data_dir=args.cps_parquet_dir)] - if args.puf_path is not None: - providers.append( - PUFSourceProvider( - puf_path=args.puf_path, - demographics_path=args.puf_demographics_path, - target_year=args.target_period, - ) - ) - if args.psid_data_dir is not None: - providers.append(PSIDSourceProvider(data_dir=args.psid_data_dir)) - - config = USMicroplexBuildConfig( - n_synthetic=args.n_synthetic, - random_seed=args.random_seed, - policyengine_baseline_dataset=args.baseline_dataset, - policyengine_targets_db=args.targets_db, - policyengine_dataset_year=args.target_period, - policyengine_target_period=args.target_period, - policyengine_target_variables=tuple(args.target_variable), - 
policyengine_target_domains=tuple(args.target_domain), - policyengine_target_geo_levels=tuple(args.target_geo_level), - policyengine_target_profile=args.target_profile, - policyengine_calibration_target_profile=( - args.calibration_target_profile or args.target_profile - ), - ) - - artifacts = build_and_save_versioned_us_microplex_from_source_providers( - providers=providers, - output_root=args.output_root, - config=config, - version_id=args.version_id, - frontier_metric="enhanced_cps_native_loss_delta", - policyengine_us_data_repo=args.policyengine_us_data_repo, - require_policyengine_native_score=True, - ) - - native_delta = ( - artifacts.current_entry.enhanced_cps_native_loss_delta - if artifacts.current_entry is not None - else None - ) - candidate_native_loss = ( - artifacts.current_entry.candidate_enhanced_cps_native_loss - if artifacts.current_entry is not None - else None - ) - baseline_native_loss = ( - artifacts.current_entry.baseline_enhanced_cps_native_loss - if artifacts.current_entry is not None - else None - ) - if native_delta is None: - raise SystemExit( - "Saved US benchmark artifact is missing PE-native enhanced-CPS loss delta." 
- ) - if args.require_beat_pe_native_loss and native_delta >= 0.0: - raise SystemExit( - "US version-bump benchmark did not beat PE on PE-native enhanced-CPS loss: " - f"candidate={candidate_native_loss:.6f}, " - f"baseline={baseline_native_loss:.6f}, " - f"delta={native_delta:.6f}" - ) - - write_us_microplex_site_snapshot( - artifacts.artifact_paths.output_dir, - _resolve_site_snapshot_path( - output_root=args.output_root, - site_snapshot_path=args.site_snapshot_path, - ), - ) - - print(artifacts.artifact_paths.output_dir) - print( - "PE native enhanced-CPS loss: " - f"candidate={candidate_native_loss:.6f} " - f"baseline={baseline_native_loss:.6f} " - f"delta={native_delta:.6f}" - ) - if artifacts.artifact_paths.run_registry is not None: - print(artifacts.artifact_paths.run_registry) - - -if __name__ == "__main__": - main() diff --git a/src/microplex_us/pipelines/versioned_artifacts.py b/src/microplex_us/pipelines/versioned_artifacts.py deleted file mode 100644 index e6485c73..00000000 --- a/src/microplex_us/pipelines/versioned_artifacts.py +++ /dev/null @@ -1,686 +0,0 @@ -"""Versioned build-and-save entrypoints for US Microplex artifacts.""" - -from __future__ import annotations - -from collections.abc import Mapping -from dataclasses import asdict, replace -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -from microplex.core import SourceProvider, SourceQuery -from microplex.targets import TargetProvider - -from microplex_us.pipelines.artifact_types import ( - USMicroplexArtifactPaths, - USMicroplexVersionedBuildArtifacts, -) -from microplex_us.pipelines.registry import ( - FrontierMetric, - load_us_microplex_run_registry, - select_us_microplex_frontier_entry, -) -from microplex_us.pipelines.stage_run import ( - USArtifactRef, - USDiagnosticOutput, - USRunProfileOutputs, - USStageInputOverride, -) -from microplex_us.pipelines.stage_runtime import USStageRuntimeWriter -from microplex_us.pipelines.us import ( - 
USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - build_us_microplex, -) -from microplex_us.policyengine.harness import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSHarnessSlice, -) - - -def _save_us_microplex_artifacts(*args: Any, **kwargs: Any) -> USMicroplexArtifactPaths: - from microplex_us.pipelines.artifacts import save_us_microplex_artifacts - - return save_us_microplex_artifacts(*args, **kwargs) - - -def _facade_pipeline_cls() -> type[USMicroplexPipeline]: - from microplex_us.pipelines import artifacts - - return artifacts.USMicroplexPipeline - - -def _finalize_via_facade( - build_result: USMicroplexBuildResult, - **kwargs: Any, -) -> USMicroplexVersionedBuildArtifacts: - from microplex_us.pipelines import artifacts - - finalize = artifacts._finalize_versioned_build_artifacts - if finalize is _finalize_versioned_build_artifacts: - return _finalize_versioned_build_artifacts(build_result, **kwargs) - return finalize(build_result, **kwargs) - - -def save_versioned_us_microplex_artifacts( - result: USMicroplexBuildResult, - output_root: str | Path, - *, - version_id: str | None = None, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] = (), - stage_runtime_writer: USStageRuntimeWriter | None = None, -) -> USMicroplexArtifactPaths: - """Persist a build under a stable versioned directory beneath one output root.""" - output_root = Path(output_root) - output_root.mkdir(parents=True, exist_ok=True) - resolved_version_id, output_dir = _allocate_versioned_output_dir( - output_root, - version_id=version_id, - result=result, - ) - paths = _save_us_microplex_artifacts( - result, - output_dir, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - 
precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path or output_root / "run_registry.jsonl", - run_index_path=run_index_path or output_root, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - stage_runtime_writer=stage_runtime_writer, - ) - return replace(paths, version_id=resolved_version_id) - - -def build_and_save_versioned_us_microplex( - persons: Any, - households: Any, - output_root: str | Path, - *, - config: USMicroplexBuildConfig | None = None, - version_id: str | None = None, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), -) -> USMicroplexVersionedBuildArtifacts: - """Build a US microplex dataset, save a versioned bundle, and report frontier gap.""" - build_result = build_us_microplex(persons, households, config=config) - return save_versioned_us_microplex_build_result( - build_result, - output_root, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - - -def save_versioned_us_microplex_build_result( - build_result: USMicroplexBuildResult, - output_root: str | Path, - *, - version_id: str | None = None, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] = (), -) -> USMicroplexVersionedBuildArtifacts: - """Save an already-built result as a versioned bundle and report frontier gap.""" - return _finalize_via_facade( - build_result, - output_root=output_root, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - 
enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - - -def build_and_save_versioned_us_microplex_from_source_provider( - provider: SourceProvider, - output_root: str | Path, - *, - config: USMicroplexBuildConfig | None = None, - query: SourceQuery | None = None, - version_id: str | None = None, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), -) -> USMicroplexVersionedBuildArtifacts: - """Build from one source provider, save a versioned bundle, and report frontier gap.""" - pipeline = _facade_pipeline_cls()(config) - build_result = pipeline.build_from_source_provider(provider, query=query) - return _finalize_via_facade( - build_result, - output_root=output_root, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - - -def build_and_save_versioned_us_microplex_from_source_providers( - providers: list[SourceProvider], - output_root: str | Path, - *, - config: USMicroplexBuildConfig | None = None, - queries: dict[str, SourceQuery] | None = None, - version_id: str | None = None, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: 
str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] = (), -) -> USMicroplexVersionedBuildArtifacts: - """Build from multiple source providers, save a versioned bundle, and report frontier gap.""" - resolved_config = config or USMicroplexBuildConfig() - _resolved_version_id, preallocated_output_dir, stage_runtime_writer = ( - _initialize_versioned_stage_runtime_writer( - output_root, - version_id=version_id, - config=resolved_config, - providers=providers, - queries=queries, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - ) - pipeline = _facade_pipeline_cls()( - resolved_config, - stage_runtime_writer=stage_runtime_writer, - ) - build_result = pipeline.build_from_source_providers(providers, queries=queries) - return _finalize_via_facade( - build_result, - output_root=output_root, - version_id=version_id, - preallocated_output_dir=preallocated_output_dir, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - 
policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - stage_runtime_writer=stage_runtime_writer, - ) - - -def build_and_save_versioned_us_microplex_from_data_dir( - data_dir: str | Path, - output_root: str | Path, - *, - config: USMicroplexBuildConfig | None = None, - version_id: str | None = None, - frontier_metric: FrontierMetric = "candidate_composite_parity_loss", - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None = None, - policyengine_target_provider: TargetProvider | None = None, - policyengine_baseline_dataset: str | Path | None = None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] 
| list[PolicyEngineUSHarnessSlice] | None - ) = None, - policyengine_harness_metadata: dict[str, Any] | None = None, - policyengine_us_data_repo: str | Path | None = None, - defer_policyengine_harness: bool = False, - require_policyengine_native_score: bool = False, - defer_policyengine_native_score: bool = False, - precomputed_policyengine_harness_payload: dict[str, Any] | None = None, - precomputed_policyengine_native_scores: dict[str, Any] | None = None, - run_registry_path: str | Path | None = None, - run_index_path: str | Path | None = None, - run_registry_metadata: dict[str, Any] | None = None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] = (), -) -> USMicroplexVersionedBuildArtifacts: - """Build from a CPS-style parquet directory, save a versioned bundle, and report frontier gap.""" - pipeline = _facade_pipeline_cls()(config) - build_result = pipeline.build_from_data_dir(data_dir) - return _finalize_via_facade( - build_result, - output_root=output_root, - version_id=version_id, - frontier_metric=frontier_metric, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - 
run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - - -def _finalize_versioned_build_artifacts( - build_result: USMicroplexBuildResult, - *, - output_root: str | Path, - version_id: str | None, - preallocated_output_dir: str | Path | None = None, - frontier_metric: FrontierMetric, - policyengine_comparison_cache: PolicyEngineUSComparisonCache | None, - policyengine_target_provider: TargetProvider | None, - policyengine_baseline_dataset: str | Path | None, - policyengine_harness_slices: ( - tuple[PolicyEngineUSHarnessSlice, ...] | list[PolicyEngineUSHarnessSlice] | None - ), - policyengine_harness_metadata: dict[str, Any] | None, - policyengine_us_data_repo: str | Path | None, - defer_policyengine_harness: bool, - require_policyengine_native_score: bool, - defer_policyengine_native_score: bool, - precomputed_policyengine_harness_payload: dict[str, Any] | None, - precomputed_policyengine_native_scores: dict[str, Any] | None, - run_registry_path: str | Path | None, - run_index_path: str | Path | None, - run_registry_metadata: dict[str, Any] | None, - enable_child_tax_unit_agi_drift: bool = False, - child_tax_unit_agi_drift_variables: tuple[str, ...] | None = None, - allow_stage_input_overrides: bool = False, - stage_input_overrides: tuple[USStageInputOverride, ...] 
= (), - stage_runtime_writer: USStageRuntimeWriter | None = None, -) -> USMicroplexVersionedBuildArtifacts: - if preallocated_output_dir is not None: - output_root_path = Path(output_root) - output_dir = Path(preallocated_output_dir) - artifact_paths = _save_us_microplex_artifacts( - build_result, - output_dir, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path - or output_root_path / "run_registry.jsonl", - run_index_path=run_index_path or output_root_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - stage_runtime_writer=stage_runtime_writer, - ) - artifact_paths = replace(artifact_paths, version_id=output_dir.name) - else: - artifact_paths = save_versioned_us_microplex_artifacts( - build_result, - output_root, - version_id=version_id, - policyengine_comparison_cache=policyengine_comparison_cache, - policyengine_target_provider=policyengine_target_provider, - policyengine_baseline_dataset=policyengine_baseline_dataset, - policyengine_harness_slices=policyengine_harness_slices, - 
policyengine_harness_metadata=policyengine_harness_metadata, - policyengine_us_data_repo=policyengine_us_data_repo, - defer_policyengine_harness=defer_policyengine_harness, - require_policyengine_native_score=require_policyengine_native_score, - defer_policyengine_native_score=defer_policyengine_native_score, - precomputed_policyengine_harness_payload=precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores=precomputed_policyengine_native_scores, - run_registry_path=run_registry_path, - run_index_path=run_index_path, - run_registry_metadata=run_registry_metadata, - enable_child_tax_unit_agi_drift=enable_child_tax_unit_agi_drift, - child_tax_unit_agi_drift_variables=child_tax_unit_agi_drift_variables, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - stage_runtime_writer=stage_runtime_writer, - ) - current_entry = None - frontier_entry = None - frontier_delta = None - if ( - artifact_paths.run_registry is not None - and artifact_paths.version_id is not None - ): - registry_entries = load_us_microplex_run_registry(artifact_paths.run_registry) - current_entry = next( - ( - entry - for entry in reversed(registry_entries) - if entry.artifact_id == artifact_paths.version_id - ), - None, - ) - frontier_entry = select_us_microplex_frontier_entry( - artifact_paths.run_registry, - metric=frontier_metric, - ) - current_value = _registry_metric_value(current_entry, frontier_metric) - frontier_value = _registry_metric_value(frontier_entry, frontier_metric) - if current_value is not None and frontier_value is not None: - frontier_delta = current_value - frontier_value - return USMicroplexVersionedBuildArtifacts( - build_result=build_result, - artifact_paths=artifact_paths, - current_entry=current_entry, - frontier_entry=frontier_entry, - frontier_delta=frontier_delta, - ) - - -def _allocate_versioned_output_dir( - output_root: Path, - *, - version_id: str | None, - result: USMicroplexBuildResult, 
-) -> tuple[str, Path]: - return _allocate_versioned_output_dir_for_config( - output_root, - version_id=version_id, - config=result.config.to_dict(), - ) - - -def _allocate_versioned_output_dir_for_config( - output_root: Path, - *, - version_id: str | None, - config: dict[str, Any], -) -> tuple[str, Path]: - if version_id is not None: - output_dir = output_root / version_id - if output_dir.exists(): - raise FileExistsError( - f"Versioned artifact directory already exists: {output_dir}" - ) - return version_id, output_dir - - config_hash = _short_config_hash(config) - timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") - base_version_id = f"{timestamp}-{config_hash}" - candidate_version_id = base_version_id - suffix = 2 - output_dir = output_root / candidate_version_id - while output_dir.exists(): - candidate_version_id = f"{base_version_id}-{suffix}" - output_dir = output_root / candidate_version_id - suffix += 1 - return candidate_version_id, output_dir - - -def _short_config_hash(config: dict[str, Any]) -> str: - import hashlib - import json - - payload = json.dumps(config, sort_keys=True, separators=(",", ":")) - return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8] - - -def _initialize_versioned_stage_runtime_writer( - output_root: str | Path, - *, - version_id: str | None, - config: USMicroplexBuildConfig, - providers: list[SourceProvider], - queries: dict[str, SourceQuery] | None, - allow_stage_input_overrides: bool, - stage_input_overrides: tuple[USStageInputOverride, ...], -) -> tuple[str, Path, USStageRuntimeWriter]: - root = Path(output_root) - root.mkdir(parents=True, exist_ok=True) - resolved_version_id, output_dir = _allocate_versioned_output_dir_for_config( - root, - version_id=version_id, - config=config.to_dict(), - ) - provider_query_plan = _provider_query_plan(providers, queries) - writer = USStageRuntimeWriter( - output_dir, - manifest_payload={ - "created_at": datetime.now(UTC).isoformat(), - "config": config.to_dict(), - 
"artifacts": {"manifest": "manifest.json"}, - }, - allow_stage_input_overrides=allow_stage_input_overrides, - stage_input_overrides=stage_input_overrides, - ) - writer.start_stage( - "01_run_profile", - metadata={"version_id": resolved_version_id}, - ) - writer.complete_stage( - USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config=config.to_dict(), - provider_query_plan=provider_query_plan, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description="Runtime run-profile summary.", - summary={ - "provider_names": provider_query_plan["provider_names"], - "version_id": resolved_version_id, - }, - ) - }, - ) - ) - return resolved_version_id, output_dir, writer - - -def _provider_query_plan( - providers: list[SourceProvider], - queries: dict[str, SourceQuery] | None, -) -> dict[str, Any]: - return { - "provider_names": [provider.descriptor.name for provider in providers], - "queries": { - key: _json_ready_query(query) for key, query in dict(queries or {}).items() - }, - } - - -def _json_ready_query(query: SourceQuery) -> dict[str, Any]: - if hasattr(query, "to_dict"): - payload = query.to_dict() - if isinstance(payload, dict): - return payload - if hasattr(query, "__dataclass_fields__"): - return _json_ready(asdict(query)) - return _json_ready(vars(query)) - - -def _json_ready(value: Any) -> Any: - if isinstance(value, Mapping): - return {str(key): _json_ready(item) for key, item in value.items()} - if isinstance(value, (tuple, list, set, frozenset)): - return [_json_ready(item) for item in value] - if isinstance(value, Path): - return str(value) - if hasattr(value, "value"): - return value.value - return value - - -def _registry_metric_value(entry: Any | None, metric: FrontierMetric) -> float | None: - if entry is None: - return None - return getattr(entry, metric, None) diff --git a/src/microplex_us/policyengine/__init__.py 
b/src/microplex_us/policyengine/__init__.py deleted file mode 100644 index af78a95f..00000000 --- a/src/microplex_us/policyengine/__init__.py +++ /dev/null @@ -1,91 +0,0 @@ -"""US PolicyEngine integration helpers.""" - -from microplex_us.policyengine.comparison import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSMaterializationError, - PolicyEngineUSTargetComparisonReport, - PolicyEngineUSTargetEvaluation, - PolicyEngineUSTargetEvaluationReport, - compare_policyengine_us_target_query_to_baseline, - evaluate_policyengine_us_target_query, - evaluate_policyengine_us_target_set, - evaluate_policyengine_us_target_sets, - slice_policyengine_us_target_evaluation_report, -) -from microplex_us.policyengine.harness import ( - PolicyEngineUSHarnessRun, - PolicyEngineUSHarnessSlice, - PolicyEngineUSHarnessSliceResult, - default_policyengine_us_db_all_target_slices, - default_policyengine_us_db_harness_slices, - default_policyengine_us_db_parity_slices, - default_policyengine_us_harness_slices, - evaluate_policyengine_us_harness, - filter_nonempty_policyengine_us_harness_slices, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSConstraint, - PolicyEngineUSDBTarget, - PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSMicrosimulationAdapter, - PolicyEngineUSQuantityTarget, - PolicyEngineUSSimulationTargetCompiler, - PolicyEngineUSVariableBinding, - build_policyengine_us_export_column_names, - build_policyengine_us_time_period_arrays, - compile_policyengine_us_household_linear_constraints, - compute_policyengine_us_definition_hash, - detect_policyengine_pseudo_inputs, - filter_supported_policyengine_us_targets, - infer_policyengine_us_variable_bindings, - load_policyengine_us_entity_tables, - materialize_policyengine_us_variables, - policyengine_us_formula_variables_for_targets, - policyengine_us_variables_to_materialize, - project_frame_to_time_period_arrays, - write_policyengine_us_time_period_dataset, -) - -__all__ = [ - 
"PolicyEngineUSTargetComparisonReport", - "PolicyEngineUSTargetEvaluation", - "PolicyEngineUSTargetEvaluationReport", - "PolicyEngineUSMaterializationError", - "PolicyEngineUSComparisonCache", - "PolicyEngineUSHarnessRun", - "PolicyEngineUSHarnessSlice", - "PolicyEngineUSHarnessSliceResult", - "compare_policyengine_us_target_query_to_baseline", - "default_policyengine_us_db_all_target_slices", - "default_policyengine_us_db_parity_slices", - "default_policyengine_us_db_harness_slices", - "default_policyengine_us_harness_slices", - "evaluate_policyengine_us_harness", - "evaluate_policyengine_us_target_sets", - "filter_nonempty_policyengine_us_harness_slices", - "evaluate_policyengine_us_target_query", - "evaluate_policyengine_us_target_set", - "slice_policyengine_us_target_evaluation_report", - "PolicyEngineUSConstraint", - "PolicyEngineUSDBTarget", - "PolicyEngineUSDBTargetProvider", - "PolicyEngineUSEntityTableBundle", - "PolicyEngineUSMicrosimulationAdapter", - "PolicyEngineUSQuantityTarget", - "PolicyEngineUSSimulationTargetCompiler", - "PolicyEngineUSVariableBinding", - "build_policyengine_us_export_column_names", - "build_policyengine_us_time_period_arrays", - "compile_policyengine_us_household_linear_constraints", - "compute_policyengine_us_definition_hash", - "detect_policyengine_pseudo_inputs", - "filter_supported_policyengine_us_targets", - "infer_policyengine_us_variable_bindings", - "load_policyengine_us_entity_tables", - "materialize_policyengine_us_variables", - "policyengine_us_formula_variables_for_targets", - "policyengine_us_variables_to_materialize", - "project_frame_to_time_period_arrays", - "write_policyengine_us_time_period_dataset", -] diff --git a/src/microplex_us/policyengine/aotc.py b/src/microplex_us/policyengine/aotc.py deleted file mode 100644 index 1eeee85d..00000000 --- a/src/microplex_us/policyengine/aotc.py +++ /dev/null @@ -1,85 +0,0 @@ -"""American Opportunity Tax Credit (AOTC) helpers backed by PolicyEngine-US. 
- -Mirrors the credit-to-expenses inverse in the enhanced-CPS baseline (the -``_aotc_qualifying_expenses_from_credit`` staticmethod of ``ExtendedCPS`` in -``PolicyEngine/policyengine-us-data``, unmerged branch -``codex/fix-aotc-eligibility``) so the Microplex AOTC eligibility-input -construction back-solves per-student qualified expenses the same way. Where -the enhanced CPS hard-codes the AOTC bracket constants, these functions read -only the published -``gov.irs.credits.education.american_opportunity_credit.amount`` marginal -schedule, so they stay in lock-step with PolicyEngine-US parameter updates. -""" - -from __future__ import annotations - -import math -from functools import lru_cache - -import numpy as np - - -@lru_cache(maxsize=16) -def get_american_opportunity_credit_amount_scale(year: int): - """Return the PolicyEngine-US AOTC amount scale for a tax year.""" - from policyengine_us import CountryTaxBenefitSystem - - return CountryTaxBenefitSystem().parameters.gov.irs.credits.education.american_opportunity_credit.amount( - f"{year}-01-01" - ) - - -def qualifying_expenses_from_american_opportunity_credit( - credit: float, - year: int, -) -> float: - """Return the minimum expenses that generate ``credit`` under PE-US.""" - amount_scale = get_american_opportunity_credit_amount_scale(year) - return _minimum_base_for_marginal_amount(credit, amount_scale) - - -def maximum_american_opportunity_credit_per_student(year: int) -> float: - """Return the maximum AOTC generated by one student under PE-US.""" - amount_scale = get_american_opportunity_credit_amount_scale(year) - if len(amount_scale.thresholds) == 0: - return 0.0 - terminal_threshold = max(amount_scale.thresholds) - return float(amount_scale.calc(np.array([terminal_threshold], dtype=float))[0]) - - -def _minimum_base_for_marginal_amount(amount: float, scale) -> float: - """Invert a marginal amount schedule using the schedule brackets.""" - amount = max(float(amount), 0) - if amount == 0: - return 0.0 - - 
thresholds = np.asarray(scale.thresholds, dtype=float) - rates = np.asarray(scale.rates, dtype=float) - if thresholds.size == 0: - return 0.0 - - order = np.argsort(thresholds) - thresholds = thresholds[order] - rates = rates[order] - - accrued = 0.0 - for index, (lower, rate) in enumerate(zip(thresholds, rates)): - lower = float(lower) - rate = float(rate) - upper = ( - float(thresholds[index + 1]) if index + 1 < thresholds.size else math.inf - ) - - if amount <= accrued: - return lower - if rate <= 0: - continue - if math.isinf(upper): - return lower + (amount - accrued) / rate - - bracket_amount = (upper - lower) * rate - if amount <= accrued + bracket_amount: - return lower + (amount - accrued) / rate - accrued += bracket_amount - - return float(thresholds[-1]) diff --git a/src/microplex_us/policyengine/comparison.py b/src/microplex_us/policyengine/comparison.py deleted file mode 100644 index 7ca66f63..00000000 --- a/src/microplex_us/policyengine/comparison.py +++ /dev/null @@ -1,797 +0,0 @@ -"""Library helpers for scoring PE-US-compatible populations against target slices.""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from microplex.targets import ( - BenchmarkComparison, - BenchmarkResult, - TargetAggregation, - TargetMetric, - TargetProvider, - TargetQuery, - TargetSet, - TargetSpec, - UnsupportedTarget, - build_benchmark_result, - compare_benchmark_results, - max_abs_relative_error, - mean_abs_relative_error, - normalize_metric_payload, - relative_error_ratio, -) - -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - PolicyEngineUSVariableBinding, - compile_supported_policyengine_us_household_linear_constraints, - filter_supported_policyengine_us_targets, - infer_policyengine_us_variable_bindings, - load_policyengine_us_entity_tables, - materialize_policyengine_us_variables_safely, - 
policyengine_us_formula_variables_for_targets, - policyengine_us_variables_to_materialize, -) - -POLICYENGINE_US_BENCHMARK_GROUP_FIELDS = ( - "source", - "geographic_level", - "variable", - "is_count", -) - - -class PolicyEngineUSMaterializationError(RuntimeError): - """Raised when PE-US-derived features cannot be materialized for scoring.""" - - def __init__(self, label: str, failed_variables: dict[str, str]): - self.label = label - self.failed_variables = dict(failed_variables) - details = ", ".join( - f"{variable} ({reason})" - for variable, reason in sorted(self.failed_variables.items()) - ) - super().__init__( - f"{label} could not materialize required PolicyEngine US variables: {details}" - ) - - -def _freeze_cache_value(value: Any) -> Any: - if isinstance(value, dict): - return tuple( - sorted((str(key), _freeze_cache_value(item)) for key, item in value.items()) - ) - if isinstance(value, (list, tuple, set)): - return tuple(_freeze_cache_value(item) for item in value) - if isinstance(value, Path): - return str(value) - if isinstance(value, type): - return f"{value.__module__}.{value.__qualname__}" - return value - - -def _provider_cache_key(provider: TargetProvider) -> tuple[str, int]: - return (f"{provider.__class__.__module__}.{provider.__class__.__qualname__}", id(provider)) - - -def _query_cache_key(query: TargetQuery | None) -> tuple[Any, ...]: - if query is None: - return ("__none__",) - return ( - query.period, - query.entity.value if query.entity is not None else None, - tuple(query.names), - _freeze_cache_value(query.metadata_filters), - _freeze_cache_value(query.provider_filters), - ) - - -def _target_cache_key(target: TargetSpec) -> tuple[Any, ...]: - return ( - target.name, - target.entity.value, - target.period, - target.measure, - target.aggregation.value, - tuple( - ( - target_filter.feature, - target_filter.operator.value, - _freeze_cache_value(target_filter.value), - ) - for target_filter in target.filters - ), - float(target.value), - 
target.tolerance, - target.source, - target.units, - target.description, - _freeze_cache_value(target.metadata), - ) - - -def _target_set_cache_key( - targets: TargetSet | list[TargetSpec] | tuple[TargetSpec, ...], -) -> tuple[Any, ...]: - return tuple(_target_cache_key(target) for target in _normalize_target_list(targets)) - - -@dataclass -class PolicyEngineUSComparisonCache: - """In-memory cache for immutable PE-US comparison inputs.""" - - target_sets: dict[tuple[Any, ...], TargetSet] = field(default_factory=dict) - baseline_tables: dict[tuple[Any, ...], PolicyEngineUSEntityTableBundle] = field( - default_factory=dict - ) - baseline_reports: dict[tuple[Any, ...], PolicyEngineUSTargetEvaluationReport] = field( - default_factory=dict - ) - - def load_target_set( - self, - provider: TargetProvider, - query: TargetQuery | None, - ) -> TargetSet: - key = (_provider_cache_key(provider), _query_cache_key(query)) - target_set = self.target_sets.get(key) - if target_set is None: - target_set = provider.load_target_set(query) - self.target_sets[key] = target_set - return target_set - - def load_baseline_tables( - self, - baseline_dataset: str | Path | Any, - *, - period: int | str, - ) -> PolicyEngineUSEntityTableBundle: - dataset_key = str(baseline_dataset) if isinstance(baseline_dataset, Path) else baseline_dataset - key = (dataset_key, period) - tables = self.baseline_tables.get(key) - if tables is None: - tables = load_policyengine_us_entity_tables( - baseline_dataset, - period=period, - ) - self.baseline_tables[key] = tables - return tables - - def load_baseline_report( - self, - *, - target_set: TargetSet, - baseline_dataset: str | Path | Any, - period: int | str, - dataset_year: int | None, - simulation_cls: Any | None, - baseline_label: str, - strict_materialization: bool, - ) -> PolicyEngineUSTargetEvaluationReport: - dataset_key = str(baseline_dataset) if isinstance(baseline_dataset, Path) else baseline_dataset - key = ( - dataset_key, - period, - dataset_year, 
- _freeze_cache_value(simulation_cls), - baseline_label, - strict_materialization, - _target_set_cache_key(target_set), - ) - report = self.baseline_reports.get(key) - if report is None: - baseline_tables = self.load_baseline_tables( - baseline_dataset, - period=period, - ) - report = evaluate_policyengine_us_target_set( - baseline_tables, - target_set, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=baseline_label, - strict_materialization=strict_materialization, - ) - self.baseline_reports[key] = report - return report - - -@dataclass(frozen=True) -class PolicyEngineUSTargetEvaluation: - """Observed value and error for a single canonical target.""" - - target: TargetSpec - actual_value: float - - @property - def absolute_error(self) -> float: - return abs(self.actual_value - float(self.target.value)) - - @property - def relative_error(self) -> float | None: - return relative_error_ratio(self.actual_value, float(self.target.value)) - - -@dataclass -class PolicyEngineUSTargetEvaluationReport: - """Summary of target-slice fit for one candidate dataset/bundle.""" - - label: str - period: int | str - evaluations: list[PolicyEngineUSTargetEvaluation] = field(default_factory=list) - unsupported_targets: list[TargetSpec] = field(default_factory=list) - materialized_variables: tuple[str, ...] 
= () - materialization_failures: dict[str, str] = field(default_factory=dict) - - @property - def mean_abs_relative_error(self) -> float | None: - metrics = self.benchmark_metrics - if not metrics: - return None - return mean_abs_relative_error(metrics) - - @property - def max_abs_relative_error(self) -> float | None: - metrics = self.benchmark_metrics - if not metrics: - return None - return max_abs_relative_error(metrics) - - @property - def supported_target_count(self) -> int: - return len(self.evaluations) - - @property - def benchmark_metrics(self) -> list[TargetMetric]: - return [ - normalize_metric_payload( - { - "name": evaluation.target.name, - "estimate": evaluation.actual_value, - "target": float(evaluation.target.value), - "metadata": { - "source": evaluation.target.source, - "entity": evaluation.target.entity.value, - "measure": evaluation.target.measure, - "aggregation": evaluation.target.aggregation.value, - **dict(evaluation.target.metadata), - }, - } - ) - for evaluation in self.evaluations - ] - - @property - def benchmark_result(self) -> BenchmarkResult: - return build_benchmark_result( - label=self.label, - time_period=self.period, - metrics=self.benchmark_metrics, - target_count=(len(self.evaluations) + len(self.unsupported_targets)), - unsupported_targets=[ - UnsupportedTarget( - name=target.name, - reason="unsupported", - metadata={ - "entity": target.entity.value, - "measure": target.measure, - "aggregation": target.aggregation.value, - **dict(target.metadata), - }, - ) - for target in self.unsupported_targets - ], - metadata={ - "materialized_variables": list(self.materialized_variables), - "materialization_failures": dict(self.materialization_failures), - }, - ) - - -@dataclass -class PolicyEngineUSTargetComparisonReport: - """Side-by-side fit reports for a Microplex candidate and a PE baseline.""" - - candidate: PolicyEngineUSTargetEvaluationReport - baseline: PolicyEngineUSTargetEvaluationReport | None = None - - @property - def 
benchmark_comparison(self) -> BenchmarkComparison | None: - if self.baseline is None: - return None - try: - return compare_benchmark_results( - self.candidate.benchmark_result, - self.baseline.benchmark_result, - group_fields=POLICYENGINE_US_BENCHMARK_GROUP_FIELDS, - ) - except ValueError: - return None - - @property - def mean_abs_relative_error_delta(self) -> float | None: - comparison = self.benchmark_comparison - if comparison is None: - return None - return comparison.mean_abs_relative_error_delta - - @property - def target_win_rate(self) -> float | None: - comparison = self.benchmark_comparison - if comparison is None: - return None - return comparison.target_win_rate - - @property - def common_target_count(self) -> int: - comparison = self.benchmark_comparison - if comparison is None: - return 0 - return comparison.common_target_count - - @property - def deltas(self): - comparison = self.benchmark_comparison - if comparison is None: - return [] - return comparison.deltas - - @property - def grouped_summaries(self): - comparison = self.benchmark_comparison - if comparison is None: - return {} - return comparison.grouped_summaries - - -def evaluate_policyengine_us_target_set( - tables: PolicyEngineUSEntityTableBundle, - targets: TargetSet | list[TargetSpec] | tuple[TargetSpec, ...], - *, - period: int | str, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - label: str = "candidate", - strict_materialization: bool = False, - direct_override_variables: tuple[str, ...] 
= (), -) -> PolicyEngineUSTargetEvaluationReport: - """Evaluate canonical targets against a PE-US-style entity-table bundle.""" - target_list = _normalize_target_list(targets) - working_tables = tables - bindings = infer_policyengine_us_variable_bindings(working_tables) - force_materialize_variables = policyengine_us_formula_variables_for_targets( - target_list, - simulation_cls=simulation_cls, - direct_override_variables=direct_override_variables, - ) - variables_to_materialize = policyengine_us_variables_to_materialize( - target_list, - bindings, - force_materialize_variables=force_materialize_variables, - ) - materialization_result = materialize_policyengine_us_variables_safely( - working_tables, - variables=tuple(sorted(variables_to_materialize)), - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - direct_override_variables=direct_override_variables, - ) - working_tables = materialization_result.tables - unmaterialized_forced_variables = ( - force_materialize_variables - & variables_to_materialize - - set(materialization_result.bindings) - ) - bindings = { - variable: binding - for variable, binding in bindings.items() - if variable not in unmaterialized_forced_variables - } - bindings = { - **bindings, - **materialization_result.bindings, - } - materialized_variables = materialization_result.materialized_variables - materialization_failures = materialization_result.failed_variables - if strict_materialization and materialization_failures: - raise PolicyEngineUSMaterializationError(label, materialization_failures) - - supported_targets = filter_supported_policyengine_us_targets( - target_list, - working_tables, - bindings, - ) - supported_targets = _filter_targets_with_supported_dataset_geography( - supported_targets, - tables=working_tables, - bindings=bindings, - ) - supported_target_keys = { - _target_cache_key(target) - for target in supported_targets - } - unsupported_targets = [ - target - for target in target_list - if 
_target_cache_key(target) not in supported_target_keys - ] - household_weights = _household_weights(working_tables) - linear_targets = [ - target - for target in supported_targets - if target.aggregation is not TargetAggregation.MEAN - ] - mean_targets = [ - target for target in supported_targets if target.aggregation is TargetAggregation.MEAN - ] - linear_targets, compile_unsupported_targets, constraints = ( - compile_supported_policyengine_us_household_linear_constraints( - linear_targets, - working_tables, - variable_bindings=bindings, - ) - ) - unsupported_targets.extend(compile_unsupported_targets) - evaluations: list[PolicyEngineUSTargetEvaluation] = [] - for target, constraint in zip(linear_targets, constraints, strict=True): - evaluations.append( - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=float(np.dot(household_weights, constraint.coefficients)), - ) - ) - for target in mean_targets: - try: - actual_value = _evaluate_target_value( - target, - tables=working_tables, - bindings=bindings, - household_weights=household_weights, - ) - except ValueError as error: - if "Cross-entity constraints are only supported" in str(error): - unsupported_targets.append(target) - continue - raise - evaluations.append( - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=actual_value, - ) - ) - - return PolicyEngineUSTargetEvaluationReport( - label=label, - period=period, - evaluations=evaluations, - unsupported_targets=unsupported_targets, - materialized_variables=tuple(materialized_variables), - materialization_failures=materialization_failures, - ) - - -def slice_policyengine_us_target_evaluation_report( - report: PolicyEngineUSTargetEvaluationReport, - targets: TargetSet | list[TargetSpec] | tuple[TargetSpec, ...], -) -> PolicyEngineUSTargetEvaluationReport: - """Project a union target-evaluation report down to one target subset.""" - target_list = _normalize_target_list(targets) - evaluations_by_name = { - evaluation.target.name: 
evaluation for evaluation in report.evaluations - } - unsupported_by_name = { - target.name: target for target in report.unsupported_targets - } - evaluations: list[PolicyEngineUSTargetEvaluation] = [] - unsupported_targets: list[TargetSpec] = [] - for target in target_list: - evaluation = evaluations_by_name.get(target.name) - if evaluation is not None: - evaluations.append(evaluation) - continue - unsupported_targets.append(unsupported_by_name.get(target.name, target)) - return PolicyEngineUSTargetEvaluationReport( - label=report.label, - period=report.period, - evaluations=evaluations, - unsupported_targets=unsupported_targets, - materialized_variables=report.materialized_variables, - materialization_failures=dict(report.materialization_failures), - ) - - -def evaluate_policyengine_us_target_sets( - tables: PolicyEngineUSEntityTableBundle, - target_sets: dict[str, TargetSet], - *, - period: int | str, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - label: str = "candidate", - strict_materialization: bool = False, - direct_override_variables: tuple[str, ...] 
= (), -) -> dict[str, PolicyEngineUSTargetEvaluationReport]: - """Evaluate the union of multiple target sets once, then slice the report back out.""" - union_targets: list[TargetSpec] = [] - seen_targets: dict[str, tuple[Any, ...]] = {} - for target_set in target_sets.values(): - for target in _normalize_target_list(target_set): - target_key = _target_cache_key(target) - existing_key = seen_targets.get(target.name) - if existing_key is None: - seen_targets[target.name] = target_key - union_targets.append(target) - continue - if existing_key != target_key: - raise ValueError( - "PolicyEngine US target-set union encountered conflicting " - f"definitions for target '{target.name}'" - ) - union_report = evaluate_policyengine_us_target_set( - tables, - union_targets, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=label, - strict_materialization=strict_materialization, - direct_override_variables=direct_override_variables, - ) - return { - name: slice_policyengine_us_target_evaluation_report(union_report, target_set) - for name, target_set in target_sets.items() - } - - -def evaluate_policyengine_us_target_query( - tables: PolicyEngineUSEntityTableBundle, - provider: TargetProvider, - query: TargetQuery | None = None, - *, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - label: str = "candidate", - strict_materialization: bool = False, - direct_override_variables: tuple[str, ...] 
= (), -) -> PolicyEngineUSTargetEvaluationReport: - """Load canonical targets from a provider and evaluate them against tables.""" - target_set = provider.load_target_set(query) - period = query.period if query is not None else 2024 - return evaluate_policyengine_us_target_set( - tables, - target_set, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=label, - strict_materialization=strict_materialization, - direct_override_variables=direct_override_variables, - ) - - -def compare_policyengine_us_target_query_to_baseline( - candidate_tables: PolicyEngineUSEntityTableBundle, - provider: TargetProvider, - query: TargetQuery | None, - *, - baseline_dataset: str | Any, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - candidate_label: str = "microplex", - baseline_label: str = "policyengine_baseline", - strict_materialization: bool = True, - cache: PolicyEngineUSComparisonCache | None = None, - candidate_direct_override_variables: tuple[str, ...] 
= (), -) -> PolicyEngineUSTargetComparisonReport: - """Compare a candidate PE-US bundle to a baseline PE dataset on one target slice.""" - target_set = ( - cache.load_target_set(provider, query) - if cache is not None - else provider.load_target_set(query) - ) - period = query.period if query is not None else 2024 - baseline_report = ( - cache.load_baseline_report( - target_set=target_set, - baseline_dataset=baseline_dataset, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - baseline_label=baseline_label, - strict_materialization=strict_materialization, - ) - if cache is not None - else evaluate_policyengine_us_target_set( - load_policyengine_us_entity_tables( - baseline_dataset, - period=period, - ), - target_set, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=baseline_label, - strict_materialization=strict_materialization, - ) - ) - candidate_report = evaluate_policyengine_us_target_set( - candidate_tables, - target_set, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=candidate_label, - strict_materialization=strict_materialization, - direct_override_variables=candidate_direct_override_variables, - ) - return PolicyEngineUSTargetComparisonReport( - candidate=candidate_report, - baseline=baseline_report, - ) - - -def _normalize_target_list( - targets: TargetSet | list[TargetSpec] | tuple[TargetSpec, ...], -) -> list[TargetSpec]: - if isinstance(targets, TargetSet): - return list(targets.targets) - return list(targets) - - -def _household_weights(tables: PolicyEngineUSEntityTableBundle) -> np.ndarray: - households = tables.households - if "household_weight" in households.columns: - values = households["household_weight"] - elif "weight" in households.columns: - values = households["weight"] - else: - raise ValueError( - "Household table must contain 'household_weight' or 'weight' for evaluation" - ) - return np.asarray(values, dtype=float) - - -def 
_evaluate_target_value( - target: TargetSpec, - *, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - household_weights: np.ndarray, -) -> float: - if target.aggregation is TargetAggregation.MEAN: - numerator_target = TargetSpec( - name=f"{target.name}__numerator", - entity=target.entity, - value=0.0, - period=target.period, - measure=target.measure, - aggregation=TargetAggregation.SUM, - filters=target.filters, - tolerance=target.tolerance, - source=target.source, - units=target.units, - description=target.description, - metadata=dict(target.metadata), - ) - denominator_target = TargetSpec( - name=f"{target.name}__denominator", - entity=target.entity, - value=0.0, - period=target.period, - aggregation=TargetAggregation.COUNT, - filters=target.filters, - tolerance=target.tolerance, - source=target.source, - units=target.units, - description=target.description, - metadata=dict(target.metadata), - ) - numerator = _evaluate_linear_target( - numerator_target, - tables=tables, - bindings=bindings, - household_weights=household_weights, - ) - denominator = _evaluate_linear_target( - denominator_target, - tables=tables, - bindings=bindings, - household_weights=household_weights, - ) - if denominator == 0.0: - return float("nan") - return numerator / denominator - - return _evaluate_linear_target( - target, - tables=tables, - bindings=bindings, - household_weights=household_weights, - ) - - -def _filter_targets_with_supported_dataset_geography( - targets: list[TargetSpec], - *, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], -) -> list[TargetSpec]: - """Drop targets that require geography the current dataset only encodes as defaults.""" - if _has_nondefault_bound_feature_values( - "congressional_district_geoid", - tables=tables, - bindings=bindings, - ): - return targets - - return [ - target - for target in targets - if not any( - target_filter.feature == 
"congressional_district_geoid" - for target_filter in target.filters - ) - ] - - -def _has_nondefault_bound_feature_values( - feature: str, - *, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], -) -> bool: - binding = bindings.get(feature) - if binding is None: - return False - - column = binding.column or feature - try: - table = tables.table_for(binding.entity) - except KeyError: - return False - if column not in table.columns: - return False - - values = pd.Series(table[column]).dropna() - if values.empty: - return False - - numeric_values = pd.to_numeric(values, errors="coerce") - if numeric_values.notna().all(): - return bool((numeric_values != 0).any()) - - normalized = values.astype(str).str.strip() - nondefault_values = normalized[ - ~normalized.isin({"", "0", "0.0", "nan", "None", ""}) - ] - return not nondefault_values.empty - - -def _evaluate_linear_target( - target: TargetSpec, - *, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - household_weights: np.ndarray, -) -> float: - supported_targets, unsupported_targets, constraints = ( - compile_supported_policyengine_us_household_linear_constraints( - [target], - tables, - variable_bindings=bindings, - ) - ) - if unsupported_targets or not supported_targets: - raise ValueError( - "Cross-entity constraints are only supported against household targets " - "or household metadata" - ) - constraint = constraints[0] - return float(np.dot(household_weights, constraint.coefficients)) diff --git a/src/microplex_us/policyengine/harness.py b/src/microplex_us/policyengine/harness.py deleted file mode 100644 index 92bc17fe..00000000 --- a/src/microplex_us/policyengine/harness.py +++ /dev/null @@ -1,1444 +0,0 @@ -"""Persistent comparison harness for PE-US target slices.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass, field -from datetime import UTC, datetime -from pathlib import 
Path -from typing import Any - -import numpy as np -from microplex.targets import ( - BatchBenchmarkResultEvaluator, - BenchmarkResult, - BenchmarkSliceComparison, - BenchmarkSliceSpec, - BenchmarkSuiteResult, - FilterOperator, - TargetFilter, - TargetProvider, - TargetQuery, - TargetSet, - build_benchmark_suite_from_results, - build_benchmark_suite_result, - evaluate_benchmark_slice_results, - filter_nonempty_benchmark_slices, - load_benchmark_slice_target_sets, - union_target_sets, -) - -from microplex_us.policyengine.comparison import ( - POLICYENGINE_US_BENCHMARK_GROUP_FIELDS, - PolicyEngineUSComparisonCache, - PolicyEngineUSTargetComparisonReport, - PolicyEngineUSTargetEvaluation, - PolicyEngineUSTargetEvaluationReport, - evaluate_policyengine_us_target_set, - evaluate_policyengine_us_target_sets, - slice_policyengine_us_target_evaluation_report, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - load_policyengine_us_entity_tables, -) - -COMPOSITE_PARITY_LOSS_WEIGHTS = { - "micro": 0.35, - "attribute_macro": 0.35, - "attribute_tail": 0.20, - "support_gap": 0.10, -} -ATTRIBUTE_TAIL_FRACTION = 0.10 -UNSPECIFIED_ATTRIBUTE = "__unknown__" - - -PolicyEngineUSHarnessSlice = BenchmarkSliceSpec - - -@dataclass -class PolicyEngineUSHarnessSliceResult: - """Comparison result for one named harness slice.""" - - slice: PolicyEngineUSHarnessSlice - comparison: PolicyEngineUSTargetComparisonReport - - @property - def candidate_mean_abs_relative_error(self) -> float | None: - return self.comparison.candidate.mean_abs_relative_error - - @property - def baseline_mean_abs_relative_error(self) -> float | None: - if self.comparison.baseline is None: - return None - return self.comparison.baseline.mean_abs_relative_error - - @property - def mean_abs_relative_error_delta(self) -> float | None: - return self.comparison.mean_abs_relative_error_delta - - @property - def candidate_beats_baseline(self) -> bool | None: - delta = 
self.mean_abs_relative_error_delta - if delta is None: - return None - return delta < 0.0 - - @property - def benchmark_slice_result(self) -> BenchmarkSliceComparison | None: - comparison = self.comparison.benchmark_comparison - if comparison is None: - return None - return BenchmarkSliceComparison( - slice=BenchmarkSliceSpec( - name=self.slice.name, - query=self.slice.query, - description=self.slice.description, - tags=self.slice.tags, - ), - comparison=comparison, - ) - - -@dataclass -class PolicyEngineUSHarnessRun: - """Persistent PE-US harness evaluation across multiple target slices.""" - - candidate_label: str - baseline_label: str - period: int | str - slice_results: list[PolicyEngineUSHarnessSliceResult] = field(default_factory=list) - created_at: str = field( - default_factory=lambda: datetime.now(UTC).replace(microsecond=0).isoformat() - ) - metadata: dict[str, Any] = field(default_factory=dict) - _benchmark_suite: BenchmarkSuiteResult | None = field(default=None, repr=False) - - @property - def benchmark_suite(self) -> BenchmarkSuiteResult: - if self._benchmark_suite is not None: - return self._benchmark_suite - comparable_results = [ - result for result in self.slice_results if result.comparison.baseline is not None - ] - return build_benchmark_suite_from_results( - candidate_label=self.candidate_label, - baseline_label=self.baseline_label, - period=self.period, - slices=[result.slice for result in comparable_results], - candidate_results={ - result.slice.name: result.comparison.candidate.benchmark_result - for result in comparable_results - }, - baseline_results={ - result.slice.name: result.comparison.baseline.benchmark_result - for result in comparable_results - if result.comparison.baseline is not None - }, - group_fields=POLICYENGINE_US_BENCHMARK_GROUP_FIELDS, - created_at=self.created_at, - metadata=dict(self.metadata), - ) - - @property - def candidate_mean_abs_relative_error(self) -> float | None: - return 
self.benchmark_suite.candidate_mean_abs_relative_error - - @property - def baseline_mean_abs_relative_error(self) -> float | None: - return self.benchmark_suite.baseline_mean_abs_relative_error - - @property - def mean_abs_relative_error_delta(self) -> float | None: - return self.benchmark_suite.mean_abs_relative_error_delta - - @property - def slice_win_rate(self) -> float | None: - return self.benchmark_suite.slice_win_rate - - @property - def target_win_rate(self) -> float | None: - return self.benchmark_suite.target_win_rate - - @property - def supported_target_rate(self) -> float | None: - return self.benchmark_suite.supported_target_rate - - @property - def baseline_supported_target_rate(self) -> float | None: - return self.benchmark_suite.baseline_supported_target_rate - - @property - def candidate_micro_mean_abs_relative_error(self) -> float | None: - return self.candidate_micro_mean_abs_relative_error_for_tag(None) - - @property - def baseline_micro_mean_abs_relative_error(self) -> float | None: - return self.baseline_micro_mean_abs_relative_error_for_tag(None) - - @property - def candidate_attribute_macro_mean_abs_relative_error(self) -> float | None: - return self.candidate_attribute_macro_mean_abs_relative_error_for_tag(None) - - @property - def baseline_attribute_macro_mean_abs_relative_error(self) -> float | None: - return self.baseline_attribute_macro_mean_abs_relative_error_for_tag(None) - - @property - def candidate_attribute_tail_mean_abs_relative_error(self) -> float | None: - return self.candidate_attribute_tail_mean_abs_relative_error_for_tag(None) - - @property - def baseline_attribute_tail_mean_abs_relative_error(self) -> float | None: - return self.baseline_attribute_tail_mean_abs_relative_error_for_tag(None) - - @property - def candidate_composite_parity_loss(self) -> float | None: - return self.candidate_composite_parity_loss_for_tag(None) - - @property - def baseline_composite_parity_loss(self) -> float | None: - return 
self.baseline_composite_parity_loss_for_tag(None) - - @property - def composite_parity_loss_delta(self) -> float | None: - candidate_loss = self.candidate_composite_parity_loss - baseline_loss = self.baseline_composite_parity_loss - if candidate_loss is None or baseline_loss is None: - return None - return candidate_loss - baseline_loss - - @property - def attribute_cell_summaries(self) -> dict[str, dict[str, Any]]: - return self.attribute_cell_summaries_for_tag(None) - - @property - def tag_summaries(self) -> dict[str, dict[str, float | None]]: - tags = tuple( - dict.fromkeys( - tag - for result in self.slice_results - for tag in result.slice.tags - ) - ) - return { - tag: { - "candidate_mean_abs_relative_error": self.candidate_mean_abs_relative_error_for_tag( - tag - ), - "baseline_mean_abs_relative_error": self.baseline_mean_abs_relative_error_for_tag( - tag - ), - "mean_abs_relative_error_delta": self.mean_abs_relative_error_delta_for_tag( - tag - ), - "slice_win_rate": self.slice_win_rate_for_tag(tag), - "target_win_rate": self.target_win_rate_for_tag(tag), - "supported_target_rate": self.supported_target_rate_for_tag(tag), - "baseline_supported_target_rate": self._supported_target_rate_for_tag( - tag, - kind="baseline", - ), - "candidate_micro_mean_abs_relative_error": self.candidate_micro_mean_abs_relative_error_for_tag( - tag - ), - "baseline_micro_mean_abs_relative_error": self.baseline_micro_mean_abs_relative_error_for_tag( - tag - ), - "candidate_attribute_macro_mean_abs_relative_error": self.candidate_attribute_macro_mean_abs_relative_error_for_tag( - tag - ), - "baseline_attribute_macro_mean_abs_relative_error": self.baseline_attribute_macro_mean_abs_relative_error_for_tag( - tag - ), - "candidate_attribute_tail_mean_abs_relative_error": self.candidate_attribute_tail_mean_abs_relative_error_for_tag( - tag - ), - "baseline_attribute_tail_mean_abs_relative_error": self.baseline_attribute_tail_mean_abs_relative_error_for_tag( - tag - ), - 
"candidate_composite_parity_loss": self.candidate_composite_parity_loss_for_tag( - tag - ), - "baseline_composite_parity_loss": self.baseline_composite_parity_loss_for_tag( - tag - ), - "composite_parity_loss_delta": self.composite_parity_loss_delta_for_tag( - tag - ), - } - for tag in tags - } - - @property - def parity_scorecard(self) -> dict[str, dict[str, float | bool | None]]: - scopes = { - "overall": None, - "national": "national", - "local": "local", - "state": "state", - "district": "district", - } - scorecard: dict[str, dict[str, float | bool | None]] = {} - for scope, tag in scopes.items(): - if tag is not None and not self._slice_results_for_tag(tag): - continue - scorecard[scope] = { - "candidate_mean_abs_relative_error": self.candidate_mean_abs_relative_error_for_tag( - tag - ), - "baseline_mean_abs_relative_error": self.baseline_mean_abs_relative_error_for_tag( - tag - ), - "mean_abs_relative_error_delta": self.mean_abs_relative_error_delta_for_tag( - tag - ), - "slice_win_rate": self.slice_win_rate_for_tag(tag), - "target_win_rate": self.target_win_rate_for_tag(tag), - "supported_target_rate": self.supported_target_rate_for_tag(tag), - "baseline_supported_target_rate": self._supported_target_rate_for_tag( - tag, - kind="baseline", - ), - "candidate_micro_mean_abs_relative_error": self.candidate_micro_mean_abs_relative_error_for_tag( - tag - ), - "baseline_micro_mean_abs_relative_error": self.baseline_micro_mean_abs_relative_error_for_tag( - tag - ), - "candidate_attribute_macro_mean_abs_relative_error": self.candidate_attribute_macro_mean_abs_relative_error_for_tag( - tag - ), - "baseline_attribute_macro_mean_abs_relative_error": self.baseline_attribute_macro_mean_abs_relative_error_for_tag( - tag - ), - "candidate_attribute_tail_mean_abs_relative_error": self.candidate_attribute_tail_mean_abs_relative_error_for_tag( - tag - ), - "baseline_attribute_tail_mean_abs_relative_error": self.baseline_attribute_tail_mean_abs_relative_error_for_tag( - tag - 
), - "candidate_composite_parity_loss": self.candidate_composite_parity_loss_for_tag( - tag - ), - "baseline_composite_parity_loss": self.baseline_composite_parity_loss_for_tag( - tag - ), - "composite_parity_loss_delta": self.composite_parity_loss_delta_for_tag( - tag - ), - "candidate_beats_baseline": self._candidate_beats_baseline_for_tag(tag), - } - return scorecard - - def candidate_mean_abs_relative_error_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.candidate_mean_abs_relative_error_for_tag(tag) - - def baseline_mean_abs_relative_error_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.baseline_mean_abs_relative_error_for_tag(tag) - - def mean_abs_relative_error_delta_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.mean_abs_relative_error_delta_for_tag(tag) - - def slice_win_rate_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.slice_win_rate_for_tag(tag) - - def target_win_rate_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.target_win_rate_for_tag(tag) - - def supported_target_rate_for_tag(self, tag: str | None) -> float | None: - return self.benchmark_suite.supported_target_rate_for_tag(tag) - - def candidate_micro_mean_abs_relative_error_for_tag(self, tag: str | None) -> float | None: - return self._micro_mean_abs_relative_error_for_tag(tag, kind="candidate") - - def baseline_micro_mean_abs_relative_error_for_tag(self, tag: str | None) -> float | None: - return self._micro_mean_abs_relative_error_for_tag(tag, kind="baseline") - - def candidate_attribute_macro_mean_abs_relative_error_for_tag( - self, - tag: str | None, - ) -> float | None: - return self._attribute_macro_mean_abs_relative_error_for_tag( - tag, - kind="candidate", - ) - - def baseline_attribute_macro_mean_abs_relative_error_for_tag( - self, - tag: str | None, - ) -> float | None: - return self._attribute_macro_mean_abs_relative_error_for_tag( - 
tag, - kind="baseline", - ) - - def candidate_attribute_tail_mean_abs_relative_error_for_tag( - self, - tag: str | None, - ) -> float | None: - return self._attribute_tail_mean_abs_relative_error_for_tag( - tag, - kind="candidate", - ) - - def baseline_attribute_tail_mean_abs_relative_error_for_tag( - self, - tag: str | None, - ) -> float | None: - return self._attribute_tail_mean_abs_relative_error_for_tag( - tag, - kind="baseline", - ) - - def candidate_composite_parity_loss_for_tag(self, tag: str | None) -> float | None: - return self._composite_parity_loss_for_tag(tag, kind="candidate") - - def baseline_composite_parity_loss_for_tag(self, tag: str | None) -> float | None: - return self._composite_parity_loss_for_tag(tag, kind="baseline") - - def composite_parity_loss_delta_for_tag(self, tag: str | None) -> float | None: - candidate_loss = self.candidate_composite_parity_loss_for_tag(tag) - baseline_loss = self.baseline_composite_parity_loss_for_tag(tag) - if candidate_loss is None or baseline_loss is None: - return None - return candidate_loss - baseline_loss - - def attribute_cell_summaries_for_tag( - self, - tag: str | None, - ) -> dict[str, dict[str, Any]]: - candidate_records = self._target_records_for_tag(tag, kind="candidate") - baseline_records = self._target_records_for_tag(tag, kind="baseline") - cells: dict[str, dict[str, Any]] = {} - for kind, records in ( - ("candidate", candidate_records), - ("baseline", baseline_records), - ): - for record in records.values(): - attrs = self._target_attribute_summary(record["target"]) - cell_key = attrs["cell_key"] - summary = cells.setdefault( - cell_key, - { - **attrs, - "candidate_target_count": 0, - "candidate_supported_target_count": 0, - "baseline_target_count": 0, - "baseline_supported_target_count": 0, - "_candidate_errors": [], - "_baseline_errors": [], - }, - ) - summary[f"{kind}_target_count"] += 1 - if record["supported"]: - summary[f"{kind}_supported_target_count"] += 1 - relative_error = 
record["relative_error"] - if relative_error is not None: - summary[f"_{kind}_errors"].append(abs(relative_error)) - - for summary in cells.values(): - for kind in ("candidate", "baseline"): - errors = summary.pop(f"_{kind}_errors") - target_count = summary[f"{kind}_target_count"] - supported_count = summary[f"{kind}_supported_target_count"] - summary[f"{kind}_mean_abs_relative_error"] = ( - float(np.mean(errors)) if errors else None - ) - summary[f"{kind}_support_rate"] = ( - supported_count / target_count if target_count else None - ) - candidate_error = summary["candidate_mean_abs_relative_error"] - baseline_error = summary["baseline_mean_abs_relative_error"] - summary["mean_abs_relative_error_delta"] = ( - candidate_error - baseline_error - if candidate_error is not None and baseline_error is not None - else None - ) - return dict(sorted(cells.items())) - - def _candidate_beats_baseline_for_tag(self, tag: str | None) -> bool | None: - delta = self.mean_abs_relative_error_delta_for_tag(tag) - if delta is None: - return None - return delta < 0.0 - - def _slice_results_for_tag( - self, - tag: str | None, - ) -> list[PolicyEngineUSHarnessSliceResult]: - if tag is None: - return list(self.slice_results) - return [ - result for result in self.slice_results if tag in result.slice.tags - ] - - def _mean_abs_relative_error( - self, - *, - tag: str | None, - kind: str, - ) -> float | None: - suite = self.benchmark_suite - if kind == "candidate": - return suite.candidate_mean_abs_relative_error_for_tag(tag) - return suite.baseline_mean_abs_relative_error_for_tag(tag) - - def _supported_target_rate_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> float | None: - suite = self.benchmark_suite - if kind == "candidate": - return suite.supported_target_rate_for_tag(tag) - return suite.baseline_supported_target_rate_for_tag(tag) - - def _micro_mean_abs_relative_error_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> float | None: - errors = [ - 
abs(record["relative_error"]) - for record in self._target_records_for_tag(tag, kind=kind).values() - if record["relative_error"] is not None - ] - if not errors: - return None - return float(np.mean(errors)) - - def _attribute_macro_mean_abs_relative_error_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> float | None: - errors = [ - cell[f"{kind}_mean_abs_relative_error"] - for cell in self.attribute_cell_summaries_for_tag(tag).values() - if cell[f"{kind}_mean_abs_relative_error"] is not None - ] - if not errors: - return None - return float(np.mean(errors)) - - def _attribute_tail_mean_abs_relative_error_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> float | None: - errors = sorted( - ( - cell[f"{kind}_mean_abs_relative_error"] - for cell in self.attribute_cell_summaries_for_tag(tag).values() - if cell[f"{kind}_mean_abs_relative_error"] is not None - ), - reverse=True, - ) - if not errors: - return None - n_tail = max(1, int(np.ceil(len(errors) * ATTRIBUTE_TAIL_FRACTION))) - return float(np.mean(errors[:n_tail])) - - def _composite_parity_loss_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> float | None: - micro = self._micro_mean_abs_relative_error_for_tag(tag, kind=kind) - macro = self._attribute_macro_mean_abs_relative_error_for_tag(tag, kind=kind) - tail = self._attribute_tail_mean_abs_relative_error_for_tag(tag, kind=kind) - support_rate = self._supported_target_rate_for_tag(tag, kind=kind) - if ( - micro is None - or macro is None - or tail is None - or support_rate is None - ): - return None - weights = COMPOSITE_PARITY_LOSS_WEIGHTS - return ( - weights["micro"] * micro - + weights["attribute_macro"] * macro - + weights["attribute_tail"] * tail - + weights["support_gap"] * (1.0 - support_rate) - ) - - def _target_records_for_tag( - self, - tag: str | None, - *, - kind: str, - ) -> dict[tuple[str, str], dict[str, Any]]: - records: dict[tuple[str, str], dict[str, Any]] = {} - for result in 
self._slice_results_for_tag(tag): - report = ( - result.comparison.candidate - if kind == "candidate" - else result.comparison.baseline - ) - if report is None: - continue - for target in report.unsupported_targets: - record_key = (result.slice.name, target.name) - records.setdefault( - record_key, - { - "target": target, - "supported": False, - "relative_error": None, - }, - ) - for evaluation in report.evaluations: - records[(result.slice.name, evaluation.target.name)] = { - "target": evaluation.target, - "supported": True, - "relative_error": evaluation.relative_error, - } - return records - - def _target_attribute_summary(self, target: Any) -> dict[str, str]: - metadata = dict(getattr(target, "metadata", {}) or {}) - geo_level = str(metadata.get("geo_level") or UNSPECIFIED_ATTRIBUTE) - entity = target.entity.value - aggregation = target.aggregation.value - feature = str( - target.measure - or metadata.get("variable") - or f"{entity}_count" - ) - domain_variable = str( - metadata.get("domain_variable") or UNSPECIFIED_ATTRIBUTE - ) - cell_key = "|".join( - ( - f"geo={geo_level}", - f"entity={entity}", - f"aggregation={aggregation}", - f"feature={feature}", - f"domain={domain_variable}", - ) - ) - return { - "cell_key": cell_key, - "geo_level": geo_level, - "entity": entity, - "aggregation": aggregation, - "feature": feature, - "domain_variable": domain_variable, - } - - def to_dict(self) -> dict[str, Any]: - """Serialize the harness run to a JSON-compatible dict.""" - return { - "candidate_label": self.candidate_label, - "baseline_label": self.baseline_label, - "period": self.period, - "created_at": self.created_at, - "metadata": dict(self.metadata), - "summary": { - "candidate_mean_abs_relative_error": self.candidate_mean_abs_relative_error, - "baseline_mean_abs_relative_error": self.baseline_mean_abs_relative_error, - "mean_abs_relative_error_delta": self.mean_abs_relative_error_delta, - "slice_win_rate": self.slice_win_rate, - "target_win_rate": 
self.target_win_rate, - "supported_target_rate": self.supported_target_rate, - "baseline_supported_target_rate": self.baseline_supported_target_rate, - "candidate_micro_mean_abs_relative_error": self.candidate_micro_mean_abs_relative_error, - "baseline_micro_mean_abs_relative_error": self.baseline_micro_mean_abs_relative_error, - "candidate_attribute_macro_mean_abs_relative_error": self.candidate_attribute_macro_mean_abs_relative_error, - "baseline_attribute_macro_mean_abs_relative_error": self.baseline_attribute_macro_mean_abs_relative_error, - "candidate_attribute_tail_mean_abs_relative_error": self.candidate_attribute_tail_mean_abs_relative_error, - "baseline_attribute_tail_mean_abs_relative_error": self.baseline_attribute_tail_mean_abs_relative_error, - "candidate_composite_parity_loss": self.candidate_composite_parity_loss, - "baseline_composite_parity_loss": self.baseline_composite_parity_loss, - "composite_parity_loss_delta": self.composite_parity_loss_delta, - "tag_summaries": self.tag_summaries, - "parity_scorecard": self.parity_scorecard, - "attribute_cell_summaries": self.attribute_cell_summaries, - }, - "slices": [ - { - **result.slice.to_dict(), - "summary": _slice_result_summary(result), - "candidate": _report_to_dict(result.comparison.candidate), - "baseline": ( - _report_to_dict(result.comparison.baseline) - if result.comparison.baseline is not None - else None - ), - } - for result in self.slice_results - ], - } - - def save(self, path: str | Path) -> Path: - """Persist the harness run as JSON.""" - output_path = Path(path) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True)) - return output_path - - @classmethod - def from_dict(cls, payload: dict[str, Any]) -> PolicyEngineUSHarnessRun: - """Restore a harness run from serialized JSON payload.""" - return cls( - candidate_label=payload["candidate_label"], - baseline_label=payload["baseline_label"], - 
period=payload["period"], - created_at=payload["created_at"], - metadata=dict(payload.get("metadata", {})), - slice_results=[ - PolicyEngineUSHarnessSliceResult( - slice=PolicyEngineUSHarnessSlice.from_dict(slice_payload), - comparison=PolicyEngineUSTargetComparisonReport( - candidate=_report_from_dict(slice_payload["candidate"]), - baseline=( - _report_from_dict(slice_payload["baseline"]) - if slice_payload.get("baseline") is not None - else None - ), - ), - ) - for slice_payload in payload.get("slices", []) - ], - ) - - @classmethod - def load(cls, path: str | Path) -> PolicyEngineUSHarnessRun: - """Load a persisted harness run from JSON.""" - return cls.from_dict(json.loads(Path(path).read_text())) - - -@dataclass -class _PolicyEngineUSCandidateBatchResultEvaluator(BatchBenchmarkResultEvaluator): - tables: PolicyEngineUSEntityTableBundle - period: int | str - dataset_year: int | None - simulation_cls: Any | None - label: str - strict_materialization: bool - direct_override_variables: tuple[str, ...] 
= () - last_reports: dict[str, PolicyEngineUSTargetEvaluationReport] = field( - default_factory=dict, - init=False, - ) - - def evaluate_target_sets( - self, - target_sets: dict[str, TargetSet], - slices: tuple[BenchmarkSliceSpec, ...], - ) -> dict[str, BenchmarkResult]: - del slices - reports = evaluate_policyengine_us_target_sets( - self.tables, - target_sets, - period=self.period, - dataset_year=self.dataset_year, - simulation_cls=self.simulation_cls, - label=self.label, - strict_materialization=self.strict_materialization, - direct_override_variables=self.direct_override_variables, - ) - self.last_reports = reports - return {name: report.benchmark_result for name, report in reports.items()} - - -@dataclass -class _PolicyEngineUSBaselineBatchResultEvaluator(BatchBenchmarkResultEvaluator): - baseline_dataset: str | Any - period: int | str - dataset_year: int | None - simulation_cls: Any | None - baseline_label: str - strict_materialization: bool - cache: PolicyEngineUSComparisonCache | None - last_reports: dict[str, PolicyEngineUSTargetEvaluationReport] = field( - default_factory=dict, - init=False, - ) - - def evaluate_target_sets( - self, - target_sets: dict[str, TargetSet], - slices: tuple[BenchmarkSliceSpec, ...], - ) -> dict[str, BenchmarkResult]: - del slices - reports = _evaluate_policyengine_us_baseline_target_sets( - target_sets, - baseline_dataset=self.baseline_dataset, - period=self.period, - dataset_year=self.dataset_year, - simulation_cls=self.simulation_cls, - baseline_label=self.baseline_label, - strict_materialization=self.strict_materialization, - cache=self.cache, - ) - self.last_reports = reports - return {name: report.benchmark_result for name, report in reports.items()} - - -def evaluate_policyengine_us_harness( - candidate_tables: PolicyEngineUSEntityTableBundle, - provider: TargetProvider, - slices: list[PolicyEngineUSHarnessSlice] | tuple[PolicyEngineUSHarnessSlice, ...], - *, - baseline_dataset: str | Any, - dataset_year: int | None = 
None, - simulation_cls: Any | None = None, - candidate_label: str = "microplex", - baseline_label: str = "policyengine_baseline", - metadata: dict[str, Any] | None = None, - strict_materialization: bool = True, - cache: PolicyEngineUSComparisonCache | None = None, - candidate_direct_override_variables: tuple[str, ...] = (), -) -> PolicyEngineUSHarnessRun: - """Evaluate a candidate bundle against a baseline across named target slices.""" - if not slices: - raise ValueError("PolicyEngineUSHarness requires at least one slice") - - slice_target_sets = load_benchmark_slice_target_sets( - provider, - slices, - loader=( - (lambda effective_provider, query: cache.load_target_set(effective_provider, query)) - if cache is not None - else None - ), - ) - period = slices[0].query.period if slices[0].query.period is not None else 2024 - candidate_result_evaluator = _PolicyEngineUSCandidateBatchResultEvaluator( - tables=candidate_tables, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=candidate_label, - strict_materialization=strict_materialization, - direct_override_variables=candidate_direct_override_variables, - ) - candidate_results = evaluate_benchmark_slice_results( - slice_target_sets, - slices, - batch_evaluator=candidate_result_evaluator, - ) - baseline_result_evaluator = _PolicyEngineUSBaselineBatchResultEvaluator( - baseline_dataset=baseline_dataset, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - baseline_label=baseline_label, - strict_materialization=strict_materialization, - cache=cache, - ) - baseline_results = evaluate_benchmark_slice_results( - slice_target_sets, - slices, - batch_evaluator=baseline_result_evaluator, - ) - candidate_reports = candidate_result_evaluator.last_reports - baseline_reports = baseline_result_evaluator.last_reports - slice_results = [ - PolicyEngineUSHarnessSliceResult( - slice=slice_spec, - comparison=PolicyEngineUSTargetComparisonReport( - 
candidate=candidate_reports[slice_spec.name], - baseline=baseline_reports[slice_spec.name], - ), - ) - for slice_spec in slices - ] - comparable_slice_results = [ - result for result in slice_results if result.comparison.benchmark_comparison is not None - ] - suite_metadata = dict(metadata or {}) - if len(comparable_slice_results) != len(slice_results): - suite_metadata["excluded_slice_names"] = [ - result.slice.name - for result in slice_results - if result.comparison.benchmark_comparison is None - ] - - return PolicyEngineUSHarnessRun( - candidate_label=candidate_label, - baseline_label=baseline_label, - period=period, - slice_results=slice_results, - metadata=suite_metadata, - _benchmark_suite=( - build_benchmark_suite_from_results( - candidate_label=candidate_label, - baseline_label=baseline_label, - period=period, - slices=[result.slice for result in comparable_slice_results], - candidate_results={ - result.slice.name: candidate_results[result.slice.name] - for result in comparable_slice_results - }, - baseline_results={ - result.slice.name: baseline_results[result.slice.name] - for result in comparable_slice_results - }, - group_fields=POLICYENGINE_US_BENCHMARK_GROUP_FIELDS, - metadata=suite_metadata, - ) - if comparable_slice_results - else build_benchmark_suite_result( - candidate_label=candidate_label, - baseline_label=baseline_label, - period=period, - slice_results=[], - metadata=suite_metadata, - ) - ), - ) - - -def _evaluate_policyengine_us_baseline_target_sets( - target_sets: dict[str, TargetSet], - *, - baseline_dataset: str | Any, - period: int | str, - dataset_year: int | None, - simulation_cls: Any | None, - baseline_label: str, - strict_materialization: bool, - cache: PolicyEngineUSComparisonCache | None, -) -> dict[str, PolicyEngineUSTargetEvaluationReport]: - union_target_set = union_target_sets(target_sets) - baseline_union_report = ( - cache.load_baseline_report( - target_set=union_target_set, - baseline_dataset=baseline_dataset, - 
period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - baseline_label=baseline_label, - strict_materialization=strict_materialization, - ) - if cache is not None - else evaluate_policyengine_us_target_set( - load_policyengine_us_entity_tables( - baseline_dataset, - period=period, - ), - union_target_set, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - label=baseline_label, - strict_materialization=strict_materialization, - ) - ) - return { - name: slice_policyengine_us_target_evaluation_report( - baseline_union_report, - target_set, - ) - for name, target_set in target_sets.items() - } - - -def filter_nonempty_policyengine_us_harness_slices( - provider: TargetProvider, - slices: list[PolicyEngineUSHarnessSlice] | tuple[PolicyEngineUSHarnessSlice, ...], - *, - cache: PolicyEngineUSComparisonCache | None = None, -) -> tuple[PolicyEngineUSHarnessSlice, ...]: - """Drop harness slices that resolve to no canonical targets.""" - return filter_nonempty_benchmark_slices( - provider, - slices, - loader=( - (lambda effective_provider, query: cache.load_target_set(effective_provider, query)) - if cache is not None - else None - ), - ) - - -def default_policyengine_us_harness_slices( - *, - period: int, -) -> tuple[PolicyEngineUSHarnessSlice, ...]: - """Return a small default PE-US harness focused on parity-critical aggregates.""" - return ( - PolicyEngineUSHarnessSlice( - name="household_count", - description="National household count target slice", - query=TargetQuery(period=period, names=("household_count",)), - ), - PolicyEngineUSHarnessSlice( - name="snap", - description="National SNAP amount and recipient slices", - query=TargetQuery(period=period, names=("snap", "household_count")), - ), - PolicyEngineUSHarnessSlice( - name="california", - description="California geographic subset", - query=TargetQuery( - period=period, - metadata_filters={"geo_level": "state"}, - ), - ), - ) - - -def 
default_policyengine_us_db_all_target_slices( - *, - period: int, - reform_id: int = 0, -) -> tuple[PolicyEngineUSHarnessSlice, ...]: - """Return one benchmark slice spanning all active PE-US DB targets for a period.""" - return ( - PolicyEngineUSHarnessSlice( - name="all_targets", - description="All active PE-US DB targets for this benchmark period", - tags=("benchmark", "all_targets"), - query=TargetQuery( - period=period, - provider_filters={"reform_id": reform_id}, - ), - ), - ) - - -def default_policyengine_us_db_harness_slices( - *, - period: int, - variables: tuple[str, ...] = (), - domain_variables: tuple[str, ...] = (), - geo_levels: tuple[str, ...] = (), - reform_id: int = 0, -) -> tuple[PolicyEngineUSHarnessSlice, ...]: - """Return DB-backed default PE-US harness slices derived from target filters.""" - base_provider_filters = { - "reform_id": reform_id, - "variables": list(variables) if variables else None, - "domain_variables": list(domain_variables) if domain_variables else None, - "geo_levels": list(geo_levels) if geo_levels else None, - } - slices = [ - PolicyEngineUSHarnessSlice( - name="all_targets", - description="All PE-US DB targets selected for this build", - tags=("benchmark", "all_targets"), - query=TargetQuery( - period=period, - provider_filters={ - key: value - for key, value in base_provider_filters.items() - if value is not None - }, - ), - ) - ] - for variable in variables: - slices.append( - PolicyEngineUSHarnessSlice( - name=variable, - description=f"{variable} targets selected for this build", - query=TargetQuery( - period=period, - provider_filters={ - key: value - for key, value in { - **base_provider_filters, - "variables": [variable], - }.items() - if value is not None - }, - ), - ) - ) - return tuple(slices) - - -def default_policyengine_us_db_parity_slices( - *, - period: int, - variables: tuple[str, ...] = (), - domain_variables: tuple[str, ...] = (), - geo_levels: tuple[str, ...] 
= (), - reform_id: int = 0, -) -> tuple[PolicyEngineUSHarnessSlice, ...]: - """Return the default PE-US parity suite split across national and local loss.""" - slice_specs = [ - { - "name": "national_aggregate_core", - "description": "National aggregate calibration targets from PE-US production", - "geo_levels": ("national",), - "variables": ( - "adjusted_gross_income", - "child_support_expense", - "child_support_received", - "health_insurance_premiums_without_medicare_part_b", - "income_tax_positive", - "medicaid", - "medicare_part_b_premiums", - "other_medical_expenses", - "over_the_counter_health_expenses", - "qualified_business_income_deduction", - "rent", - "salt_deduction", - "snap", - "social_security", - "social_security_disability", - "social_security_retirement", - "spm_unit_capped_housing_subsidy", - "spm_unit_capped_work_childcare_expenses", - "ssi", - "tanf", - "tip_income", - "unemployment_compensation", - ), - "domain_variable_is_null": True, - "tags": ("parity", "national", "aggregate"), - }, - { - "name": "national_soi_amounts", - "description": "National IRS SOI amount targets used in production calibration", - "geo_levels": ("national",), - "variables": ( - "income_tax_before_credits", - "dividend_income", - "net_capital_gains", - "qualified_business_income_deduction", - "qualified_dividend_income", - "rental_income", - "salt", - "self_employment_income", - "tax_exempt_interest_income", - "tax_unit_partnership_s_corp_income", - "taxable_interest_income", - "taxable_ira_distributions", - "taxable_pension_income", - "taxable_social_security", - "unemployment_compensation", - ), - "domain_variable_values": ( - "income_tax_before_credits", - "dividend_income", - "net_capital_gains", - "qualified_business_income_deduction", - "qualified_dividend_income", - "rental_income", - "salt", - "self_employment_income", - "tax_exempt_interest_income", - "tax_unit_partnership_s_corp_income", - "taxable_interest_income", - "taxable_ira_distributions", - 
"taxable_pension_income", - "taxable_social_security", - "unemployment_compensation", - ), - "tags": ("parity", "national", "tax", "soi_amounts"), - }, - { - "name": "national_soi_filer_counts", - "description": "National IRS SOI filer-count targets used in production calibration", - "geo_levels": ("national",), - "variables": ("tax_unit_count",), - "domain_variable_values": ( - "dividend_income", - "income_tax", - "income_tax_before_credits", - "medical_expense_deduction", - "net_capital_gains", - "qualified_business_income_deduction", - "qualified_dividend_income", - "real_estate_taxes", - "rental_income", - "salt", - "self_employment_income", - "tax_exempt_interest_income", - "tax_unit_partnership_s_corp_income", - "taxable_interest_income", - "taxable_ira_distributions", - "taxable_pension_income", - "taxable_social_security", - "unemployment_compensation", - ), - "tags": ("parity", "national", "counts", "soi_filers"), - }, - { - "name": "state_programs_core", - "description": "State SNAP household counts and Medicaid recipiency counts from production calibration", - "geo_levels": ("state",), - "variables": ("household_count", "person_count"), - "domain_variable_values": ("snap", "medicaid_enrolled"), - "tags": ("parity", "local", "state", "programs"), - }, - { - "name": "district_age_counts", - "description": "District age-band person counts from production calibration", - "geo_levels": ("district",), - "variables": ("person_count",), - "domain_variable_values": ("age",), - "tags": ("parity", "local", "district", "counts", "age"), - }, - { - "name": "district_agi_counts", - "description": "District AGI-band person counts from production calibration", - "geo_levels": ("district",), - "variables": ("person_count",), - "domain_variable_values": ("adjusted_gross_income",), - "tags": ("parity", "local", "district", "counts", "agi"), - }, - { - "name": "district_snap_households", - "description": "District SNAP-recipient household counts from production 
calibration", - "geo_levels": ("district",), - "variables": ("household_count",), - "domain_variable_values": ("snap",), - "tags": ("parity", "local", "district", "programs", "snap"), - }, - { - "name": "district_income_core", - "description": "District income-component totals from production calibration", - "geo_levels": ("district",), - "variables": ( - "real_estate_taxes", - "self_employment_income", - "taxable_pension_income", - "unemployment_compensation", - ), - "domain_variable_values": ( - "real_estate_taxes", - "self_employment_income", - "taxable_pension_income", - "unemployment_compensation", - ), - "tags": ("parity", "local", "district", "income"), - }, - ] - slices: list[PolicyEngineUSHarnessSlice] = [] - for spec in slice_specs: - provider_filters = _build_parity_provider_filters( - base_variables=variables, - base_domain_variables=domain_variables, - base_geo_levels=geo_levels, - spec_variables=spec.get("variables"), - spec_domain_variables=spec.get("domain_variables"), - spec_domain_variable_values=spec.get("domain_variable_values"), - spec_domain_variable_is_null=spec.get("domain_variable_is_null"), - spec_geo_levels=spec.get("geo_levels"), - reform_id=reform_id, - ) - if provider_filters is None: - continue - slices.append( - PolicyEngineUSHarnessSlice( - name=spec["name"], - description=spec["description"], - tags=spec["tags"], - query=TargetQuery(period=period, provider_filters=provider_filters), - ) - ) - return tuple(slices) - - -def _build_parity_provider_filters( - *, - base_variables: tuple[str, ...], - base_domain_variables: tuple[str, ...], - base_geo_levels: tuple[str, ...], - spec_variables: tuple[str, ...] | None, - spec_domain_variables: tuple[str, ...] | None, - spec_domain_variable_values: tuple[str, ...] | None, - spec_domain_variable_is_null: bool | None, - spec_geo_levels: tuple[str, ...] 
| None, - reform_id: int, -) -> dict[str, Any] | None: - resolved_variables = _intersect_optional_filters(base_variables, spec_variables) - resolved_domain_variable_values = _intersect_optional_filters( - base_domain_variables, - spec_domain_variable_values, - ) - resolved_domain_variables = ( - None - if spec_domain_variable_values is not None - else _intersect_optional_filters( - base_domain_variables, - spec_domain_variables, - ) - ) - resolved_geo_levels = _intersect_optional_filters(base_geo_levels, spec_geo_levels) - if ( - resolved_variables == () - or resolved_domain_variables == () - or resolved_domain_variable_values == () - or resolved_geo_levels == () - or (base_domain_variables and spec_domain_variable_is_null is True) - ): - return None - return { - key: value - for key, value in { - "reform_id": reform_id, - "variables": list(resolved_variables) if resolved_variables else None, - "domain_variables": ( - list(resolved_domain_variables) if resolved_domain_variables else None - ), - "domain_variable_values": ( - list(resolved_domain_variable_values) - if resolved_domain_variable_values - else None - ), - "domain_variable_is_null": ( - spec_domain_variable_is_null - if spec_domain_variable_is_null is not None - else None - ), - "geo_levels": list(resolved_geo_levels) if resolved_geo_levels else None, - }.items() - if value is not None - } - - -def _intersect_optional_filters( - base_values: tuple[str, ...], - spec_values: tuple[str, ...] | None, -) -> tuple[str, ...] 
| None: - if not base_values and not spec_values: - return None - if not base_values: - return tuple(spec_values or ()) - if spec_values is None: - return tuple(base_values) - intersection = tuple(value for value in spec_values if value in set(base_values)) - return intersection - - -def _report_to_dict(report: Any) -> dict[str, Any]: - return { - "label": report.label, - "period": report.period, - "summary": { - "supported_target_count": report.supported_target_count, - "unsupported_target_count": len(report.unsupported_targets), - "mean_abs_relative_error": report.mean_abs_relative_error, - "max_abs_relative_error": report.max_abs_relative_error, - }, - "materialized_variables": list(report.materialized_variables), - "materialization_failures": dict(report.materialization_failures), - "unsupported_targets": [_target_to_dict(target) for target in report.unsupported_targets], - "evaluations": [ - { - "target": _target_to_dict(evaluation.target), - "actual_value": evaluation.actual_value, - "absolute_error": evaluation.absolute_error, - "relative_error": evaluation.relative_error, - } - for evaluation in report.evaluations - ], - } - - -def _report_from_dict(payload: dict[str, Any]) -> Any: - return PolicyEngineUSTargetEvaluationReport( - label=payload["label"], - period=payload["period"], - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=_target_from_dict(item["target"]), - actual_value=float(item["actual_value"]), - ) - for item in payload.get("evaluations", []) - ], - unsupported_targets=[ - _target_from_dict(target) - for target in payload.get("unsupported_targets", []) - ], - materialized_variables=tuple(payload.get("materialized_variables", [])), - materialization_failures=dict(payload.get("materialization_failures", {})), - ) - - -def _slice_result_summary( - result: PolicyEngineUSHarnessSliceResult, -) -> dict[str, float | int | bool | dict[str, str] | None]: - candidate = result.comparison.candidate - baseline = result.comparison.baseline - return 
{ - "candidate_supported_target_count": candidate.supported_target_count, - "candidate_unsupported_target_count": len(candidate.unsupported_targets), - "candidate_mean_abs_relative_error": candidate.mean_abs_relative_error, - "candidate_max_abs_relative_error": candidate.max_abs_relative_error, - "candidate_materialization_failures": dict(candidate.materialization_failures), - "baseline_supported_target_count": ( - baseline.supported_target_count if baseline is not None else None - ), - "baseline_unsupported_target_count": ( - len(baseline.unsupported_targets) if baseline is not None else None - ), - "baseline_mean_abs_relative_error": ( - baseline.mean_abs_relative_error if baseline is not None else None - ), - "baseline_max_abs_relative_error": ( - baseline.max_abs_relative_error if baseline is not None else None - ), - "baseline_materialization_failures": ( - dict(baseline.materialization_failures) - if baseline is not None - else {} - ), - "mean_abs_relative_error_delta": result.mean_abs_relative_error_delta, - "candidate_beats_baseline": result.candidate_beats_baseline, - } - - -def _target_to_dict(target: Any) -> dict[str, Any]: - return { - "name": target.name, - "entity": target.entity.value, - "value": float(target.value), - "period": target.period, - "measure": target.measure, - "aggregation": target.aggregation.value, - "filters": [ - { - "feature": target_filter.feature, - "operator": target_filter.operator.value, - "value": target_filter.value, - } - for target_filter in target.filters - ], - "tolerance": target.tolerance, - "source": target.source, - "units": target.units, - "description": target.description, - "metadata": dict(target.metadata), - } - - -def _target_from_dict(payload: dict[str, Any]) -> Any: - from microplex.core import EntityType - from microplex.targets import TargetAggregation, TargetSpec - - return TargetSpec( - name=payload["name"], - entity=EntityType(payload["entity"]), - value=float(payload["value"]), - 
period=payload["period"], - measure=payload.get("measure"), - aggregation=TargetAggregation(payload["aggregation"]), - filters=tuple( - TargetFilter( - item["feature"] if "feature" in item else item["variable"], - FilterOperator(item["operator"]), - item["value"], - ) - for item in payload.get("filters", []) - ), - tolerance=payload.get("tolerance"), - source=payload.get("source"), - units=payload.get("units"), - description=payload.get("description"), - metadata=dict(payload.get("metadata", {})), - ) diff --git a/src/microplex_us/policyengine/takeup.py b/src/microplex_us/policyengine/takeup.py deleted file mode 100644 index 2e958783..00000000 --- a/src/microplex_us/policyengine/takeup.py +++ /dev/null @@ -1,749 +0,0 @@ -"""Deterministic PolicyEngine-US take-up input generation.""" - -from __future__ import annotations - -import logging -import warnings -from collections.abc import Iterable - -import numpy as np -import pandas as pd - -LOGGER = logging.getLogger(__name__) - -DEFAULT_ACA_TAKEUP_RATE = 0.672 -DEFAULT_DC_PTC_TAKEUP_RATE = 0.32 -DEFAULT_EARLY_HEAD_START_TAKEUP_RATE = 0.09 -DEFAULT_EITC_TAKEUP_RATES_BY_CHILDREN = {0: 0.65, 1: 0.86, 2: 0.85, 3: 0.85} -DEFAULT_HEAD_START_TAKEUP_RATE = 0.30 -DEFAULT_MEDICAID_TAKEUP_RATE = 0.93 -DEFAULT_MEDICAID_TAKEUP_RATES_BY_STATE = { - "AK": 0.88, - "AL": 0.92, - "AR": 0.79, - "AZ": 0.95, - "CA": 0.78, - "CO": 0.99, - "CT": 0.89, - "DC": 0.99, - "DE": 0.86, - "FL": 0.98, - "GA": 0.73, - "HI": 0.88, - "IA": 0.84, - "ID": 0.78, - "IL": 0.85, - "IN": 0.99, - "KS": 0.92, - "KY": 0.87, - "LA": 0.79, - "MA": 0.94, - "MD": 0.95, - "ME": 0.92, - "MI": 0.91, - "MN": 0.89, - "MO": 0.89, - "MS": 0.75, - "MT": 0.83, - "NC": 0.94, - "ND": 0.91, - "NE": 0.79, - "NH": 0.84, - "NJ": 0.74, - "NM": 0.84, - "NV": 0.93, - "NY": 0.86, - "OH": 0.82, - "OK": 0.77, - "OR": 0.92, - "PA": 0.64, - "RI": 0.94, - "SC": 0.93, - "SD": 0.88, - "TN": 0.92, - "TX": 0.76, - "UT": 0.53, - "VA": 0.82, - "VT": 0.93, - "WA": 0.98, - "WI": 0.91, - "WV": 0.83, 
- "WY": 0.70, -} -DEFAULT_SNAP_TAKEUP_RATE = 0.82 -DEFAULT_TANF_TAKEUP_RATE = 0.22 -DEFAULT_VOLUNTARY_FILING_RATE = 0.05 -DEFAULT_VOLUNTARY_FILING_RATES = { - "no_children": { - "zero": {"under_65": 0.20, "age_65_plus": 0.05}, - "low": {"under_65": 0.24, "age_65_plus": 0.04}, - "medium": {"under_65": 0.0, "age_65_plus": 0.0}, - "high": {"under_65": 0.0, "age_65_plus": 0.005}, - }, - "with_children": { - "zero": {"under_65": 0.50, "age_65_plus": 0.075}, - "low": {"under_65": 0.60, "age_65_plus": 0.06}, - "medium": {"under_65": 0.0, "age_65_plus": 0.0}, - "high": {"under_65": 0.025, "age_65_plus": 0.0037}, - }, -} -WIC_TAKEUP_CATEGORY_PREGNANT = "PREGNANT" -WIC_TAKEUP_CATEGORY_POSTPARTUM = "POSTPARTUM" -WIC_TAKEUP_CATEGORY_BREASTFEEDING = "BREASTFEEDING" -WIC_TAKEUP_CATEGORY_INFANT = "INFANT" -WIC_TAKEUP_CATEGORY_CHILD = "CHILD" -WIC_TAKEUP_CATEGORY_NONE = "NONE" -DEFAULT_WIC_TAKEUP_RATES = { - WIC_TAKEUP_CATEGORY_PREGNANT: 0.456, - WIC_TAKEUP_CATEGORY_POSTPARTUM: 0.689, - WIC_TAKEUP_CATEGORY_BREASTFEEDING: 0.663, - WIC_TAKEUP_CATEGORY_INFANT: 0.784, - WIC_TAKEUP_CATEGORY_CHILD: 0.460, - WIC_TAKEUP_CATEGORY_NONE: 0.0, -} -DEFAULT_WIC_NUTRITIONAL_RISK_RATES = { - WIC_TAKEUP_CATEGORY_PREGNANT: 0.913, - WIC_TAKEUP_CATEGORY_POSTPARTUM: 0.933, - WIC_TAKEUP_CATEGORY_BREASTFEEDING: 0.889, - WIC_TAKEUP_CATEGORY_INFANT: 0.950, - WIC_TAKEUP_CATEGORY_CHILD: 0.752, - WIC_TAKEUP_CATEGORY_NONE: 0.0, -} -DEFAULT_PREGNANCY_RATE = 0.041 -EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN = "_mp_eitc_child_count_for_takeup" -VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN = "_mp_voluntary_filing_age_head" -VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN = "_mp_voluntary_filing_wage_income" - -STATE_FIPS_TO_ABBR = { - 1: "AL", - 2: "AK", - 4: "AZ", - 5: "AR", - 6: "CA", - 8: "CO", - 9: "CT", - 10: "DE", - 11: "DC", - 12: "FL", - 13: "GA", - 15: "HI", - 16: "ID", - 17: "IL", - 18: "IN", - 19: "IA", - 20: "KS", - 21: "KY", - 22: "LA", - 23: "ME", - 24: "MD", - 25: "MA", - 26: "MI", - 27: "MN", - 28: "MS", - 29: 
"MO", - 30: "MT", - 31: "NE", - 32: "NV", - 33: "NH", - 34: "NJ", - 35: "NM", - 36: "NY", - 37: "NC", - 38: "ND", - 39: "OH", - 40: "OK", - 41: "OR", - 42: "PA", - 44: "RI", - 45: "SC", - 46: "SD", - 47: "TN", - 48: "TX", - 49: "UT", - 50: "VT", - 51: "VA", - 53: "WA", - 54: "WV", - 55: "WI", - 56: "WY", -} - -TAX_UNIT_TAKEUP_FEATURES = frozenset( - { - "takes_up_aca_if_eligible", - "takes_up_dc_ptc", - "takes_up_eitc", - "would_file_taxes_voluntarily", - } -) -PERSON_TAKEUP_FEATURES = frozenset( - { - "takes_up_early_head_start_if_eligible", - "takes_up_head_start_if_eligible", - "takes_up_medicaid_if_eligible", - "takes_up_medicare_if_eligible", - "takes_up_ssi_if_eligible", - "would_claim_wic", - } -) -SPM_UNIT_TAKEUP_FEATURES = frozenset( - { - "takes_up_housing_assistance_if_eligible", - "takes_up_snap_if_eligible", - "takes_up_tanf_if_eligible", - } -) - - -def rerandomize_policyengine_us_takeup_frames( - *, - persons: pd.DataFrame | None, - tax_units: pd.DataFrame | None, - spm_units: pd.DataFrame | None, - features: Iterable[str], - year: int, -) -> tuple[ - pd.DataFrame | None, pd.DataFrame | None, pd.DataFrame | None, tuple[str, ...] 
-]: - """Regenerate requested PolicyEngine-US take-up inputs where supported.""" - requested = tuple(dict.fromkeys(str(feature) for feature in features)) - unsupported: list[str] = [] - - tax_units_out = tax_units.copy() if tax_units is not None else None - tax_features = [ - feature for feature in requested if feature in TAX_UNIT_TAKEUP_FEATURES - ] - if tax_features and tax_units_out is None: - unsupported.extend(tax_features) - elif tax_units_out is not None: - tax_units_out = _rerandomize_tax_unit_takeup_features( - tax_units_out, - features=tax_features, - year=year, - ) - - persons_out = persons.copy() if persons is not None else None - person_features = [ - feature for feature in requested if feature in PERSON_TAKEUP_FEATURES - ] - if person_features and persons_out is None: - unsupported.extend(person_features) - elif persons_out is not None: - persons_out, person_unsupported = _rerandomize_person_takeup_features( - persons_out, - features=person_features, - year=year, - ) - unsupported.extend(person_unsupported) - - spm_units_out = spm_units.copy() if spm_units is not None else None - spm_features = [ - feature for feature in requested if feature in SPM_UNIT_TAKEUP_FEATURES - ] - if spm_features and spm_units_out is None: - unsupported.extend(spm_features) - elif spm_units_out is not None: - spm_units_out, spm_unsupported = _rerandomize_spm_unit_takeup_features( - spm_units_out, - features=spm_features, - year=year, - ) - unsupported.extend(spm_unsupported) - - known = TAX_UNIT_TAKEUP_FEATURES | PERSON_TAKEUP_FEATURES | SPM_UNIT_TAKEUP_FEATURES - unsupported.extend(feature for feature in requested if feature not in known) - return persons_out, tax_units_out, spm_units_out, tuple(dict.fromkeys(unsupported)) - - -def _rerandomize_tax_unit_takeup_features( - tax_units: pd.DataFrame, - *, - features: Iterable[str], - year: int, -) -> pd.DataFrame: - requested = set(features) - result = tax_units.copy() - if "takes_up_aca_if_eligible" in requested: - result = 
_set_scalar_takeup( - result, - column="takes_up_aca_if_eligible", - rate=_load_microplex_takeup_rate("aca", year), - ) - if "takes_up_dc_ptc" in requested: - result = _set_scalar_takeup( - result, - column="takes_up_dc_ptc", - rate=_load_microplex_takeup_rate("dc_ptc", year), - ) - if "takes_up_eitc" in requested: - result = _set_eitc_takeup(result, year=year) - if "would_file_taxes_voluntarily" in requested: - if "takes_up_eitc" not in result.columns: - result = _set_eitc_takeup(result, year=year) - result = _set_voluntary_filing(result, year=year) - return result - - -def _rerandomize_person_takeup_features( - persons: pd.DataFrame, - *, - features: Iterable[str], - year: int, -) -> tuple[pd.DataFrame, tuple[str, ...]]: - requested = set(features) - unsupported: list[str] = [] - result = persons.copy() - if "takes_up_medicaid_if_eligible" in requested: - rates = _load_microplex_medicaid_takeup_rates(year) - states = _person_state_abbreviation(result) - takeup_rate = states.map( - lambda state: rates.get(state, DEFAULT_MEDICAID_TAKEUP_RATE) - ) - rng = _microplex_seeded_rng("takes_up_medicaid_if_eligible") - result["takes_up_medicaid_if_eligible"] = rng.random( - len(result) - ) < takeup_rate.to_numpy(dtype=float) - if "takes_up_head_start_if_eligible" in requested: - result = _set_scalar_takeup( - result, - column="takes_up_head_start_if_eligible", - rate=_load_microplex_takeup_rate("head_start", year), - ) - if "takes_up_early_head_start_if_eligible" in requested: - result = _set_scalar_takeup( - result, - column="takes_up_early_head_start_if_eligible", - rate=_load_microplex_takeup_rate("early_head_start", year), - ) - if "takes_up_ssi_if_eligible" in requested: - result, supported = _set_ssi_takeup_if_available(result) - if not supported: - unsupported.append("takes_up_ssi_if_eligible") - if "takes_up_medicare_if_eligible" in requested: - if "takes_up_medicare_if_eligible" in result.columns: - result["takes_up_medicare_if_eligible"] = _normal_bool_series( - 
result["takes_up_medicare_if_eligible"], - index=result.index, - ) - else: - unsupported.append("takes_up_medicare_if_eligible") - if "would_claim_wic" in requested: - result = _set_wic_takeup_inputs(result, year=year) - return result, tuple(unsupported) - - -def _rerandomize_spm_unit_takeup_features( - spm_units: pd.DataFrame, - *, - features: Iterable[str], - year: int, -) -> tuple[pd.DataFrame, tuple[str, ...]]: - requested = set(features) - unsupported: list[str] = [] - result = spm_units.copy() - if "takes_up_snap_if_eligible" in requested: - result = _set_scalar_takeup( - result, - column="takes_up_snap_if_eligible", - rate=_load_microplex_takeup_rate("snap", year), - ) - if "takes_up_tanf_if_eligible" in requested: - result = _set_scalar_takeup( - result, - column="takes_up_tanf_if_eligible", - rate=_load_microplex_takeup_rate("tanf", year), - ) - if "takes_up_housing_assistance_if_eligible" in requested: - if "takes_up_housing_assistance_if_eligible" in result.columns: - result["takes_up_housing_assistance_if_eligible"] = _normal_bool_series( - result["takes_up_housing_assistance_if_eligible"], - index=result.index, - ) - else: - unsupported.append("takes_up_housing_assistance_if_eligible") - return result, tuple(unsupported) - - -def _set_scalar_takeup( - frame: pd.DataFrame, - *, - column: str, - rate: float, -) -> pd.DataFrame: - result = frame.copy() - rng = _microplex_seeded_rng(column) - result[column] = rng.random(len(result)) < float(rate) - return result - - -def _set_eitc_takeup( - tax_units: pd.DataFrame, - *, - year: int, -) -> pd.DataFrame: - result = tax_units.copy() - rates = _load_microplex_eitc_takeup_rates(year) - child_count = _tax_unit_child_count_for_takeup(result) - takeup_rate = child_count.map(lambda count: rates.get(int(count), 0.85)) - rng = _microplex_seeded_rng("takes_up_eitc") - result["takes_up_eitc"] = rng.random(len(result)) < takeup_rate.to_numpy( - dtype=float - ) - return result - - -def _set_voluntary_filing( - tax_units: 
pd.DataFrame, - *, - year: int, -) -> pd.DataFrame: - result = tax_units.copy() - rates = _load_microplex_voluntary_filing_rates(year) - takes_up_eitc = _normal_bool_series( - result.get("takes_up_eitc", False), - index=result.index, - ) - child_count = _tax_unit_child_count_for_takeup(result) - wage_income = pd.to_numeric( - result.get( - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - pd.Series(0.0, index=result.index), - ), - errors="coerce", - ).fillna(0.0) - age_head = pd.to_numeric( - result.get( - VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - pd.Series(0.0, index=result.index), - ), - errors="coerce", - ).fillna(0.0) - takeup_rate = _voluntary_filing_rate_by_tax_unit( - rates, - child_count=child_count, - wage_income=wage_income, - age_head=age_head, - ) - rng = _microplex_seeded_rng("would_file_taxes_voluntarily") - result["would_file_taxes_voluntarily"] = (~takes_up_eitc.to_numpy(dtype=bool)) & ( - rng.random(len(result)) < takeup_rate.to_numpy(dtype=float) - ) - return result.drop( - columns=[ - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN, - VOLUNTARY_FILING_AGE_HEAD_HELPER_COLUMN, - VOLUNTARY_FILING_WAGE_INCOME_HELPER_COLUMN, - ], - errors="ignore", - ) - - -def _tax_unit_child_count_for_takeup(tax_units: pd.DataFrame) -> pd.Series: - child_count_column = ( - EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN - if EITC_TAKEUP_CHILD_COUNT_HELPER_COLUMN in tax_units.columns - else "n_dependents" - ) - raw_child_count = ( - tax_units[child_count_column] - if child_count_column in tax_units.columns - else pd.Series(0, index=tax_units.index) - ) - return ( - pd.to_numeric(raw_child_count, errors="coerce") - .fillna(0) - .clip(lower=0, upper=3) - .astype(int) - ) - - -def _voluntary_filing_rate_by_tax_unit( - rates: dict, - *, - child_count: pd.Series, - wage_income: pd.Series, - age_head: pd.Series, -) -> pd.Series: - children_bin = np.where( - child_count.to_numpy(dtype=int) > 0, - "with_children", - "no_children", - ) - wage_values = wage_income.to_numpy(dtype=float) - wage_bin = 
np.select( - [wage_values <= 0.0, wage_values < 15_000.0, wage_values < 30_000.0], - ["zero", "low", "medium"], - default="high", - ) - age_bin = np.where( - age_head.to_numpy(dtype=float) >= 65.0, "age_65_plus", "under_65" - ) - values = [ - rates.get(children, {}).get(wage, {}).get(age, DEFAULT_VOLUNTARY_FILING_RATE) - for children, wage, age in zip(children_bin, wage_bin, age_bin, strict=True) - ] - return pd.Series(values, index=child_count.index, dtype=float) - - -def _set_ssi_takeup_if_available(persons: pd.DataFrame) -> tuple[pd.DataFrame, bool]: - result = persons.copy() - if "takes_up_ssi_if_eligible" in result.columns: - result["takes_up_ssi_if_eligible"] = _normal_bool_series( - result["takes_up_ssi_if_eligible"], - index=result.index, - ) - return result, True - if "ssi_reported" in result.columns: - result["takes_up_ssi_if_eligible"] = _nonzero_series(result["ssi_reported"]) - return result, True - if "ssi" in result.columns: - result["takes_up_ssi_if_eligible"] = _nonzero_series(result["ssi"]) - return result, True - return result, False - - -def _set_wic_takeup_inputs( - persons: pd.DataFrame, - *, - year: int, -) -> pd.DataFrame: - result = persons.copy() - if "is_pregnant" in result.columns: - result["is_pregnant"] = _normal_bool_series( - result["is_pregnant"], index=result.index - ) - else: - result = _set_pregnancy_inputs(result, year=year) - - category = _wic_category_for_takeup(result) - claim_rates = _load_microplex_wic_takeup_rates(year) - claim_rate = category.map(lambda value: claim_rates.get(str(value), 0.0)).fillna( - 0.0 - ) - rng = _microplex_seeded_rng("would_claim_wic") - result["would_claim_wic"] = rng.random(len(result)) < claim_rate.to_numpy( - dtype=float - ) - - risk_rates = _load_microplex_wic_nutritional_risk_rates(year) - risk_rate = category.map(lambda value: risk_rates.get(str(value), 0.0)).fillna(0.0) - receives_wic = _normal_bool_series( - result.get("receives_wic", False), - index=result.index, - ) - rng = 
_microplex_seeded_rng("is_wic_at_nutritional_risk") - result["is_wic_at_nutritional_risk"] = receives_wic | ( - rng.random(len(result)) < risk_rate.to_numpy(dtype=float) - ) - return result - - -def _set_pregnancy_inputs( - persons: pd.DataFrame, - *, - year: int, -) -> pd.DataFrame: - result = persons.copy() - index = result.index - age = pd.to_numeric( - result.get("age", pd.Series(0.0, index=index)), - errors="coerce", - ).fillna(0.0) - if "is_female" in result.columns: - female = _normal_bool_series(result["is_female"], index=index) - elif "sex" in result.columns: - female = ( - pd.to_numeric(result["sex"], errors="coerce").fillna(0).astype(int).eq(2) - ) - else: - female = pd.Series(False, index=index) - - rates = _load_microplex_pregnancy_rates(year) - states = _person_state_abbreviation(result) - pregnancy_rate = states.map( - lambda state: rates.get(str(state).upper(), DEFAULT_PREGNANCY_RATE) - ).fillna(DEFAULT_PREGNANCY_RATE) - eligible = female & age.ge(15.0) & age.le(44.0) - rng = _microplex_seeded_rng("is_pregnant") - result["is_pregnant"] = eligible.to_numpy(dtype=bool) & ( - rng.random(len(result)) < pregnancy_rate.to_numpy(dtype=float) - ) - return result - - -def _wic_category_for_takeup(persons: pd.DataFrame) -> pd.Series: - index = persons.index - age = pd.to_numeric( - persons.get("age", pd.Series(0.0, index=index)), - errors="coerce", - ).fillna(0.0) - pregnant = _normal_bool_series(persons.get("is_pregnant", False), index=index) - breastfeeding = _normal_bool_series( - persons.get("is_breastfeeding", False), - index=index, - ) - if "is_female" in persons.columns: - female = _normal_bool_series(persons["is_female"], index=index) - elif "sex" in persons.columns: - female = ( - pd.to_numeric(persons["sex"], errors="coerce").fillna(0).astype(int).eq(2) - ) - else: - female = pd.Series(False, index=index) - - own_children = pd.to_numeric( - persons.get("own_children_in_household", pd.Series(0, index=index)), - errors="coerce", - ).fillna(0.0) - 
mother = breastfeeding | (female & own_children.gt(0)) - - group_column = next( - ( - column - for column in ("family_id", "spm_unit_id", "household_id") - if column in persons.columns - ), - None, - ) - if group_column is None: - min_age_group = age - else: - group_keys = persons[group_column].where( - persons[group_column].notna(), - pd.Series(np.arange(len(persons)), index=index), - ) - min_age_group = age.groupby(group_keys, sort=False).transform("min") - - category = np.select( - [ - pregnant.to_numpy(dtype=bool), - ( - mother.to_numpy(dtype=bool) - & breastfeeding.to_numpy(dtype=bool) - & min_age_group.lt(1.0).to_numpy(dtype=bool) - ), - (mother.to_numpy(dtype=bool) & min_age_group.lt(0.5).to_numpy(dtype=bool)), - age.lt(1.0).to_numpy(dtype=bool), - age.lt(5.0).to_numpy(dtype=bool), - ], - [ - WIC_TAKEUP_CATEGORY_PREGNANT, - WIC_TAKEUP_CATEGORY_BREASTFEEDING, - WIC_TAKEUP_CATEGORY_POSTPARTUM, - WIC_TAKEUP_CATEGORY_INFANT, - WIC_TAKEUP_CATEGORY_CHILD, - ], - default=WIC_TAKEUP_CATEGORY_NONE, - ) - return pd.Series(category, index=index, dtype="string") - - -def _person_state_abbreviation(persons: pd.DataFrame) -> pd.Series: - if "state" in persons.columns: - state = persons["state"].astype("string").str.upper() - known = set(STATE_FIPS_TO_ABBR.values()) - return state.where(state.isin(known), "CA").fillna("CA") - if "state_code_str" in persons.columns: - state = persons["state_code_str"].astype("string").str.upper() - known = set(STATE_FIPS_TO_ABBR.values()) - return state.where(state.isin(known), "CA").fillna("CA") - if "state_fips" in persons.columns: - state_fips = ( - pd.to_numeric(persons["state_fips"], errors="coerce").fillna(6).astype(int) - ) - return state_fips.map(lambda value: STATE_FIPS_TO_ABBR.get(int(value), "CA")) - return pd.Series("CA", index=persons.index, dtype="string") - - -def _nonzero_series(value: pd.Series) -> pd.Series: - return pd.to_numeric(value, errors="coerce").fillna(0.0).ne(0.0) - - -def _normal_bool_series(value, *, index: 
pd.Index) -> pd.Series: - if isinstance(value, pd.Series): - series = value.reindex(index) - else: - series = pd.Series(value, index=index) - return pd.to_numeric(series, errors="coerce").fillna(0.0).ne(0.0).astype(bool) - - -def _stable_string_hash(value: str) -> np.uint64: - """Deterministic string hash for reproducible MP stochastic inputs.""" - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning) - hashed = np.uint64(0) - for byte in value.encode("utf-8"): - hashed = hashed * np.uint64(31) + np.uint64(byte) - hashed = hashed ^ (hashed >> np.uint64(33)) - hashed = hashed * np.uint64(0xFF51AFD7ED558CCD) - hashed = hashed ^ (hashed >> np.uint64(33)) - return hashed - - -def _microplex_seeded_rng( - variable_name: str, *, salt: str | None = None -) -> np.random.Generator: - key = variable_name if salt is None else f"{variable_name}:{salt}" - seed = int(_stable_string_hash(key)) % (2**63) - return np.random.default_rng(seed=seed) - - -def _load_microplex_takeup_rate(variable_name: str, year: int) -> float: - """Load MP-owned scalar take-up assumptions for PE dataset inputs.""" - if variable_name == "aca": - return DEFAULT_ACA_TAKEUP_RATE - if variable_name == "dc_ptc": - return DEFAULT_DC_PTC_TAKEUP_RATE - if variable_name == "early_head_start": - return DEFAULT_EARLY_HEAD_START_TAKEUP_RATE - if variable_name == "head_start": - return 0.40 if year <= 2020 else DEFAULT_HEAD_START_TAKEUP_RATE - if variable_name == "snap": - return DEFAULT_SNAP_TAKEUP_RATE - if variable_name == "tanf": - return DEFAULT_TANF_TAKEUP_RATE - raise KeyError(f"Unknown Microplex take-up rate: {variable_name!r}") - - -def _load_microplex_medicaid_takeup_rates(year: int) -> dict[str, float]: - _ = year - return dict(DEFAULT_MEDICAID_TAKEUP_RATES_BY_STATE) - - -def _load_microplex_eitc_takeup_rates(year: int) -> dict[int, float]: - _ = year - return dict(DEFAULT_EITC_TAKEUP_RATES_BY_CHILDREN) - - -def 
_load_microplex_voluntary_filing_rates(year: int) -> dict: - _ = year - return { - children: {wage: dict(age_rates) for wage, age_rates in wage_rates.items()} - for children, wage_rates in DEFAULT_VOLUNTARY_FILING_RATES.items() - } - - -def _load_microplex_wic_takeup_rates(year: int) -> dict[str, float]: - _ = year - return dict(DEFAULT_WIC_TAKEUP_RATES) - - -def _load_microplex_wic_nutritional_risk_rates(year: int) -> dict[str, float]: - _ = year - return dict(DEFAULT_WIC_NUTRITIONAL_RISK_RATES) - - -def _load_microplex_pregnancy_rates(year: int) -> dict[str, float]: - _ = year - try: - from policyengine_us_data.db.etl_pregnancy import get_state_pregnancy_rates - - rates = get_state_pregnancy_rates() - except Exception: - LOGGER.warning( - "Failed to load state pregnancy rates; using national fallback", - exc_info=True, - ) - return {} - - return {str(state).upper(): float(rate) for state, rate in rates.items()} diff --git a/src/microplex_us/policyengine/target_profiles.py b/src/microplex_us/policyengine/target_profiles.py deleted file mode 100644 index bb7afa54..00000000 --- a/src/microplex_us/policyengine/target_profiles.py +++ /dev/null @@ -1,1105 +0,0 @@ -"""Named target-cell profiles for PolicyEngine US target selection.""" - -from __future__ import annotations - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class PolicyEngineUSTargetCell: - """One exact target cell from the PolicyEngine US target DB.""" - - variable: str - geo_level: str | None = None - domain_variable: str | None = None - geographic_id: str | None = None - - def to_provider_filter(self) -> dict[str, str | None]: - return { - "variable": self.variable, - "geo_level": self.geo_level, - "domain_variable": self.domain_variable, - "geographic_id": self.geographic_id, - } - - -PolicyEngineUSTargetCellKey = tuple[str, str | None, str | None, str | None] - - -def _target_cell_key(cell: PolicyEngineUSTargetCell) -> PolicyEngineUSTargetCellKey: - return ( - cell.variable, - 
cell.geo_level, - cell.domain_variable, - cell.geographic_id, - ) - - -PE_NATIVE_BROAD_TARGET_CELLS: tuple[PolicyEngineUSTargetCell, ...] = ( - PolicyEngineUSTargetCell( - "aca_ptc", geo_level="national", domain_variable="aca_ptc" - ), - PolicyEngineUSTargetCell("adjusted_gross_income", geo_level="national"), - PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="national", - domain_variable="adjusted_gross_income", - ), - PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="national", - domain_variable="adjusted_gross_income,filing_status,income_tax_before_credits", - ), - PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="national", - domain_variable="adjusted_gross_income,income_tax_before_credits", - ), - PolicyEngineUSTargetCell("alimony_expense", geo_level="national"), - PolicyEngineUSTargetCell("alimony_income", geo_level="national"), - PolicyEngineUSTargetCell("charitable_deduction", geo_level="national"), - PolicyEngineUSTargetCell("childcare_expenses", geo_level="national"), - PolicyEngineUSTargetCell("child_support_expense", geo_level="national"), - PolicyEngineUSTargetCell("child_support_received", geo_level="national"), - PolicyEngineUSTargetCell("deductible_mortgage_interest", geo_level="national"), - PolicyEngineUSTargetCell("dividend_income", geo_level="national"), - PolicyEngineUSTargetCell( - "dividend_income", geo_level="national", domain_variable="dividend_income" - ), - PolicyEngineUSTargetCell("employment_income_before_lsr", geo_level="national"), - PolicyEngineUSTargetCell( - "employment_income", geo_level="national", domain_variable="employment_income" - ), - PolicyEngineUSTargetCell("eitc", geo_level="national"), - PolicyEngineUSTargetCell( - "eitc", geo_level="national", domain_variable="eitc_child_count" - ), - PolicyEngineUSTargetCell( - "eitc", - geo_level="national", - domain_variable="adjusted_gross_income,eitc,eitc_child_count", - ), - PolicyEngineUSTargetCell( - 
"health_insurance_premiums_without_medicare_part_b", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "household_count", - geo_level="national", - domain_variable="spm_unit_energy_subsidy_reported", - ), - PolicyEngineUSTargetCell( - "income_tax", geo_level="national", domain_variable="income_tax" - ), - PolicyEngineUSTargetCell( - "income_tax_before_credits", - geo_level="national", - domain_variable="income_tax_before_credits", - ), - PolicyEngineUSTargetCell("income_tax_positive", geo_level="national"), - PolicyEngineUSTargetCell("investment_interest_expense", geo_level="national"), - PolicyEngineUSTargetCell("interest_deduction", geo_level="national"), - PolicyEngineUSTargetCell("long_term_capital_gains", geo_level="national"), - PolicyEngineUSTargetCell("medicaid", geo_level="national"), - PolicyEngineUSTargetCell("medical_expense_deduction", geo_level="national"), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable="medical_expense_deduction", - ), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell("medicare_part_b_premiums", geo_level="national"), - PolicyEngineUSTargetCell( - "net_capital_gains", geo_level="national", domain_variable="net_capital_gains" - ), - PolicyEngineUSTargetCell("net_worth", geo_level="national"), - PolicyEngineUSTargetCell( - "non_refundable_ctc", - geo_level="national", - domain_variable="adjusted_gross_income,non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "non_refundable_ctc", - geo_level="national", - domain_variable="non_refundable_ctc", - ), - PolicyEngineUSTargetCell("other_medical_expenses", geo_level="national"), - PolicyEngineUSTargetCell("over_the_counter_health_expenses", geo_level="national"), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="aca_ptc" - ), - PolicyEngineUSTargetCell( - 
"person_count", geo_level="national", domain_variable="age" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="medicaid" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="snap" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="ssi" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="ssi,is_ssi_aged" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="ssi,is_blind" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="ssi,is_ssi_disabled" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="national", domain_variable="ssn_card_type" - ), - PolicyEngineUSTargetCell( - "qualified_business_income_deduction", geo_level="national" - ), - PolicyEngineUSTargetCell( - "qualified_business_income_deduction", - geo_level="national", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "qualified_dividend_income", - geo_level="national", - domain_variable="qualified_dividend_income", - ), - PolicyEngineUSTargetCell("real_estate_taxes", geo_level="national"), - PolicyEngineUSTargetCell( - "real_estate_taxes", - geo_level="national", - domain_variable="real_estate_taxes", - ), - PolicyEngineUSTargetCell( - "real_estate_taxes", - geo_level="national", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "refundable_ctc", - geo_level="national", - domain_variable="adjusted_gross_income,refundable_ctc", - ), - PolicyEngineUSTargetCell( - "refundable_ctc", geo_level="national", domain_variable="refundable_ctc" - ), - PolicyEngineUSTargetCell("rent", geo_level="national"), - PolicyEngineUSTargetCell("rental_income", geo_level="national"), - PolicyEngineUSTargetCell( - "rental_income", geo_level="national", domain_variable="rental_income" - ), - 
PolicyEngineUSTargetCell("roth_401k_contributions", geo_level="national"), - PolicyEngineUSTargetCell("roth_ira_contributions", geo_level="national"), - PolicyEngineUSTargetCell("salt", geo_level="national", domain_variable="salt"), - PolicyEngineUSTargetCell( - "salt", geo_level="national", domain_variable="salt,tax_unit_itemizes" - ), - PolicyEngineUSTargetCell("salt_deduction", geo_level="national"), - PolicyEngineUSTargetCell("salt_refund_income", geo_level="national"), - PolicyEngineUSTargetCell( - "self_employed_pension_contribution_ald", geo_level="national" - ), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="national", - domain_variable="self_employment_income", - ), - PolicyEngineUSTargetCell("short_term_capital_gains", geo_level="national"), - PolicyEngineUSTargetCell( - "household_count", geo_level="national", domain_variable="snap" - ), - PolicyEngineUSTargetCell("snap", geo_level="national"), - PolicyEngineUSTargetCell("social_security", geo_level="national"), - PolicyEngineUSTargetCell("social_security_dependents", geo_level="national"), - PolicyEngineUSTargetCell("social_security_disability", geo_level="national"), - PolicyEngineUSTargetCell("social_security_retirement", geo_level="national"), - PolicyEngineUSTargetCell("social_security_survivors", geo_level="national"), - PolicyEngineUSTargetCell("spm_unit_capped_housing_subsidy", geo_level="national"), - PolicyEngineUSTargetCell( - "spm_unit_capped_work_childcare_expenses", geo_level="national" - ), - PolicyEngineUSTargetCell( - "spm_unit_count", geo_level="national", domain_variable="tanf" - ), - PolicyEngineUSTargetCell("ssi", geo_level="national"), - PolicyEngineUSTargetCell( - "ssi", geo_level="national", domain_variable="ssi,is_ssi_aged" - ), - PolicyEngineUSTargetCell( - "ssi", geo_level="national", domain_variable="ssi,is_blind" - ), - PolicyEngineUSTargetCell( - "ssi", 
geo_level="national", domain_variable="ssi,is_ssi_disabled" - ), - PolicyEngineUSTargetCell("tanf", geo_level="national"), - PolicyEngineUSTargetCell("tanf", geo_level="national", domain_variable="tanf"), - PolicyEngineUSTargetCell( - "tax_exempt_interest_income", - geo_level="national", - domain_variable="tax_exempt_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="national", domain_variable="aca_ptc" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,filing_status,income_tax_before_credits", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,income_tax_before_credits", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="dividend_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="eitc_child_count", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,eitc,eitc_child_count", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="national", domain_variable="income_tax" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="income_tax_before_credits", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="investment_interest_expense", - ), - 
PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="long_term_capital_gains", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="medical_expense_deduction", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="net_capital_gains", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="qualified_dividend_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="real_estate_taxes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="national", domain_variable="rental_income" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="national", domain_variable="salt" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="salt_refund_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="salt,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="self_employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="short_term_capital_gains", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", 
- domain_variable="tax_exempt_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="tax_unit_partnership_s_corp_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="taxable_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="taxable_ira_distributions", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="taxable_pension_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="taxable_social_security", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="total_self_employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="unemployment_compensation", - ), - PolicyEngineUSTargetCell( - "tax_unit_partnership_s_corp_income", - geo_level="national", - domain_variable="tax_unit_partnership_s_corp_income", - ), - PolicyEngineUSTargetCell( - "taxable_interest_income", - geo_level="national", - domain_variable="taxable_interest_income", - ), - PolicyEngineUSTargetCell( - "taxable_ira_distributions", - geo_level="national", - domain_variable="taxable_ira_distributions", - ), - PolicyEngineUSTargetCell( - "taxable_pension_income", - geo_level="national", - domain_variable="taxable_pension_income", - ), - PolicyEngineUSTargetCell( - "taxable_social_security", - geo_level="national", - domain_variable="taxable_social_security", - ), - PolicyEngineUSTargetCell("tip_income", geo_level="national"), - PolicyEngineUSTargetCell( - "total_self_employment_income", - geo_level="national", - domain_variable="total_self_employment_income", - ), - PolicyEngineUSTargetCell("traditional_401k_contributions", geo_level="national"), - PolicyEngineUSTargetCell("traditional_ira_contributions", geo_level="national"), - 
PolicyEngineUSTargetCell("unemployment_compensation", geo_level="national"), - PolicyEngineUSTargetCell( - "unemployment_compensation", - geo_level="national", - domain_variable="unemployment_compensation", - ), - PolicyEngineUSTargetCell("aca_ptc", geo_level="state", domain_variable=None), - PolicyEngineUSTargetCell("aca_ptc", geo_level="state", domain_variable="aca_ptc"), - PolicyEngineUSTargetCell("adjusted_gross_income", geo_level="state"), - PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="state", - domain_variable="adjusted_gross_income", - ), - PolicyEngineUSTargetCell( - "dividend_income", geo_level="state", domain_variable="dividend_income" - ), - PolicyEngineUSTargetCell("employment_income_before_lsr", geo_level="state"), - PolicyEngineUSTargetCell( - "employment_income", geo_level="state", domain_variable="employment_income" - ), - PolicyEngineUSTargetCell( - "eitc", geo_level="state", domain_variable="eitc_child_count" - ), - PolicyEngineUSTargetCell( - "household_count", geo_level="state", domain_variable="snap" - ), - PolicyEngineUSTargetCell( - "income_tax", geo_level="state", domain_variable="income_tax" - ), - PolicyEngineUSTargetCell( - "income_tax_before_credits", - geo_level="state", - domain_variable="income_tax_before_credits", - ), - PolicyEngineUSTargetCell("investment_interest_expense", geo_level="state"), - PolicyEngineUSTargetCell("long_term_capital_gains", geo_level="state"), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="state", - domain_variable="medical_expense_deduction", - ), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="state", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "net_capital_gains", geo_level="state", domain_variable="net_capital_gains" - ), - PolicyEngineUSTargetCell( - "non_refundable_ctc", - geo_level="state", - domain_variable="non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - 
"person_count", geo_level="state", domain_variable="aca_ptc" - ), - PolicyEngineUSTargetCell( - "person_count", - geo_level="state", - domain_variable="aca_ptc,is_aca_ptc_eligible", - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="adjusted_gross_income" - ), - PolicyEngineUSTargetCell("person_count", geo_level="state", domain_variable="age"), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="is_pregnant" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="medicaid_enrolled" - ), - PolicyEngineUSTargetCell("person_count", geo_level="state", domain_variable="snap"), - PolicyEngineUSTargetCell("person_count", geo_level="state", domain_variable="ssi"), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="ssi,is_ssi_aged" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="ssi,is_blind" - ), - PolicyEngineUSTargetCell( - "person_count", geo_level="state", domain_variable="ssi,is_ssi_disabled" - ), - PolicyEngineUSTargetCell( - "qualified_business_income_deduction", - geo_level="state", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "qualified_dividend_income", - geo_level="state", - domain_variable="qualified_dividend_income", - ), - PolicyEngineUSTargetCell( - "real_estate_taxes", geo_level="state", domain_variable="real_estate_taxes" - ), - PolicyEngineUSTargetCell( - "real_estate_taxes", - geo_level="state", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "refundable_ctc", geo_level="state", domain_variable="refundable_ctc" - ), - PolicyEngineUSTargetCell( - "rental_income", geo_level="state", domain_variable="rental_income" - ), - PolicyEngineUSTargetCell("salt", geo_level="state", domain_variable="salt"), - PolicyEngineUSTargetCell( - "salt", geo_level="state", domain_variable="salt,tax_unit_itemizes" - ), - 
PolicyEngineUSTargetCell("salt_refund_income", geo_level="state"), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="state", - ), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="state", - domain_variable="self_employment_income", - ), - PolicyEngineUSTargetCell("short_term_capital_gains", geo_level="state"), - PolicyEngineUSTargetCell("snap", geo_level="state", domain_variable="snap"), - PolicyEngineUSTargetCell( - "spm_unit_count", geo_level="state", domain_variable="tanf" - ), - PolicyEngineUSTargetCell("ssi", geo_level="state"), - PolicyEngineUSTargetCell("state_income_tax", geo_level="state"), - PolicyEngineUSTargetCell("tanf", geo_level="state", domain_variable="tanf"), - PolicyEngineUSTargetCell( - "tax_exempt_interest_income", - geo_level="state", - domain_variable="tax_exempt_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="aca_ptc" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="adjusted_gross_income" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="dividend_income" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="employment_income" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="eitc_child_count" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="income_tax" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="income_tax_before_credits", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="investment_interest_expense", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="long_term_capital_gains", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="medical_expense_deduction", - ), - 
PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="net_capital_gains" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="non_refundable_ctc" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="qualified_dividend_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="real_estate_taxes" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="refundable_ctc" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="rental_income" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="salt" - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="salt_refund_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="salt,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="selected_marketplace_plan_benchmark_ratio,used_aca_ptc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="self_employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="short_term_capital_gains", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="tax_exempt_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - 
domain_variable="tax_unit_partnership_s_corp_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="taxable_interest_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="taxable_ira_distributions", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="taxable_pension_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="taxable_social_security", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="total_self_employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="state", - domain_variable="unemployment_compensation", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", geo_level="state", domain_variable="used_aca_ptc" - ), - PolicyEngineUSTargetCell( - "tax_unit_partnership_s_corp_income", - geo_level="state", - domain_variable="tax_unit_partnership_s_corp_income", - ), - PolicyEngineUSTargetCell( - "taxable_interest_income", - geo_level="state", - domain_variable="taxable_interest_income", - ), - PolicyEngineUSTargetCell( - "taxable_ira_distributions", - geo_level="state", - domain_variable="taxable_ira_distributions", - ), - PolicyEngineUSTargetCell( - "taxable_pension_income", - geo_level="state", - domain_variable="taxable_pension_income", - ), - PolicyEngineUSTargetCell( - "taxable_social_security", - geo_level="state", - domain_variable="taxable_social_security", - ), - PolicyEngineUSTargetCell( - "total_self_employment_income", - geo_level="state", - domain_variable="total_self_employment_income", - ), - PolicyEngineUSTargetCell( - "unemployment_compensation", - geo_level="state", - domain_variable="unemployment_compensation", - ), -) - -_PE_NATIVE_BROAD_NO_STATE_ACA_EXCLUDED_CELLS = frozenset( - { - ("aca_ptc", "state", None, None), - ("aca_ptc", "state", "aca_ptc", None), - ("person_count", "state", 
"aca_ptc", None), - ("person_count", "state", "aca_ptc,is_aca_ptc_eligible", None), - ("tax_unit_count", "state", "aca_ptc", None), - ( - "tax_unit_count", - "state", - "selected_marketplace_plan_benchmark_ratio,used_aca_ptc", - None, - ), - ("tax_unit_count", "state", "used_aca_ptc", None), - } -) - -PE_NATIVE_BROAD_NO_STATE_ACA_TARGET_CELLS: tuple[PolicyEngineUSTargetCell, ...] = tuple( - cell - for cell in PE_NATIVE_BROAD_TARGET_CELLS - if _target_cell_key(cell) not in _PE_NATIVE_BROAD_NO_STATE_ACA_EXCLUDED_CELLS -) - -PE_NATIVE_BROAD_SOURCE_BACKED_EXCLUDED_CELL_REASONS: dict[ - PolicyEngineUSTargetCellKey, - str, -] = { - ( - "adjusted_gross_income", - "national", - "adjusted_gross_income,filing_status,income_tax_before_credits", - None, - ): ( - "SOI source packages currently loaded by Arch do not publish adjusted " - "gross income jointly by AGI band, filing status, and returns with " - "positive income tax before credits." - ), - ( - "adjusted_gross_income", - "national", - "adjusted_gross_income,income_tax_before_credits", - None, - ): ( - "SOI source packages currently loaded by Arch publish AGI bands and " - "income-tax-before-credits returns separately, not AGI amounts " - "restricted to returns with positive income tax before credits." - ), - ( - "tax_unit_count", - "national", - "adjusted_gross_income,filing_status,income_tax_before_credits", - None, - ): ( - "SOI Historic Table 2 does not provide the full AGI by filing-status " - "by positive-income-tax-before-credits joint count required by this " - "PolicyEngine cell." - ), - ( - "person_count", - "national", - "ssn_card_type", - None, - ): ( - "PolicyEngine ssn_card_type is a modeled legal-status input; no " - "accepted primary aggregate source mapping is encoded for Arch." - ), - ( - "person_count", - "state", - "is_pregnant", - None, - ): ( - "The PolicyEngine cell is a pregnancy stock by state; live births are " - "a flow and are not a defensible direct source fact for this target." 
- ), - ( - "person_count", - "state", - "adjusted_gross_income", - None, - ): ( - "Loaded SOI state AGI sources provide return counts and AGI amounts, " - "not filer-person counts by AGI band." - ), - ( - "child_support_expense", - "national", - None, - None, - ): ( - "No accepted primary source mapping is encoded for this " - "survey/model-input expense variable." - ), - ( - "child_support_received", - "national", - None, - None, - ): ( - "No accepted primary source mapping is encoded for this " - "survey/model-input receipt variable." - ), - ( - "childcare_expenses", - "national", - None, - None, - ): ( - "IRS child-care credit expenses and W-2 dependent-care benefits are " - "narrower tax concepts than PolicyEngine childcare_expenses, so they " - "are not treated as source-equivalent." - ), - ( - "health_insurance_premiums_without_medicare_part_b", - "national", - None, - None, - ): ( - "This premium component is a modeled/survey input; no accepted primary " - "aggregate source mapping is encoded for Arch." - ), - ( - "other_medical_expenses", - "national", - None, - None, - ): ( - "This out-of-pocket medical expense component is a survey/model input " - "without an accepted primary aggregate source mapping." - ), - ( - "over_the_counter_health_expenses", - "national", - None, - None, - ): ( - "This out-of-pocket medical expense component is a survey/model input " - "without an accepted primary aggregate source mapping." - ), - ( - "rent", - "national", - None, - None, - ): ( - "PolicyEngine rent is a household survey/model input; ACS rent tables " - "do not provide a direct aggregate source fact for this exact variable." - ), - ( - "salt", - "national", - "salt", - None, - ): ( - "SOI Table 2.1 itemized deduction sources cover itemizers; " - "PolicyEngine salt can be positive outside the itemizer domain." 
- ), - ( - "tax_unit_count", - "national", - "salt", - None, - ): ( - "SOI Table 2.1 publishes separate component counts, not the union " - "count of tax units with positive PolicyEngine salt." - ), - ( - "tax_unit_count", - "national", - "salt,tax_unit_itemizes", - None, - ): ( - "SOI Table 2.1 publishes separate component counts, not the union " - "count of itemizing tax units with positive PolicyEngine salt." - ), - ( - "salt", - "state", - "salt", - None, - ): ( - "Loaded state SOI sources do not provide an exact state-level " - "PolicyEngine salt amount; total state/local taxes also include " - "personal property taxes." - ), - ( - "salt", - "state", - "salt,tax_unit_itemizes", - None, - ): ( - "Loaded state SOI sources do not provide state-level itemizer salt " - "as income-or-sales tax plus real estate tax without personal " - "property taxes." - ), - ( - "tax_unit_count", - "state", - "salt", - None, - ): ( - "Loaded state SOI sources do not provide the union count of tax units " - "with positive PolicyEngine salt." - ), - ( - "tax_unit_count", - "state", - "salt,tax_unit_itemizes", - None, - ): ( - "Loaded state SOI sources do not provide the union count of itemizing " - "tax units with positive PolicyEngine salt." - ), - ( - "spm_unit_capped_housing_subsidy", - "national", - None, - None, - ): ( - "This is a capped SPM model amount rather than a direct publisher source fact." - ), - ( - "spm_unit_capped_work_childcare_expenses", - "national", - None, - None, - ): ( - "This is a capped SPM model amount rather than a direct publisher source fact." - ), -} - -_PENDING_IRS_DETAIL_SOURCE_REASON = ( - "Current Arch IRS SOI source packages do not yet encode an exact source " - "fact for this detailed PolicyEngine tax cell at the requested geography " - "and domain." 
-) - -_PENDING_ARCH_SOURCE_BACKED_CELL_REASONS: dict[ - PolicyEngineUSTargetCellKey, - str, -] = { - **{ - (variable, geo_level, None, None): _PENDING_IRS_DETAIL_SOURCE_REASON - for variable in ( - "long_term_capital_gains", - "salt_refund_income", - "short_term_capital_gains", - ) - for geo_level in ("national", "state") - }, - ( - "investment_interest_expense", - "state", - None, - None, - ): _PENDING_IRS_DETAIL_SOURCE_REASON, - **{ - ("tax_unit_count", geo_level, domain_variable, None): ( - _PENDING_IRS_DETAIL_SOURCE_REASON - ) - for geo_level in ("national", "state") - for domain_variable in ( - "investment_interest_expense", - "long_term_capital_gains", - "salt_refund_income", - "short_term_capital_gains", - ) - }, -} - -PE_NATIVE_BROAD_SOURCE_BACKED_EXCLUDED_CELL_REASONS = { - **PE_NATIVE_BROAD_SOURCE_BACKED_EXCLUDED_CELL_REASONS, - **_PENDING_ARCH_SOURCE_BACKED_CELL_REASONS, -} - -PE_NATIVE_BROAD_SOURCE_BACKED_TARGET_CELLS: tuple[PolicyEngineUSTargetCell, ...] = ( - tuple( - cell - for cell in PE_NATIVE_BROAD_TARGET_CELLS - if _target_cell_key(cell) - not in PE_NATIVE_BROAD_SOURCE_BACKED_EXCLUDED_CELL_REASONS - ) -) - -_TARGET_PROFILES: dict[str, tuple[PolicyEngineUSTargetCell, ...]] = { - "pe_native_broad": PE_NATIVE_BROAD_TARGET_CELLS, - "pe_native_broad_no_state_aca": PE_NATIVE_BROAD_NO_STATE_ACA_TARGET_CELLS, - "pe_native_broad_source_backed": PE_NATIVE_BROAD_SOURCE_BACKED_TARGET_CELLS, -} - -_TARGET_PROFILE_EXCLUSION_REASONS: dict[ - str, - dict[PolicyEngineUSTargetCellKey, str], -] = { - "pe_native_broad": {}, - "pe_native_broad_no_state_aca": { - cell_key: "State ACA cells are excluded from this profile variant." 
- for cell_key in _PE_NATIVE_BROAD_NO_STATE_ACA_EXCLUDED_CELLS - }, - "pe_native_broad_source_backed": ( - PE_NATIVE_BROAD_SOURCE_BACKED_EXCLUDED_CELL_REASONS - ), -} - - -def policyengine_us_target_profile_names() -> tuple[str, ...]: - return tuple(sorted(_TARGET_PROFILES)) - - -def resolve_policyengine_us_target_profile( - name: str, -) -> tuple[PolicyEngineUSTargetCell, ...]: - try: - return _TARGET_PROFILES[name] - except KeyError as exc: - known = ", ".join(policyengine_us_target_profile_names()) - raise ValueError( - f"Unknown PolicyEngine US target profile '{name}'. Known profiles: {known}" - ) from exc - - -def policyengine_us_target_profile_exclusion_reasons( - name: str, -) -> dict[PolicyEngineUSTargetCellKey, str]: - if name not in _TARGET_PROFILES: - known = ", ".join(policyengine_us_target_profile_names()) - raise ValueError( - f"Unknown PolicyEngine US target profile '{name}'. Known profiles: {known}" - ) - return dict(_TARGET_PROFILE_EXCLUSION_REASONS.get(name, {})) diff --git a/src/microplex_us/policyengine/us.py b/src/microplex_us/policyengine/us.py deleted file mode 100644 index f4c793e7..00000000 --- a/src/microplex_us/policyengine/us.py +++ /dev/null @@ -1,4610 +0,0 @@ -"""PolicyEngine US integration helpers for targets, simulation, and export.""" - -from __future__ import annotations - -import hashlib -import json -import sqlite3 -from collections.abc import Callable, Mapping, Sequence -from dataclasses import dataclass, field -from functools import lru_cache -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Literal - -import h5py -import numpy as np -import pandas as pd -from microplex.calibration import LinearConstraint -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetConstraintCompilationResult, - TargetFilter, - TargetQuery, - TargetReweightingConstraint, - TargetSet, - TargetSpec, - apply_target_query, -) - -from microplex_us.microdata_roles 
import POLICYENGINE_US_TAKEUP_INPUT_VARIABLES -from microplex_us.policyengine.takeup import ( - rerandomize_policyengine_us_takeup_frames, -) -from microplex_us.policyengine.target_profiles import ( - PolicyEngineUSTargetCell, - resolve_policyengine_us_target_profile, -) - -GEOGRAPHIC_CONSTRAINT_VARIABLES: set[str] = { - "state_fips", - "congressional_district_geoid", -} - -NYC_FULL_COUNTY_FIPS: frozenset[int] = frozenset( - { - 36005, # Bronx - 36047, # Kings (Brooklyn) - 36061, # New York (Manhattan) - 36081, # Queens - 36085, # Richmond (Staten Island) - } -) - - -@dataclass(frozen=True) -class PolicyEngineUSConstraint: - """A single stratum constraint from the PolicyEngine targets DB.""" - - variable: str - operation: str - value: str - - -@dataclass(frozen=True) -class PolicyEngineUSStratum: - """A stratum definition from the PolicyEngine US targets DB.""" - - stratum_id: int - definition_hash: str | None = None - parent_stratum_id: int | None = None - constraints: tuple[PolicyEngineUSConstraint, ...] = () - - -class PolicyEngineUSTargetValidationError(ValueError): - """Raised when imported PolicyEngine target metadata is inconsistent.""" - - -@dataclass(frozen=True) -class PolicyEngineUSDBTarget: - """A target row from the PolicyEngine US targets database.""" - - target_id: int - variable: str - period: int - stratum_id: int - reform_id: int - value: float - active: bool - tolerance: float | None = None - source: str | None = None - notes: str | None = None - geo_level: str | None = None - geographic_id: str | None = None - domain_variable: str | None = None - definition_hash: str | None = None - parent_stratum_id: int | None = None - constraints: tuple[PolicyEngineUSConstraint, ...] 
= () - - @property - def is_unconstrained(self) -> bool: - """Whether this target applies nationally without stratum filters.""" - return not self.constraints - - @property - def domain_variables(self) -> tuple[str, ...]: - """Domain-variable hints parsed from the target_overview view.""" - if self.domain_variable is None: - return () - values = [ - item.strip() - for item in str(self.domain_variable).split(",") - if item.strip() - ] - return tuple(dict.fromkeys(values)) - - -@dataclass(frozen=True) -class PolicyEngineUSQuantityTarget: - """A PE-computed quantity used as a Microplex calibration total.""" - - name: str - variable: str - column: str - period: int | None = None - map_to: str | None = None - aggregation: Literal["sum", "mean", "count_positive"] = "sum" - - -@dataclass(frozen=True) -class PolicyEngineUSVariableBinding: - """How a PolicyEngine variable is represented in Microplex entity tables.""" - - entity: EntityType - column: str | None = None - household_id_column: str = "household_id" - - -@dataclass(frozen=True) -class PolicyEngineUSEntityTableBundle: - """Entity tables aligned to household weights for PE-style calibration.""" - - households: pd.DataFrame - persons: pd.DataFrame | None = None - tax_units: pd.DataFrame | None = None - spm_units: pd.DataFrame | None = None - families: pd.DataFrame | None = None - marital_units: pd.DataFrame | None = None - - def table_for(self, entity: EntityType) -> pd.DataFrame: - if entity is EntityType.HOUSEHOLD: - return self.households - if entity is EntityType.PERSON and self.persons is not None: - return self.persons - if entity is EntityType.TAX_UNIT and self.tax_units is not None: - return self.tax_units - if entity is EntityType.SPM_UNIT and self.spm_units is not None: - return self.spm_units - if entity is EntityType.FAMILY and self.families is not None: - return self.families - raise KeyError(f"No table available for entity '{entity.value}'") - - -_PIPELINE_CHECKPOINT_TABLES: tuple[str, ...] 
= ( - "households", - "persons", - "tax_units", - "spm_units", - "families", - "marital_units", -) - -USPipelineCheckpointStage = Literal[ - "post_imputation", - "post_microsim", - "post_calibration", -] - -_ALLOWED_CHECKPOINT_STAGES: frozenset[str] = frozenset( - {"post_imputation", "post_microsim", "post_calibration"} -) - - -def save_us_pipeline_checkpoint( - bundle: PolicyEngineUSEntityTableBundle, - path: str | Path, - *, - stage: USPipelineCheckpointStage, -) -> Path: - """Persist a pipeline-stage bundle to ``path`` as parquet + metadata. - - Writes one parquet file per non-None entity table plus a - ``metadata.json`` index tagged with the pipeline ``stage``. Two - stages are supported: - - * ``"post_imputation"`` — after donor imputation, before PE microsim - materializes target variables. Resuming from here reruns - microsim + calibration. - * ``"post_microsim"`` — after microsim materialization, before the - calibration fit loop. Resuming from here reruns only calibration. - * ``"post_calibration"`` — after calibration, ready for dataset export. 
- """ - import json - import shutil - - if stage not in _ALLOWED_CHECKPOINT_STAGES: - raise ValueError( - f"stage must be one of {sorted(_ALLOWED_CHECKPOINT_STAGES)}; got {stage!r}" - ) - - checkpoint_dir = Path(path) - if checkpoint_dir.exists(): - shutil.rmtree(checkpoint_dir) - checkpoint_dir.mkdir(parents=True) - - metadata: dict[str, Any] = {"format_version": 1, "stage": stage} - for table_name in _PIPELINE_CHECKPOINT_TABLES: - frame = getattr(bundle, table_name) - if frame is None: - metadata[table_name] = None - continue - frame.to_parquet(checkpoint_dir / f"{table_name}.parquet", index=False) - metadata[table_name] = { - "rows": int(len(frame)), - "columns": list(frame.columns), - } - - (checkpoint_dir / "metadata.json").write_text(json.dumps(metadata, indent=2)) - return checkpoint_dir - - -def load_us_pipeline_checkpoint( - path: str | Path, - *, - expected_stage: USPipelineCheckpointStage | None = None, -) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, Any]]: - """Load a pipeline-stage bundle previously saved by ``save_us_pipeline_checkpoint``. - - Returns ``(bundle, metadata)`` so callers can inspect the saved - stage. If ``expected_stage`` is provided, a mismatch raises a clear - error — protects against running recalibration from a post-microsim - checkpoint when a post-imputation checkpoint was expected or vice - versa. 
- """ - import json - - checkpoint_dir = Path(path) - metadata_path = checkpoint_dir / "metadata.json" - if not metadata_path.exists(): - raise FileNotFoundError(f"US pipeline checkpoint not found at {checkpoint_dir}") - metadata = json.loads(metadata_path.read_text()) - - saved_stage = metadata.get("stage") - if expected_stage is not None and saved_stage != expected_stage: - raise ValueError( - f"Checkpoint at {checkpoint_dir} has stage {saved_stage!r}, " - f"expected {expected_stage!r}" - ) - - tables: dict[str, pd.DataFrame | None] = {} - for table_name in _PIPELINE_CHECKPOINT_TABLES: - if metadata.get(table_name) is None: - tables[table_name] = None - continue - tables[table_name] = pd.read_parquet(checkpoint_dir / f"{table_name}.parquet") - return PolicyEngineUSEntityTableBundle(**tables), metadata - - -@dataclass(frozen=True) -class PolicyEngineUSVariableMaterializationResult: - """Materialized PE variables plus any per-variable failures.""" - - tables: PolicyEngineUSEntityTableBundle - bindings: dict[str, PolicyEngineUSVariableBinding] - materialized_variables: tuple[str, ...] 
= () - failed_variables: dict[str, str] = field(default_factory=dict) - - -PolicyEngineUSSimulationModifierHandler = Callable[ - ..., - PolicyEngineUSEntityTableBundle, -] - - -class PolicyEngineUSSimulationModifierSkipError(ValueError): - """Raised when a simulator modifier should fail closed for target rows.""" - - -DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS: dict[str, PolicyEngineUSVariableBinding] = { - "household_count": PolicyEngineUSVariableBinding(entity=EntityType.HOUSEHOLD), - "person_count": PolicyEngineUSVariableBinding(entity=EntityType.PERSON), - "tax_unit_count": PolicyEngineUSVariableBinding(entity=EntityType.TAX_UNIT), - "spm_unit_count": PolicyEngineUSVariableBinding(entity=EntityType.SPM_UNIT), -} - -POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE: dict[str, EntityType] = { - "person": EntityType.PERSON, - "household": EntityType.HOUSEHOLD, - "tax_unit": EntityType.TAX_UNIT, - "spm_unit": EntityType.SPM_UNIT, - "family": EntityType.FAMILY, -} - -ENTITY_TYPE_TO_POLICYENGINE_US_ENTITY_KEY: dict[EntityType, str] = { - entity_type: entity_key - for entity_key, entity_type in POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE.items() -} - -SAFE_POLICYENGINE_US_EXPORT_VARIABLES: set[str] = { - "age", - # American Opportunity Tax Credit (AOTC) factual eligibility inputs, - # populated per tax unit by - # ``USMicroplexPipeline._construct_aotc_eligibility_inputs`` from the - # PUF ``american_opportunity_credit`` signal, matching the enhanced-CPS - # baseline ``_impute_aotc_eligibility_inputs`` - # (PolicyEngine/policyengine-us-data, unmerged branch - # ``codex/fix-aotc-eligibility``). 
- "is_pursuing_credential_for_american_opportunity_credit", - "attends_eligible_educational_institution_for_american_opportunity_credit", - "is_enrolled_at_least_half_time_for_american_opportunity_credit", - "has_american_opportunity_credit_1098_t_or_exception", - "has_american_opportunity_credit_institution_ein", - "has_completed_first_four_years_of_postsecondary_education", - "has_felony_drug_conviction", - "american_opportunity_credit_claimed_prior_years", - "alimony_expense", - "alimony_income", - "amt_foreign_tax_credit", - "auto_loan_balance", - "auto_loan_interest", - "bank_account_assets", - "bond_assets", - "casualty_loss", - "child_support_expense", - "child_support_received", - "charitable_cash_donations", - "charitable_non_cash_donations", - "receives_housing_assistance", - "receives_wic", - "cps_race", - "disability_benefits", - "domestic_production_ald", - "early_withdrawal_penalty", - "educator_expense", - "excess_withheld_payroll_tax", - "general_business_credit", - "health_insurance_premiums_without_medicare_part_b", - "investment_income_elected_form_4952", - "is_female", - "is_hispanic", - "is_blind", - "is_disabled", - # eCPS disability-difficulty leaves (ASEC PEDIS* recodes). Not pe-us - # variables; exported as dataset columns via the legacy-contract map. 
- "difficulty_seeing", - "difficulty_hearing", - "difficulty_walking_or_climbing_stairs", - "difficulty_dressing_or_bathing", - "difficulty_doing_errands", - "difficulty_remembering_or_making_decisions", - "is_household_head", - "long_term_capital_gains_on_collectibles", - "employment_income_before_lsr", - "miscellaneous_income", - "non_sch_d_capital_gains", - "other_medical_expenses", - "other_credits", - "over_the_counter_health_expenses", - "own_children_in_household", - "prior_year_minimum_tax_credit", - "qualified_tuition_expenses", - "recapture_of_investment_credit", - "salt_refund_income", - "self_employment_income_before_lsr", - "social_security_disability", - "social_security_retirement", - "social_security_survivors", - "social_security_dependents", - "spm_unit_capped_work_childcare_expenses", - # Retirement-contribution leaves split from CPS RETCB_VAL. The capped - # account-level leaves mirror eCPS' current direct inputs; the desired - # leaves preserve the pre-statutory-limit values PE-US formulas use. - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - "self_employed_pension_contributions_desired", - "traditional_401k_contributions_desired", - "roth_401k_contributions_desired", - "traditional_ira_contributions_desired", - "roth_ira_contributions_desired", - # CPS-derived direct income copies (eCPS cps.py:1493-1495). 
- "survivor_benefits", - "educational_assistance", - "financial_assistance", - "stock_assets", - "taxable_ira_distributions", - "tip_income", - "unemployment_compensation", - "unrecaptured_section_1250_gain", - "unreimbursed_business_employee_expenses", - "unreported_payroll_tax", - "taxable_interest_income", - "tax_exempt_interest_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "real_estate_taxes", - "rental_income", - "short_term_capital_gains", - "long_term_capital_gains_before_response", - "partnership_s_corp_income", - "partnership_se_income", - "estate_income", - "farm_income", - "farm_operations_income", - "farm_rent_income", - "has_esi", - "has_marketplace_health_coverage", - # CPS-derived employer-sponsored insurance leaves. Mirrors the eCPS ESI - # imputation (policyengine-us-data, unmerged branch max/esi-premiums-cbo): - # the policyholder flag is NOW_OWNGRP == 1 and the premium comes from - # impute_employer_sponsored_insurance_premiums(). The premium leaf is a - # storable pe-us INPUT (no formula in pinned pe-us 1.715.2). The policyholder - # flag is not a released pe-us variable; it is routed through the legacy- - # contract entity map below so it still exports for eCPS column parity. - "employer_sponsored_insurance_premiums", - "reported_owns_employer_sponsored_health_insurance_at_interview", - # Unmarried partner of the household head: ASEC PERRP recode (codes - # 43/44/46/47), mirroring the eCPS perrp.isin(...) recode (policyengine-us- - # data, unmerged branch claude/document-census-tax-id-replacement). Storable - # pe-us INPUT (no formula in pinned pe-us 1.715.2). - "is_unmarried_partner_of_household_head", - "health_savings_account_ald", - "is_separated", - "is_surviving_spouse", - "net_worth", - # SCF net-worth component leaves (G1). 
eCPS persists these 19 balance-sheet - # columns to its final H5 (the ecps_export_contract.json "required" set); - # they are NOT pe-us registered variables, so they are exported via the - # legacy-contract entity map below, not recomputed by pe-us. - "scf_business_equity", - "scf_cash_value_life_insurance", - "scf_certificates_of_deposit", - "scf_credit_card_debt", - "scf_mortgage_debt", - "scf_nonresidential_real_estate_equity", - "scf_other_debt", - "scf_other_financial_assets", - "scf_other_installment_debt", - "scf_other_lines_of_credit", - "scf_other_managed_assets", - "scf_other_nonfinancial_assets", - "scf_other_residential_debt", - "scf_other_residential_real_estate", - "scf_primary_residence_value", - "scf_retirement_assets", - "scf_savings_bonds", - "scf_student_loan_debt", - "scf_vehicle_installment_debt", - "ssn_card_type", - "spm_unit_tenure_type", - "taxable_private_pension_income", - "tax_exempt_private_pension_income", - "student_loan_interest", - "tenure_type", - "state_fips", - "county_fips", - "block_geoid", - "tract_geoid", - "congressional_district_geoid", -} | set(POLICYENGINE_US_TAKEUP_INPUT_VARIABLES) - -POLICYENGINE_US_EXPORT_COLUMN_ALIASES: dict[str, str] = { - # policyengine-us #8507 made monthly_hours_worked derive from - # hours_worked_last_week. Microplex source frames still often carry the - # annualized eCPS-compatible ``hours_worked`` name, so expose it through - # the persisted leaf input when no explicit last-week field is present. - "hours_worked": "hours_worked_last_week", - "race": "cps_race", - # PE-US computes ``rent`` from the persisted source input - # ``pre_subsidy_rent``. Microplex's ACS donor block restores the - # user-facing ``rent`` column, so export it through the storable leaf input - # rather than dropping it as formula-owned. 
- "rent": "pre_subsidy_rent", -} - -POLICYENGINE_US_STRUCTURAL_EXPORT_COLUMNS: frozenset[str] = frozenset( - { - "household_id", - "person_id", - "person_household_id", - "household_weight", - "tax_unit_id", - "person_tax_unit_id", - "spm_unit_id", - "person_spm_unit_id", - "family_id", - "person_family_id", - "marital_unit_id", - "person_marital_unit_id", - } -) - -POLICYENGINE_US_EXPORT_DEFAULTS: dict[str, Any] = { - "auto_loan_balance": 0.0, - # American Opportunity Tax Credit factual eligibility inputs. The - # per-tax-unit construction in - # ``USMicroplexPipeline._construct_aotc_eligibility_inputs`` writes the - # real values for selected students; these defaults guarantee the - # contract-required columns always export (False / 0) for the - # non-student majority and for builds with no positive AOTC signal. - "is_pursuing_credential_for_american_opportunity_credit": False, - "attends_eligible_educational_institution_for_american_opportunity_credit": False, - "is_enrolled_at_least_half_time_for_american_opportunity_credit": False, - "has_american_opportunity_credit_1098_t_or_exception": False, - "has_american_opportunity_credit_institution_ein": False, - "has_completed_first_four_years_of_postsecondary_education": False, - "has_felony_drug_conviction": False, - "american_opportunity_credit_claimed_prior_years": 0, - "auto_loan_interest": 0.0, - # SCF net-worth component leaves (G1): positive-magnitude balances, - # default 0 when the SCF donor leaves a row without that component. 
- "scf_business_equity": 0.0, - "scf_cash_value_life_insurance": 0.0, - "scf_certificates_of_deposit": 0.0, - "scf_credit_card_debt": 0.0, - "scf_mortgage_debt": 0.0, - "scf_nonresidential_real_estate_equity": 0.0, - "scf_other_debt": 0.0, - "scf_other_financial_assets": 0.0, - "scf_other_installment_debt": 0.0, - "scf_other_lines_of_credit": 0.0, - "scf_other_managed_assets": 0.0, - "scf_other_nonfinancial_assets": 0.0, - "scf_other_residential_debt": 0.0, - "scf_other_residential_real_estate": 0.0, - "scf_primary_residence_value": 0.0, - "scf_retirement_assets": 0.0, - "scf_savings_bonds": 0.0, - "scf_student_loan_debt": 0.0, - "scf_vehicle_installment_debt": 0.0, - "business_is_sstb": False, - "detailed_occupation_recode": 0, - "domestic_production_ald": 0, - "estate_income_would_be_qualified": True, - "farm_operations_income_would_be_qualified": True, - "farm_rent_income_would_be_qualified": True, - "first_home_mortgage_balance": 0.0, - "first_home_mortgage_interest": 0.0, - "first_home_mortgage_origination_year": 0, - "free_school_meals_reported": 0.0, - "fsla_overtime_premium": 0.0, - "has_champva_health_coverage_at_interview": False, - "has_itin": True, - "has_indian_health_service_coverage_at_interview": False, - "has_marketplace_health_coverage_at_interview": False, - "has_medicaid_health_coverage_at_interview": False, - "has_never_worked": False, - "has_non_marketplace_direct_purchase_health_coverage_at_interview": False, - "has_other_means_tested_health_coverage_at_interview": False, - "has_tricare_health_coverage_at_interview": False, - "has_tin": True, - "has_valid_ssn": True, - "has_va_health_coverage_at_interview": False, - "home_mortgage_interest": 0, - "hourly_wage": 0, - "hours_worked_last_week": 0, - "household_vehicles_owned": 0, - "household_vehicles_value": 0, - "immigration_status_str": "CITIZEN", - "investment_interest_expense": 0, - "in_nyc": False, - "is_computer_scientist": False, - "is_executive_administrative_professional": False, - 
"is_farmer_fisher": False, - "is_blind": False, - "difficulty_seeing": False, - "difficulty_hearing": False, - "difficulty_walking_or_climbing_stairs": False, - "difficulty_dressing_or_bathing": False, - "difficulty_doing_errands": False, - "difficulty_remembering_or_making_decisions": False, - "is_full_time_college_student": False, - "is_military": False, - "is_paid_hourly": False, - "is_pregnant": False, - "is_tipped_occupation": False, - "is_union_member_or_covered": False, - "is_wic_at_nutritional_risk": True, - "keogh_distributions": 0, - "meets_ssi_disability_criteria": False, - "net_worth": 0, - "other_health_insurance_premiums": 0, - "other_type_retirement_account_distributions": 0, - "partnership_s_corp_income_would_be_qualified": True, - "pre_subsidy_rent": 0, - "previous_year_income_available": False, - "qualified_bdc_income": 0, - "qualified_reit_and_ptp_income": 0, - "recapture_of_investment_credit": 0, - "reduced_price_school_meals_reported": 0, - "regular_ira_distributions": 0, - "reported_has_champva_health_coverage_at_interview": False, - "reported_has_chip_health_coverage_at_interview": False, - "reported_has_direct_purchase_health_coverage_at_interview": False, - "reported_has_employer_sponsored_health_coverage_at_interview": False, - "reported_has_indian_health_service_coverage_at_interview": False, - "reported_has_marketplace_health_coverage_at_interview": False, - "reported_has_means_tested_health_coverage_at_interview": False, - "reported_has_medicaid_health_coverage_at_interview": False, - "reported_has_medicare_health_coverage_at_interview": False, - "reported_has_multiple_health_coverage_at_interview": False, - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview": False, - "reported_has_other_means_tested_health_coverage_at_interview": False, - "reported_has_private_health_coverage_at_interview": False, - "reported_has_public_health_coverage_at_interview": False, - 
"reported_has_subsidized_marketplace_health_coverage_at_interview": False, - "reported_has_tricare_health_coverage_at_interview": False, - "reported_has_unsubsidized_marketplace_health_coverage_at_interview": False, - "reported_has_va_health_coverage_at_interview": False, - "reported_is_insured_at_interview": False, - "reported_is_uninsured_at_interview": False, - "rental_income_would_be_qualified": True, - "roth_ira_distributions": 0, - "second_home_mortgage_balance": 0.0, - "second_home_mortgage_interest": 0.0, - "second_home_mortgage_origination_year": 0, - "selected_marketplace_plan_benchmark_ratio": 1.0, - "self_employment_income_last_year": 0, - "self_employment_income_would_be_qualified": True, - "snap_reported": 0.0, - "spm_unit_broadband_subsidy_reported": 0.0, - "spm_unit_capped_housing_subsidy_reported": 0.0, - "spm_unit_capped_work_childcare_expenses": 0.0, - "spm_unit_energy_subsidy_reported": 0.0, - "spm_unit_federal_tax_reported": 0.0, - "spm_unit_net_income_reported": 0.0, - "spm_unit_payroll_tax_reported": 0.0, - "spm_unit_state_tax_reported": 0.0, - "spm_unit_total_income_reported": 0.0, - "spm_unit_wic_reported": 0.0, - "spm_unit_pre_subsidy_childcare_expenses": 0, - "spm_unit_tenure_type": "RENTER", - "ssi_reported": 0.0, - "ssn_card_type": "CITIZEN", - "sstb_self_employment_income_before_lsr": 0, - # SSTB QBI-qualification flag (G9). eCPS never recodes this flag, so its - # export carries the pe-us default (default_value=True). MP exports False - # instead: because MP carries no SSTB self-employment income - # (business_is_sstb=False and sstb_self_employment_income_before_lsr=0 for - # every record), the section 199A SSTB component is zero under either value, - # so the choice is tax-inert and passes the name-only column-parity gate. - # False is chosen for internal consistency with MP's business_is_sstb=False; - # exact value-parity with the eCPS baseline would instead require True. 
- # Storable pe-us INPUT (no formula in pinned pe-us 1.715.2). - "sstb_self_employment_income_would_be_qualified": False, - "sstb_unadjusted_basis_qualified_property": 0.0, - "sstb_w2_wages_from_qualified_business": 0.0, - "strike_benefits": 0, - "takes_up_aca_if_eligible": True, - "takes_up_dc_ptc": True, - "takes_up_early_head_start_if_eligible": True, - "takes_up_eitc": True, - "takes_up_head_start_if_eligible": True, - "takes_up_medicaid_if_eligible": True, - "takes_up_snap_if_eligible": True, - "takes_up_ssi_if_eligible": True, - "takes_up_tanf_if_eligible": True, - "tax_exempt_401k_distributions": 0, - "tax_exempt_403b_distributions": 0, - "tax_exempt_ira_distributions": 0, - "tax_exempt_sep_distributions": 0, - "taxable_401k_distributions": 0, - "taxable_403b_distributions": 0, - "taxable_sep_distributions": 0, - "tanf_reported": 0.0, - "taxpayer_id_type": "VALID_SSN", - "tenure_type": "NONE", - "treasury_tipped_occupation_code": 0, - "unadjusted_basis_qualified_property": 0, - "unrecaptured_section_1250_gain": 0, - "unreported_payroll_tax": 0, - "veterans_benefits": 0, - "w2_wages_from_qualified_business": 0, - "weekly_hours_worked_before_lsr": 40.0, - "weeks_unemployed": 0, - "workers_compensation": 0, - "would_claim_wic": True, - "would_file_taxes_voluntarily": False, -} - -POLICYENGINE_US_NUMERIC_ENUM_EXPORT_MAPS: dict[str, dict[float, str]] = { - # Microplex imports ACS tenure as compact source codes. PolicyEngine-US - # expects enum member names in its HDF5 inputs. - "tenure_type": { - 0.0: "NONE", - 1.0: "OWNED_WITH_MORTGAGE", - 2.0: "RENTED", - }, - # SPM tenure has no NONE value; no-cash/unknown tenure should not make the - # exported dataset unreadable, so use the same renter fallback as PE-US. 
- "spm_unit_tenure_type": { - 0.0: "RENTER", - 1.0: "OWNER_WITH_MORTGAGE", - 2.0: "RENTER", - }, -} - -POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES: dict[str, str] = { - # eCPS disability-difficulty leaves (ASEC PEDIS* recodes): eCPS final-H5 - # contract columns with no pe-us variable, attached as person-level data - # columns (matching eCPS per-person storage). - "difficulty_seeing": "person", - "difficulty_hearing": "person", - "difficulty_walking_or_climbing_stairs": "person", - "difficulty_dressing_or_bathing": "person", - "difficulty_doing_errands": "person", - "difficulty_remembering_or_making_decisions": "person", - # SCF net-worth component leaves (G1). These are eCPS final-H5 contract - # columns (ecps_export_contract.json "required"), not pe-us registered - # variables, so the export resolver has no pe-us metadata for them. Map - # them to the person entity (matching eCPS per-reference-person storage) - # so they are attached as direct data columns rather than recomputed. 
- "scf_business_equity": "person", - "scf_cash_value_life_insurance": "person", - "scf_certificates_of_deposit": "person", - "scf_credit_card_debt": "person", - "scf_mortgage_debt": "person", - "scf_nonresidential_real_estate_equity": "person", - "scf_other_debt": "person", - "scf_other_financial_assets": "person", - "scf_other_installment_debt": "person", - "scf_other_lines_of_credit": "person", - "scf_other_managed_assets": "person", - "scf_other_nonfinancial_assets": "person", - "scf_other_residential_debt": "person", - "scf_other_residential_real_estate": "person", - "scf_primary_residence_value": "person", - "scf_retirement_assets": "person", - "scf_savings_bonds": "person", - "scf_student_loan_debt": "person", - "scf_vehicle_installment_debt": "person", - "count_under_18": "person", - "count_under_6": "person", - "free_school_meals_reported": "spm_unit", - "has_valid_ssn": "person", - "is_tipped_occupation": "person", - "other_type_retirement_account_distributions": "person", - "reduced_price_school_meals_reported": "spm_unit", - "regular_ira_distributions": "person", - "reported_has_champva_health_coverage_at_interview": "person", - "reported_has_chip_health_coverage_at_interview": "person", - "reported_has_direct_purchase_health_coverage_at_interview": "person", - "reported_has_employer_sponsored_health_coverage_at_interview": "person", - # Real CPS recode (NOW_OWNGRP == 1) carried on the person frame. Not yet a - # released pe-us input variable, so the entity is pinned here (like its - # reported_has_* siblings) to keep it on the eCPS-parity export surface. 
- "reported_owns_employer_sponsored_health_insurance_at_interview": "person", - "reported_has_indian_health_service_coverage_at_interview": "person", - "reported_has_marketplace_health_coverage_at_interview": "person", - "reported_has_means_tested_health_coverage_at_interview": "person", - "reported_has_medicaid_health_coverage_at_interview": "person", - "reported_has_medicare_health_coverage_at_interview": "person", - "reported_has_multiple_health_coverage_at_interview": "person", - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview": "person", - "reported_has_other_means_tested_health_coverage_at_interview": "person", - "reported_has_private_health_coverage_at_interview": "person", - "reported_has_public_health_coverage_at_interview": "person", - "reported_has_subsidized_marketplace_health_coverage_at_interview": "person", - "reported_has_tricare_health_coverage_at_interview": "person", - "reported_has_unsubsidized_marketplace_health_coverage_at_interview": "person", - "reported_has_va_health_coverage_at_interview": "person", - "reported_is_insured_at_interview": "person", - "reported_is_uninsured_at_interview": "person", - "roth_ira_distributions": "person", - "snap_reported": "spm_unit", - "spm_unit_broadband_subsidy_reported": "spm_unit", - "spm_unit_capped_housing_subsidy_reported": "spm_unit", - "spm_unit_capped_work_childcare_expenses": "spm_unit", - "spm_unit_energy_subsidy": "spm_unit", - "spm_unit_energy_subsidy_reported": "spm_unit", - "spm_unit_federal_tax_reported": "spm_unit", - "spm_unit_payroll_tax_reported": "spm_unit", - "spm_unit_state_tax_reported": "spm_unit", - "spm_unit_wic_reported": "spm_unit", - "takes_up_housing_assistance_if_eligible": "spm_unit", - "tanf_reported": "person", - "taxpayer_id_type": "person", -} - -POLICYENGINE_US_STRUCTURAL_COMPUTED_EXPORT_VARIABLES: frozenset[str] = frozenset( - { - # Aligned with policyengine-us-data's final-H5 validation contract. 
- # These are structural/cache fields, not model outputs. - "person_id", - "has_tin", - "has_itin", - "in_nyc", - } -) - -POLICYENGINE_US_DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES: frozenset[str] = frozenset( - { - # policyengine-us-data intentionally persists stronger source-data - # inputs for these fallback formulas. - "fsla_overtime_premium", - "meets_ssi_disability_criteria", - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - "spm_unit_energy_subsidy", - "spm_unit_capped_work_childcare_expenses", - # social_security_retirement is a storable INPUT in the pinned pe-us - # (no formula), reconstructed from the CPS SS_VAL/RESNSS split. Some - # pe-us versions add a fallback formula; listing it here keeps the - # source-data value exported so the computed-export guard cannot - # silently drop the leaf if that formula ever returns. - "social_security_retirement", - } -) - -POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES: frozenset[str] = ( - POLICYENGINE_US_STRUCTURAL_COMPUTED_EXPORT_VARIABLES - | POLICYENGINE_US_DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES -) - -MARKETPLACE_PLAN_BENCHMARK_RATIO_MIN = 0.5 -MARKETPLACE_PLAN_BENCHMARK_RATIO_MAX = 1.5 -MARKETPLACE_PLAN_BENCHMARK_RATIO_COLUMN = "selected_marketplace_plan_benchmark_ratio" -MARKETPLACE_PLAN_BENCHMARK_RATIO_SOURCE_COLUMNS: frozenset[str] = frozenset( - { - "health_insurance_premiums_without_medicare_part_b", - "aca_ptc", - "slcsp", - "takes_up_aca_if_eligible", - } -) - - -def compute_marketplace_plan_benchmark_ratio( - *, - reported_premium: Any, - aca_ptc: Any, - slcsp: Any, - takes_up_aca: Any, -) -> np.ndarray: - """Back out selected Marketplace plan cost relative to SLCSP. - - ``selected_marketplace_plan_benchmark_ratio`` is a persisted eCPS input, - not SLCSP itself. 
PE-US still computes SLCSP from geography and family - composition; this ratio carries the selected-plan-to-benchmark adjustment - for tax units modeled as taking up Marketplace coverage. - """ - reported = np.asarray(reported_premium, dtype=float) - ptc = np.asarray(aca_ptc, dtype=float) - benchmark = np.asarray(slcsp, dtype=float) - takeup = np.asarray(takes_up_aca, dtype=bool) - with np.errstate(divide="ignore", invalid="ignore"): - raw = (reported + ptc) / np.where(benchmark > 0, benchmark, 1.0) - clipped = np.clip( - raw, - MARKETPLACE_PLAN_BENCHMARK_RATIO_MIN, - MARKETPLACE_PLAN_BENCHMARK_RATIO_MAX, - ) - applicable = takeup & (benchmark > 0) - return np.where(applicable, clipped, 1.0) - - -def compute_policyengine_us_definition_hash( - constraints: tuple[PolicyEngineUSConstraint, ...] | list[PolicyEngineUSConstraint], - *, - parent_stratum_id: int | None = None, -) -> str: - """Replicate policyengine-us-data's stratum definition hash logic.""" - parent_prefix = str(parent_stratum_id) if parent_stratum_id is not None else "" - constraint_strings = sorted( - f"{constraint.variable}|{constraint.operation}|{constraint.value}" - for constraint in constraints - ) - if not constraint_strings: - fingerprint_text = parent_prefix - else: - fingerprint_text = parent_prefix + "\n" + "\n".join(constraint_strings) - return hashlib.sha256(fingerprint_text.encode("utf-8")).hexdigest() - - -class PolicyEngineUSDBTargetProvider: - """Read PolicyEngine US target rows from the policyengine-us-data SQLite DB.""" - - def __init__( - self, - db_path: str | Path, - *, - validate: bool = True, - ): - self.db_path = Path(db_path) - self.validate = validate - - def load_strata( - self, - stratum_ids: list[int] | tuple[int, ...] 
| None = None, - *, - include_ancestors: bool = True, - ) -> dict[int, PolicyEngineUSStratum]: - """Load strata with constraints and optional ancestor chain.""" - if not self.db_path.exists(): - raise FileNotFoundError( - f"PolicyEngine targets DB not found: {self.db_path}" - ) - - available_columns = self._table_columns("strata") - has_definition_hash = "definition_hash" in available_columns - has_parent_stratum_id = "parent_stratum_id" in available_columns - - remaining_ids = ( - {int(stratum_id) for stratum_id in stratum_ids} - if stratum_ids is not None - else None - ) - loaded: dict[int, PolicyEngineUSStratum] = {} - - conn = sqlite3.connect(self.db_path) - conn.row_factory = sqlite3.Row - try: - while remaining_ids is None or remaining_ids: - query, params = self._build_strata_query( - remaining_ids, - has_definition_hash=has_definition_hash, - has_parent_stratum_id=has_parent_stratum_id, - ) - rows = conn.execute(query, params).fetchall() - grouped = self._group_stratum_rows(rows) - newly_loaded = { - stratum_id: stratum - for stratum_id, stratum in grouped.items() - if stratum_id not in loaded - } - loaded.update(newly_loaded) - - if remaining_ids is None or not include_ancestors: - break - - next_remaining = { - stratum.parent_stratum_id - for stratum in newly_loaded.values() - if stratum.parent_stratum_id is not None - and stratum.parent_stratum_id not in loaded - } - if not next_remaining: - break - remaining_ids = next_remaining - finally: - conn.close() - - return loaded - - def load_targets( - self, - period: int | None = None, - variables: list[str] | None = None, - domain_variables: list[str] | None = None, - domain_variable_values: list[str] | None = None, - domain_variable_is_null: bool | None = None, - geo_levels: list[str] | None = None, - target_cells: list[dict[str, Any]] | None = None, - target_ids: list[int] | None = None, - stratum_ids: list[int] | None = None, - reform_id: int = 0, - active_only: bool = True, - best_period: bool = True, - 
) -> list[PolicyEngineUSDBTarget]: - """Load target rows with attached stratum constraints.""" - if not self.db_path.exists(): - raise FileNotFoundError( - f"PolicyEngine targets DB not found: {self.db_path}" - ) - - if self._has_target_overview_view() and best_period: - return self._load_targets_via_target_overview( - period=period, - variables=variables, - domain_variables=domain_variables, - domain_variable_values=domain_variable_values, - domain_variable_is_null=domain_variable_is_null, - geo_levels=geo_levels, - target_cells=target_cells, - target_ids=target_ids, - stratum_ids=stratum_ids, - reform_id=reform_id, - active_only=active_only, - ) - if ( - domain_variables - or domain_variable_values - or geo_levels - or domain_variable_is_null is not None - or target_cells - ): - raise ValueError("domain/geography filters require a target_overview view") - - strata_columns = self._table_columns("strata") - definition_hash_select = ( - "s.definition_hash AS definition_hash" - if "definition_hash" in strata_columns - else "NULL AS definition_hash" - ) - parent_stratum_id_select = ( - "s.parent_stratum_id AS parent_stratum_id" - if "parent_stratum_id" in strata_columns - else "NULL AS parent_stratum_id" - ) - clauses = ["t.reform_id = ?"] - params: list[Any] = [reform_id] - if active_only: - clauses.append("t.active = 1") - if period is not None: - clauses.append("t.period = ?") - params.append(period) - if variables: - placeholders = ", ".join("?" for _ in variables) - clauses.append(f"t.variable IN ({placeholders})") - params.extend(variables) - if target_ids: - placeholders = ", ".join("?" for _ in target_ids) - clauses.append(f"t.target_id IN ({placeholders})") - params.extend(target_ids) - if stratum_ids: - placeholders = ", ".join("?" 
for _ in stratum_ids) - clauses.append(f"t.stratum_id IN ({placeholders})") - params.extend(stratum_ids) - - where_clause = " AND ".join(clauses) - query = f""" - SELECT - t.target_id, - t.variable, - t.period, - t.stratum_id, - t.reform_id, - t.value AS target_value, - t.active, - t.tolerance, - t.source, - t.notes, - NULL AS geo_level, - NULL AS geographic_id, - NULL AS domain_variable, - {definition_hash_select}, - {parent_stratum_id_select}, - sc.constraint_variable, - sc.operation, - sc.value AS constraint_value - FROM targets AS t - JOIN strata AS s - ON t.stratum_id = s.stratum_id - LEFT JOIN stratum_constraints AS sc - ON t.stratum_id = sc.stratum_id - WHERE {where_clause} - ORDER BY t.target_id, sc.constraint_variable, sc.operation, sc.value - """ - - conn = sqlite3.connect(self.db_path) - conn.row_factory = sqlite3.Row - try: - rows = conn.execute(query, params).fetchall() - finally: - conn.close() - - targets = self._group_target_rows(rows) - if self.validate: - self._validate_targets(targets) - return targets - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load canonical targets through the core provider protocol.""" - from microplex_us.targets import policyengine_db_targets_to_canonical_set - - query = query or TargetQuery() - provider_filters = query.provider_filters - best_period = bool(provider_filters.get("best_period", True)) - target_cells = provider_filters.get("target_cells") - target_profile = provider_filters.get("calibration_target_profile") or ( - provider_filters.get("target_profile") - ) - if target_profile is not None: - if target_cells is not None: - raise ValueError( - "target_profile/calibration_target_profile cannot be combined " - "with explicit target_cells" - ) - target_cells = [ - cell.to_provider_filter() - for cell in resolve_policyengine_us_target_profile(str(target_profile)) - ] - canonical_targets = policyengine_db_targets_to_canonical_set( - self.load_targets( - period=query.period if 
isinstance(query.period, int) else None, - variables=provider_filters.get("variables"), - domain_variables=provider_filters.get("domain_variables"), - domain_variable_values=provider_filters.get("domain_variable_values"), - domain_variable_is_null=provider_filters.get("domain_variable_is_null"), - geo_levels=provider_filters.get("geo_levels"), - target_cells=target_cells, - target_ids=provider_filters.get("target_ids"), - stratum_ids=provider_filters.get("stratum_ids"), - reform_id=int(provider_filters.get("reform_id", 0)), - active_only=bool(provider_filters.get("active_only", True)), - best_period=best_period, - ), - default_entity=provider_filters.get("default_entity", EntityType.HOUSEHOLD), - entity_overrides=provider_filters.get("entity_overrides"), - ) - return apply_target_query( - canonical_targets, - TargetQuery( - period=None if best_period else query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def _has_target_overview_view(self) -> bool: - conn = sqlite3.connect(self.db_path) - try: - row = conn.execute( - """ - SELECT 1 - FROM sqlite_master - WHERE type = 'view' AND name = 'target_overview' - """ - ).fetchone() - finally: - conn.close() - return row is not None - - def _load_targets_via_target_overview( - self, - *, - period: int | None, - variables: list[str] | None, - domain_variables: list[str] | None, - domain_variable_values: list[str] | None, - domain_variable_is_null: bool | None, - geo_levels: list[str] | None, - target_cells: list[dict[str, Any]] | None, - target_ids: list[int] | None, - stratum_ids: list[int] | None, - reform_id: int, - active_only: bool, - ) -> list[PolicyEngineUSDBTarget]: - strata_columns = self._table_columns("strata") - definition_hash_select = ( - "s.definition_hash AS definition_hash" - if "definition_hash" in strata_columns - else "NULL AS definition_hash" - ) - parent_stratum_id_select = ( - "s.parent_stratum_id AS parent_stratum_id" - if 
"parent_stratum_id" in strata_columns - else "NULL AS parent_stratum_id" - ) - clauses = ["t.reform_id = ?"] - params: list[Any] = [reform_id] - if active_only: - clauses.append("tv.active = 1") - if variables: - placeholders = ", ".join("?" for _ in variables) - clauses.append(f"tv.variable IN ({placeholders})") - params.extend(variables) - if target_ids: - placeholders = ", ".join("?" for _ in target_ids) - clauses.append(f"tv.target_id IN ({placeholders})") - params.extend(target_ids) - if stratum_ids: - placeholders = ", ".join("?" for _ in stratum_ids) - clauses.append(f"tv.stratum_id IN ({placeholders})") - params.extend(stratum_ids) - if geo_levels: - placeholders = ", ".join("?" for _ in geo_levels) - clauses.append(f"tv.geo_level IN ({placeholders})") - params.extend(geo_levels) - if domain_variables: - domain_clauses = [ - "instr(',' || coalesce(tv.domain_variable, '') || ',', ',' || ? || ',') > 0" - for _ in domain_variables - ] - clauses.append("(" + " OR ".join(domain_clauses) + ")") - params.extend(domain_variables) - if domain_variable_values: - placeholders = ", ".join("?" 
for _ in domain_variable_values) - clauses.append(f"coalesce(tv.domain_variable, '') IN ({placeholders})") - params.extend(domain_variable_values) - if domain_variable_is_null is True: - clauses.append("coalesce(tv.domain_variable, '') = ''") - elif domain_variable_is_null is False: - clauses.append("coalesce(tv.domain_variable, '') <> ''") - if target_cells: - clauses.append(self._build_target_cell_clause(target_cells, params)) - - time_period = period if period is not None else 9999 - where_clause = " AND ".join(clauses) if clauses else "1=1" - query = f""" - WITH filtered_targets AS ( - SELECT - tv.target_id, - tv.stratum_id, - tv.variable, - tv.value, - tv.period, - tv.active, - tv.geo_level, - tv.geographic_id, - tv.domain_variable - FROM target_overview tv - JOIN targets t ON tv.target_id = t.target_id - WHERE {where_clause} - ), - best_periods AS ( - SELECT - stratum_id, - variable, - CASE - WHEN MAX(CASE WHEN period <= ? THEN period END) IS NOT NULL - THEN MAX(CASE WHEN period <= ? 
THEN period END) - ELSE MIN(period) - END AS best_period - FROM filtered_targets - GROUP BY stratum_id, variable - ) - SELECT - ft.target_id, - ft.variable, - ft.period, - ft.stratum_id, - t.reform_id, - ft.value AS target_value, - t.active, - t.tolerance, - t.source, - t.notes, - ft.geo_level, - ft.geographic_id, - ft.domain_variable, - {definition_hash_select}, - {parent_stratum_id_select}, - sc.constraint_variable, - sc.operation, - sc.value AS constraint_value - FROM filtered_targets ft - JOIN best_periods bp - ON ft.stratum_id = bp.stratum_id - AND ft.variable = bp.variable - AND ft.period = bp.best_period - JOIN targets t - ON ft.target_id = t.target_id - JOIN strata s - ON ft.stratum_id = s.stratum_id - LEFT JOIN stratum_constraints sc - ON ft.stratum_id = sc.stratum_id - ORDER BY ft.target_id, sc.constraint_variable, sc.operation, sc.value - """ - - conn = sqlite3.connect(self.db_path) - conn.row_factory = sqlite3.Row - try: - rows = conn.execute(query, [*params, time_period, time_period]).fetchall() - finally: - conn.close() - - targets = self._group_target_rows(rows) - if self.validate: - self._validate_targets(targets) - return targets - - def _build_target_cell_clause( - self, - target_cells: list[dict[str, Any]], - params: list[Any], - ) -> str: - cell_clauses: list[str] = [] - for raw_cell in target_cells: - cell = PolicyEngineUSTargetCell( - variable=str(raw_cell["variable"]), - geo_level=( - None - if raw_cell.get("geo_level") is None - else str(raw_cell["geo_level"]) - ), - domain_variable=( - None - if "domain_variable" in raw_cell - and raw_cell.get("domain_variable") is None - else ( - str(raw_cell["domain_variable"]) - if raw_cell.get("domain_variable") is not None - else None - ) - ), - geographic_id=( - None - if raw_cell.get("geographic_id") is None - else str(raw_cell["geographic_id"]) - ), - ) - subclauses = ["tv.variable = ?"] - params.append(cell.variable) - if cell.geo_level is not None: - subclauses.append("tv.geo_level = ?") - 
params.append(cell.geo_level) - if "domain_variable" in raw_cell: - if cell.domain_variable is None: - subclauses.append("coalesce(tv.domain_variable, '') = ''") - else: - subclauses.append( - "instr(',' || coalesce(tv.domain_variable, '') || ',', ',' || ? || ',') > 0" - ) - params.append(cell.domain_variable) - if cell.geographic_id is not None: - subclauses.append("coalesce(tv.geographic_id, '') = ?") - params.append(cell.geographic_id) - cell_clauses.append("(" + " AND ".join(subclauses) + ")") - return "(" + " OR ".join(cell_clauses) + ")" - - def _group_target_rows( - self, - rows: list[sqlite3.Row], - ) -> list[PolicyEngineUSDBTarget]: - grouped: dict[int, dict[str, Any]] = {} - for row in rows: - target_id = int(row["target_id"]) - item = grouped.setdefault( - target_id, - { - "target_id": target_id, - "variable": row["variable"], - "period": int(row["period"]), - "stratum_id": int(row["stratum_id"]), - "reform_id": int(row["reform_id"]), - "value": float(row["target_value"]), - "active": bool(row["active"]), - "tolerance": ( - float(row["tolerance"]) - if row["tolerance"] is not None - else None - ), - "source": row["source"], - "notes": row["notes"], - "geo_level": row["geo_level"], - "geographic_id": row["geographic_id"], - "domain_variable": row["domain_variable"], - "definition_hash": row["definition_hash"], - "parent_stratum_id": ( - int(row["parent_stratum_id"]) - if row["parent_stratum_id"] is not None - else None - ), - "constraints": [], - }, - ) - if row["constraint_variable"] is not None: - item["constraints"].append( - PolicyEngineUSConstraint( - variable=row["constraint_variable"], - operation=row["operation"], - value=row["constraint_value"], - ) - ) - - return [ - PolicyEngineUSDBTarget( - target_id=item["target_id"], - variable=item["variable"], - period=item["period"], - stratum_id=item["stratum_id"], - reform_id=item["reform_id"], - value=item["value"], - active=item["active"], - tolerance=item["tolerance"], - source=item["source"], - 
notes=item["notes"], - geo_level=item["geo_level"], - geographic_id=item["geographic_id"], - domain_variable=item["domain_variable"], - definition_hash=item["definition_hash"], - parent_stratum_id=item["parent_stratum_id"], - constraints=tuple(item["constraints"]), - ) - for item in grouped.values() - ] - - def _build_strata_query( - self, - stratum_ids: set[int] | None, - *, - has_definition_hash: bool, - has_parent_stratum_id: bool, - ) -> tuple[str, list[Any]]: - clauses = [] - params: list[Any] = [] - if stratum_ids is not None: - if not stratum_ids: - return ( - """ - SELECT - s.stratum_id, - NULL AS definition_hash, - NULL AS parent_stratum_id, - NULL AS constraint_variable, - NULL AS operation, - NULL AS constraint_value - FROM strata AS s - WHERE 1 = 0 - """, - [], - ) - placeholders = ", ".join("?" for _ in sorted(stratum_ids)) - clauses.append(f"s.stratum_id IN ({placeholders})") - params.extend(sorted(stratum_ids)) - - where_clause = "WHERE " + " AND ".join(clauses) if clauses else "" - definition_hash_select = ( - "s.definition_hash AS definition_hash" - if has_definition_hash - else "NULL AS definition_hash" - ) - parent_stratum_id_select = ( - "s.parent_stratum_id AS parent_stratum_id" - if has_parent_stratum_id - else "NULL AS parent_stratum_id" - ) - query = f""" - SELECT - s.stratum_id, - {definition_hash_select}, - {parent_stratum_id_select}, - sc.constraint_variable, - sc.operation, - sc.value AS constraint_value - FROM strata AS s - LEFT JOIN stratum_constraints AS sc - ON s.stratum_id = sc.stratum_id - {where_clause} - ORDER BY s.stratum_id, sc.constraint_variable, sc.operation, sc.value - """ - return query, params - - def _group_stratum_rows( - self, - rows: list[sqlite3.Row], - ) -> dict[int, PolicyEngineUSStratum]: - grouped: dict[int, dict[str, Any]] = {} - for row in rows: - stratum_id = int(row["stratum_id"]) - item = grouped.setdefault( - stratum_id, - { - "stratum_id": stratum_id, - "definition_hash": row["definition_hash"], - 
"parent_stratum_id": ( - int(row["parent_stratum_id"]) - if row["parent_stratum_id"] is not None - else None - ), - "constraints": [], - }, - ) - if row["constraint_variable"] is not None: - item["constraints"].append( - PolicyEngineUSConstraint( - variable=row["constraint_variable"], - operation=row["operation"], - value=row["constraint_value"], - ) - ) - return { - stratum_id: PolicyEngineUSStratum( - stratum_id=stratum_id, - definition_hash=item["definition_hash"], - parent_stratum_id=item["parent_stratum_id"], - constraints=tuple(item["constraints"]), - ) - for stratum_id, item in grouped.items() - } - - def _table_columns(self, table_name: str) -> set[str]: - conn = sqlite3.connect(self.db_path) - try: - rows = conn.execute(f"PRAGMA table_info({table_name})").fetchall() - finally: - conn.close() - return {str(row[1]) for row in rows} - - def _validate_targets(self, targets: list[PolicyEngineUSDBTarget]) -> None: - if not targets: - return - strata = self.load_strata([target.stratum_id for target in targets]) - missing_strata = sorted( - {target.stratum_id for target in targets if target.stratum_id not in strata} - ) - if missing_strata: - raise PolicyEngineUSTargetValidationError( - f"Missing strata for target rows: {missing_strata}" - ) - self._validate_strata(strata) - - def _validate_strata( - self, - strata: dict[int, PolicyEngineUSStratum], - ) -> None: - errors: list[str] = [] - for stratum in strata.values(): - if stratum.definition_hash is not None: - expected_hash = compute_policyengine_us_definition_hash( - stratum.constraints, - parent_stratum_id=stratum.parent_stratum_id, - ) - if stratum.definition_hash != expected_hash: - errors.append( - "Stratum " - f"{stratum.stratum_id} has definition_hash " - f"{stratum.definition_hash!r}, expected {expected_hash!r}" - ) - - if stratum.parent_stratum_id is None: - continue - parent = strata.get(stratum.parent_stratum_id) - if parent is None: - errors.append( - f"Stratum {stratum.stratum_id} references 
missing parent " - f"{stratum.parent_stratum_id}" - ) - continue - - parent_vars = {constraint.variable for constraint in parent.constraints} - child_vars = {constraint.variable for constraint in stratum.constraints} - geographic_error = self._validate_geographic_consistency(parent, stratum) - if geographic_error is not None: - errors.append(geographic_error) - - if ( - parent_vars <= GEOGRAPHIC_CONSTRAINT_VARIABLES - and child_vars <= GEOGRAPHIC_CONSTRAINT_VARIABLES - ): - continue - - parent_constraints = { - self._constraint_signature(constraint) - for constraint in parent.constraints - } - child_constraints = { - self._constraint_signature(constraint) - for constraint in stratum.constraints - } - missing_parent_constraints = sorted(parent_constraints - child_constraints) - if missing_parent_constraints: - errors.append( - f"Stratum {stratum.stratum_id} is missing inherited parent " - f"constraints from {stratum.parent_stratum_id}: " - f"{missing_parent_constraints}" - ) - - if errors: - raise PolicyEngineUSTargetValidationError("\n".join(errors)) - - def _constraint_signature( - self, - constraint: PolicyEngineUSConstraint, - ) -> tuple[str, str, str]: - return ( - constraint.variable, - constraint.operation, - self._normalize_constraint_value(constraint.variable, constraint.value), - ) - - def _normalize_constraint_value(self, variable: str, value: str) -> str: - if variable in GEOGRAPHIC_CONSTRAINT_VARIABLES: - return str(int(value)) - return str(value) - - def _validate_geographic_consistency( - self, - parent: PolicyEngineUSStratum, - child: PolicyEngineUSStratum, - ) -> str | None: - parent_equalities = { - constraint.variable: constraint.value - for constraint in parent.constraints - if constraint.operation == "==" - } - child_equalities = { - constraint.variable: constraint.value - for constraint in child.constraints - if constraint.operation == "==" - } - - for variable in GEOGRAPHIC_CONSTRAINT_VARIABLES: - if variable not in parent_equalities or 
variable not in child_equalities: - continue - if int(parent_equalities[variable]) != int(child_equalities[variable]): - return ( - f"Stratum {child.stratum_id} has geographic constraint " - f"{variable}={child_equalities[variable]!r} but parent " - f"{parent.stratum_id} has {variable}={parent_equalities[variable]!r}" - ) - - if ( - "state_fips" in parent_equalities - and "congressional_district_geoid" in child_equalities - ): - parent_state = int(parent_equalities["state_fips"]) - district_state = ( - int(child_equalities["congressional_district_geoid"]) // 100 - ) - if district_state != parent_state: - return ( - f"Stratum {child.stratum_id} has congressional_district_geoid=" - f"{child_equalities['congressional_district_geoid']!r} which belongs " - f"to state {district_state}, not parent state {parent_state}" - ) - return None - - def to_quantity_targets( - self, - variable_column_map: dict[str, str], - period: int | None = None, - reform_id: int = 0, - ) -> tuple[PolicyEngineUSQuantityTarget, ...]: - """Convert unconstrained DB rows into quantity targets for calibration.""" - quantity_targets: list[PolicyEngineUSQuantityTarget] = [] - for target in self.load_targets(period=period, reform_id=reform_id): - if not target.is_unconstrained: - continue - column = variable_column_map.get(target.variable) - if column is None: - continue - quantity_targets.append( - PolicyEngineUSQuantityTarget( - name=target.variable, - variable=target.variable, - column=column, - period=target.period, - ) - ) - return tuple(quantity_targets) - - -def compile_policyengine_us_household_linear_constraints( - targets: ( - tuple[PolicyEngineUSDBTarget | TargetSpec, ...] 
- | list[PolicyEngineUSDBTarget | TargetSpec] - ), - tables: PolicyEngineUSEntityTableBundle, - *, - variable_bindings: dict[str, PolicyEngineUSVariableBinding] | None = None, - household_id_column: str = "household_id", -) -> tuple[LinearConstraint, ...]: - """Compile PE target rows into household-level linear calibration rows.""" - households = tables.households - if household_id_column not in households.columns: - raise ValueError( - f"Household table must contain '{household_id_column}' for calibration" - ) - if households[household_id_column].duplicated().any(): - raise ValueError("Household calibration table must have unique household ids") - - household_ids = pd.Index(households[household_id_column], name=household_id_column) - bindings = { - **DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS, - **(variable_bindings or {}), - } - - compiled: list[LinearConstraint] = [] - for target in targets: - coefficients = _compile_household_coefficients( - target=target, - tables=tables, - bindings=bindings, - household_ids=household_ids, - household_id_column=household_id_column, - ) - compiled.append( - LinearConstraint( - name=_policyengine_target_name(target), - coefficients=coefficients, - target=_target_value(target), - ) - ) - - return tuple(compiled) - - -@dataclass -class PolicyEngineUSSimulationTargetCompiler: - """Compile simulator-dependent PE-US targets into sparse calibration rows.""" - - period: int - dataset_year: int | None = None - simulation_cls: Any | None = None - microsimulation_kwargs: dict[str, Any] | None = None - temp_dir: str | Path | None = None - direct_override_variables: tuple[str, ...] = () - batch_size: int | None = None - force_materialize_variables: set[str] | tuple[str, ...] 
| None = None - modifier_handlers: Mapping[str, PolicyEngineUSSimulationModifierHandler] = field( - default_factory=dict - ) - - def compile_simulation_target_constraints( - self, - *, - targets: Sequence[TargetSpec], - entity_frames: Mapping[EntityType, pd.DataFrame], - entity_weight_indexes: Mapping[EntityType, pd.Series | np.ndarray], - ) -> TargetConstraintCompilationResult: - """Compile PE-materialized target rows required by Microplex core.""" - target_list = tuple(targets) - if not target_list: - return TargetConstraintCompilationResult(constraints=()) - - tables = _policyengine_us_tables_from_entity_frames(entity_frames) - household_weight_indexes = _policyengine_us_household_weight_indexes( - entity_weight_indexes, - household_count=len(tables.households), - ) - tables, supported_targets, skipped_targets = self._apply_modifiers( - tables=tables, - targets=target_list, - ) - if not supported_targets: - return TargetConstraintCompilationResult( - constraints=(), - skipped_targets=tuple(skipped_targets), - ) - - bindings = infer_policyengine_us_variable_bindings(tables) - force_materialize_variables = { - *set(self.force_materialize_variables or ()), - *_policyengine_us_forced_materialization_features(supported_targets), - } - variables_to_materialize = policyengine_us_variables_to_materialize( - list(supported_targets), - bindings, - force_materialize_variables=force_materialize_variables, - ) - materialization = materialize_policyengine_us_variables_safely( - tables, - variables=tuple(sorted(variables_to_materialize)), - period=self.period, - dataset_year=self.dataset_year, - simulation_cls=self.simulation_cls, - microsimulation_kwargs=self.microsimulation_kwargs, - temp_dir=self.temp_dir, - direct_override_variables=self.direct_override_variables, - batch_size=self.batch_size, - ) - materialized_tables = materialization.tables - materialized_bindings = { - **bindings, - **infer_policyengine_us_variable_bindings(materialized_tables), - 
**materialization.bindings, - } - - compilable_targets: list[TargetSpec] = [] - for target in supported_targets: - missing_features = [ - feature - for feature in target.required_features - if feature not in materialized_bindings - ] - failed_features = sorted( - set(target.required_features) & set(materialization.failed_variables) - ) - if failed_features: - skipped_targets.append( - ( - target.name, - "policyengine_us_materialization_failed:" - + ",".join(failed_features), - ) - ) - continue - if missing_features: - skipped_targets.append( - ( - target.name, - "missing_features_after_policyengine_us_materialization:" - + ",".join(sorted(missing_features)), - ) - ) - continue - compilable_targets.append(target) - - if not compilable_targets: - return TargetConstraintCompilationResult( - constraints=(), - skipped_targets=tuple(skipped_targets), - ) - - dense_constraints = compile_policyengine_us_household_linear_constraints( - compilable_targets, - materialized_tables, - variable_bindings=materialized_bindings, - ) - sparse_constraints = tuple( - _policyengine_us_linear_constraint_to_target_reweighting_constraint( - target=target, - constraint=constraint, - household_weight_indexes=household_weight_indexes, - ) - for target, constraint in zip( - compilable_targets, - dense_constraints, - strict=True, - ) - ) - return TargetConstraintCompilationResult( - constraints=sparse_constraints, - skipped_targets=tuple(skipped_targets), - ) - - def _apply_modifiers( - self, - *, - tables: PolicyEngineUSEntityTableBundle, - targets: Sequence[TargetSpec], - ) -> tuple[ - PolicyEngineUSEntityTableBundle, - tuple[TargetSpec, ...], - list[tuple[str, str]], - ]: - def rerandomize_takeup_handler( - tables: PolicyEngineUSEntityTableBundle, - *, - targets: Sequence[TargetSpec], - parameters: Sequence[Mapping[str, Any]], - ) -> PolicyEngineUSEntityTableBundle: - return _rerandomize_policyengine_us_takeup_sim_modifier( - tables, - targets=targets, - parameters=parameters, - 
year=self.dataset_year or self.period, - ) - - handlers = { - "policyengine_us_materialize": _identity_policyengine_us_sim_modifier, - "rerandomize_takeup": rerandomize_takeup_handler, - **dict(self.modifier_handlers), - } - supported_targets: list[TargetSpec] = [] - skipped_targets: list[tuple[str, str]] = [] - for target in targets: - missing_handlers = [ - modifier_name - for modifier_name in target.sim_modifier_names - if modifier_name not in handlers - ] - if missing_handlers: - skipped_targets.append( - ( - target.name, - "missing_policyengine_us_sim_modifier_handler:" - + ",".join(missing_handlers), - ) - ) - continue - supported_targets.append(target) - - working_tables = _copy_policyengine_us_entity_tables(tables) - modifier_names = tuple( - dict.fromkeys( - modifier_name - for target in supported_targets - for modifier_name in target.sim_modifier_names - ) - ) - for modifier_name in modifier_names: - relevant_targets = tuple( - target - for target in supported_targets - if modifier_name in target.sim_modifier_names - ) - if not relevant_targets: - continue - parameters = tuple( - modifier.parameters - for target in relevant_targets - for modifier in target.sim_modifiers - if modifier.name == modifier_name - ) - handler = handlers[modifier_name] - try: - working_tables = handler( - working_tables, - targets=relevant_targets, - parameters=parameters, - ) - except PolicyEngineUSSimulationModifierSkipError as exc: - skipped_targets.extend( - (target.name, str(exc)) for target in relevant_targets - ) - skipped_names = {target.name for target in relevant_targets} - supported_targets = [ - target - for target in supported_targets - if target.name not in skipped_names - ] - continue - if not isinstance(working_tables, PolicyEngineUSEntityTableBundle): - raise TypeError( - f"PolicyEngine US sim modifier '{modifier_name}' must return " - "PolicyEngineUSEntityTableBundle" - ) - return working_tables, tuple(supported_targets), skipped_targets - - -def 
_identity_policyengine_us_sim_modifier( - tables: PolicyEngineUSEntityTableBundle, - *, - targets: Sequence[TargetSpec], - parameters: Sequence[Mapping[str, Any]], -) -> PolicyEngineUSEntityTableBundle: - del targets, parameters - return tables - - -def _rerandomize_policyengine_us_takeup_sim_modifier( - tables: PolicyEngineUSEntityTableBundle, - *, - targets: Sequence[TargetSpec], - parameters: Sequence[Mapping[str, Any]], - year: int, -) -> PolicyEngineUSEntityTableBundle: - features = _policyengine_us_takeup_features_from_modifier_parameters( - targets=targets, - parameters=parameters, - ) - persons, tax_units, spm_units, unsupported = ( - rerandomize_policyengine_us_takeup_frames( - persons=tables.persons, - tax_units=tables.tax_units, - spm_units=tables.spm_units, - features=features, - year=year, - ) - ) - if unsupported: - raise PolicyEngineUSSimulationModifierSkipError( - "policyengine_us_rerandomize_takeup_unsupported_features:" - + ",".join(unsupported) - ) - return PolicyEngineUSEntityTableBundle( - households=tables.households, - persons=persons, - tax_units=tax_units, - spm_units=spm_units, - families=tables.families, - marital_units=tables.marital_units, - ) - - -def _policyengine_us_takeup_features_from_modifier_parameters( - *, - targets: Sequence[TargetSpec], - parameters: Sequence[Mapping[str, Any]], -) -> tuple[str, ...]: - features: list[str] = [] - for parameter in parameters: - raw_features = parameter.get("features") - if raw_features is None: - continue - if isinstance(raw_features, str): - features.append(raw_features) - else: - features.extend(str(feature) for feature in raw_features) - if features: - return tuple(dict.fromkeys(features)) - - inferred: list[str] = [] - for target in targets: - inferred.extend( - feature - for feature in target.required_features - if feature in POLICYENGINE_US_TAKEUP_INPUT_VARIABLES - ) - return tuple(dict.fromkeys(inferred)) - - -def _policyengine_us_forced_materialization_features( - targets: 
Sequence[TargetSpec], -) -> set[str]: - forced_features: set[str] = set() - for target in targets: - for modifier in target.sim_modifiers: - if modifier.name != "policyengine_us_materialize": - continue - raw_features = modifier.parameters.get("features") - if raw_features is None: - forced_features.update(target.required_features) - continue - if isinstance(raw_features, str): - forced_features.add(raw_features) - else: - forced_features.update(str(feature) for feature in raw_features) - return forced_features - - -def _policyengine_us_tables_from_entity_frames( - entity_frames: Mapping[EntityType, pd.DataFrame], -) -> PolicyEngineUSEntityTableBundle: - households = entity_frames.get(EntityType.HOUSEHOLD) - if households is None: - raise ValueError( - "PolicyEngineUSSimulationTargetCompiler requires a household table" - ) - return PolicyEngineUSEntityTableBundle( - households=households.copy(), - persons=_copy_optional_entity_frame(entity_frames, EntityType.PERSON), - tax_units=_copy_optional_entity_frame(entity_frames, EntityType.TAX_UNIT), - spm_units=_copy_optional_entity_frame(entity_frames, EntityType.SPM_UNIT), - families=_copy_optional_entity_frame(entity_frames, EntityType.FAMILY), - ) - - -def _copy_optional_entity_frame( - entity_frames: Mapping[EntityType, pd.DataFrame], - entity: EntityType, -) -> pd.DataFrame | None: - frame = entity_frames.get(entity) - return None if frame is None else frame.copy() - - -def _policyengine_us_household_weight_indexes( - entity_weight_indexes: Mapping[EntityType, pd.Series | np.ndarray], - *, - household_count: int, -) -> np.ndarray: - raw_indexes = entity_weight_indexes.get(EntityType.HOUSEHOLD) - if raw_indexes is None: - raise ValueError( - "PolicyEngineUSSimulationTargetCompiler requires household weight indexes" - ) - indexes = np.asarray(raw_indexes, dtype=int) - if indexes.ndim != 1 or len(indexes) != household_count: - raise ValueError( - "Household weight indexes must be one-dimensional and aligned to " - 
f"{household_count} households" - ) - return indexes - - -def _policyengine_us_linear_constraint_to_target_reweighting_constraint( - *, - target: TargetSpec, - constraint: LinearConstraint, - household_weight_indexes: np.ndarray, -) -> TargetReweightingConstraint: - coefficients = np.asarray(constraint.coefficients, dtype=float) - if coefficients.ndim != 1 or len(coefficients) != len(household_weight_indexes): - raise ValueError( - f"Compiled constraint '{constraint.name}' has {len(coefficients)} " - f"coefficients for {len(household_weight_indexes)} household weights" - ) - if not np.isfinite(coefficients).all(): - raise ValueError( - f"Compiled constraint '{constraint.name}' has nonfinite values" - ) - - active = coefficients != 0.0 - active_indexes = household_weight_indexes[active] - active_coefficients = coefficients[active] - if len(active_indexes): - grouped = ( - pd.Series(active_coefficients, index=active_indexes, dtype=float) - .groupby(level=0) - .sum() - ) - active_indexes = grouped.index.to_numpy(dtype=int) - active_coefficients = grouped.to_numpy(dtype=float) - - metadata = { - **dict(target.metadata), - "compiled_by": "policyengine_us_simulation_target_compiler", - "sim_modifier_names": target.sim_modifier_names, - } - return TargetReweightingConstraint( - name=target.name, - entity=target.entity, - weight_indexes=active_indexes, - coefficients=active_coefficients, - target=constraint.target, - metadata=metadata, - ) - - -class PolicyEngineUSMicrosimulationAdapter: - """Thin wrapper around a PolicyEngine US microsimulation instance.""" - - def __init__(self, simulation: Any): - self.simulation = simulation - - @property - def tax_benefit_system(self) -> Any: - """Return the underlying PolicyEngine tax-benefit system.""" - tax_benefit_system = getattr( - self.simulation, - "tax_benefit_system", - getattr(self.simulation, "system", None), - ) - return getattr(tax_benefit_system, "system", tax_benefit_system) - - @classmethod - def from_dataset( - cls, 
- dataset: str | Path | Any, - *, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - **kwargs: Any, - ) -> PolicyEngineUSMicrosimulationAdapter: - """Construct an adapter from a dataset path or PolicyEngine dataset class.""" - if simulation_cls is None: - try: - import policyengine_us - except ImportError as exc: - raise ImportError( - "policyengine_us is required to build a microsimulation adapter" - ) from exc - simulation_cls = policyengine_us.Microsimulation - - sim_kwargs = dict(kwargs) - sim_kwargs["dataset"] = str(dataset) if isinstance(dataset, Path) else dataset - if dataset_year is not None: - sim_kwargs["dataset_year"] = dataset_year - try: - simulation = simulation_cls(**sim_kwargs) - except TypeError as exc: - if dataset_year is None or "dataset_year" not in str(exc): - raise - sim_kwargs.pop("dataset_year", None) - simulation = simulation_cls(**sim_kwargs) - return cls(simulation=simulation) - - def calculate( - self, - variable: str, - *, - period: int | None = None, - map_to: str | None = None, - ) -> Any: - """Calculate a PolicyEngine variable.""" - if map_to is None: - return self.simulation.calculate(variable, period) - return self.simulation.calculate(variable, period, map_to=map_to) - - def variable_entity(self, variable: str) -> EntityType: - """Resolve the Microplex entity type for a PolicyEngine variable.""" - return _resolve_policyengine_variable_entity( - variable, - tax_benefit_system=self.tax_benefit_system, - ) - - def compute_targets( - self, - quantity_targets: tuple[PolicyEngineUSQuantityTarget, ...], - ) -> dict[str, float]: - """Compute aggregate targets from the configured PE quantity specs.""" - results: dict[str, float] = {} - for target in quantity_targets: - values = self.calculate( - target.variable, - period=target.period, - map_to=target.map_to, - ) - results[target.name] = self._aggregate(values, target.aggregation) - return results - - def _aggregate( - self, - values: Any, - aggregation: 
Literal["sum", "mean", "count_positive"], - ) -> float: - if aggregation == "sum": - if hasattr(values, "sum"): - return float(values.sum()) - return float(np.asarray(values).sum()) - if aggregation == "mean": - if hasattr(values, "mean"): - return float(values.mean()) - return float(np.asarray(values).mean()) - if aggregation == "count_positive": - positive = values > 0 - if hasattr(positive, "sum"): - return float(positive.sum()) - return float(np.asarray(positive).sum()) - raise ValueError(f"Unsupported aggregation: {aggregation}") - - -def _policyengine_us_variable_is_computed_export(metadata: Any) -> bool: - """Return whether PE-US should compute this variable in final datasets.""" - return ( - bool(getattr(metadata, "formulas", {}) or {}) - or bool(getattr(metadata, "adds", None)) - or bool(getattr(metadata, "subtracts", None)) - ) - - -def detect_policyengine_computed_export_variables( - tax_benefit_system: Any, - input_variables: list[str] | tuple[str, ...], -) -> set[str]: - """Detect exported variables computed by PolicyEngine-US.""" - computed_exports: set[str] = set() - variables = getattr(tax_benefit_system, "variables", {}) - - for variable_name in input_variables: - if variable_name in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES: - continue - variable = variables.get(variable_name) - if variable is None: - continue - if _policyengine_us_variable_is_computed_export(variable): - computed_exports.add(variable_name) - - return computed_exports - - -def detect_policyengine_pseudo_inputs( - tax_benefit_system: Any, - input_variables: list[str] | tuple[str, ...], -) -> set[str]: - """Detect PE-computed variables that must not be persisted as inputs.""" - return detect_policyengine_computed_export_variables( - tax_benefit_system, - input_variables, - ) - - -@lru_cache(maxsize=1) -def _contract_forbidden_export_columns() -> frozenset[str]: - """Columns the eCPS export contract forbids (single source of truth). 
- - The transient ``*_reported`` takeup-input columns and the PUF - reported/calculated tax-credit outputs that the enhanced-CPS baseline - deliberately drops from its export. Read from the frozen contract so the - forbidden set never drifts from ``check_export_columns`` and the artifact - gate. - """ - contract_path = ( - Path(__file__).resolve().parents[1] / "pipelines" / "ecps_export_contract.json" - ) - payload = json.loads(contract_path.read_text()) - return frozenset(str(name) for name in payload.get("forbidden", ())) - - -def resolve_policyengine_excluded_export_variables( - tax_benefit_system: Any, - exported_inputs: list[str] | tuple[str, ...], - *, - direct_override_variables: tuple[str, ...] = (), -) -> set[str]: - """Resolve variables to exclude from final H5 datasets. - - Excludes both PolicyEngine-computed exports (formula/derived variables - that must not be persisted as inputs) and the eCPS-contract *forbidden* - columns (transient ``*_reported`` takeup inputs and PUF reported/calculated - tax-credit outputs) when they appear in the exported set. - """ - excluded = detect_policyengine_computed_export_variables( - tax_benefit_system, - exported_inputs, - ) - excluded |= _contract_forbidden_export_columns() & set(exported_inputs) - return excluded - - -def subset_policyengine_tables_by_households( - tables: PolicyEngineUSEntityTableBundle, - household_ids: np.ndarray | pd.Index, -) -> PolicyEngineUSEntityTableBundle: - """Slice an entity bundle to a subset of household_ids, preserving order. - - The returned bundle's ``households`` frame is reordered to match the - order of ``household_ids``; related entity tables retain their own - internal order but are filtered to only rows whose ``household_id`` - is in the selection. 
- """ - selected = pd.Index(household_ids, name="household_id") - order = pd.Series(np.arange(len(selected)), index=selected) - - households = tables.households.loc[ - tables.households["household_id"].isin(selected) - ].copy() - households = ( - households.assign(_hh_order=households["household_id"].map(order)) - .sort_values("_hh_order") - .drop(columns="_hh_order") - .reset_index(drop=True) - ) - - def _slice(df: pd.DataFrame | None) -> pd.DataFrame | None: - if df is None: - return None - return df.loc[df["household_id"].isin(selected)].reset_index(drop=True) - - return PolicyEngineUSEntityTableBundle( - households=households, - persons=_slice(tables.persons), - tax_units=_slice(tables.tax_units), - spm_units=_slice(tables.spm_units), - families=_slice(tables.families), - marital_units=_slice(tables.marital_units), - ) - - -def _concat_bundles( - bundles: list[PolicyEngineUSEntityTableBundle], -) -> PolicyEngineUSEntityTableBundle: - """Concatenate a list of entity bundles into one, preserving order.""" - - def _join(field: str) -> pd.DataFrame | None: - frames = [getattr(b, field) for b in bundles if getattr(b, field) is not None] - if not frames: - return None - return pd.concat(frames, ignore_index=True) - - return PolicyEngineUSEntityTableBundle( - households=_join("households"), - persons=_join("persons"), - tax_units=_join("tax_units"), - spm_units=_join("spm_units"), - families=_join("families"), - marital_units=_join("marital_units"), - ) - - -def materialize_policyengine_us_variables( - tables: PolicyEngineUSEntityTableBundle, - *, - variables: tuple[str, ...] | list[str], - period: int, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - microsimulation_kwargs: dict[str, Any] | None = None, - temp_dir: str | Path | None = None, - direct_override_variables: tuple[str, ...] 
= (), - batch_size: int | None = None, -) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, PolicyEngineUSVariableBinding]]: - """Calculate PolicyEngine variables on a temporary export and attach them to tables. - - Memory control: when ``batch_size`` is set, the function loops over - disjoint household chunks of that size, materializing variables on - each chunk (one temp h5 + one Microsimulation per chunk) and - concatenating results. Peak Microsimulation working set drops from - O(n_households) to O(batch_size) with no change in output — this is - additive for the per-household scalar variables we use as calibration - targets (employment income, EITC, CTC, federal income tax, etc.), and - the per-chunk Microsims are independent of each other. - - Variables with cross-household semantics (national quantile - thresholds, poverty rates that depend on the full income - distribution) would be incorrect under batching and are not supported - when ``batch_size`` is not ``None``. Use ``batch_size=None`` for - those. 
- """ - if batch_size is not None and batch_size > 0: - n_households = len(tables.households) - if n_households > batch_size: - chunk_bundles: list[PolicyEngineUSEntityTableBundle] = [] - chunk_bindings: dict[str, PolicyEngineUSVariableBinding] = {} - household_ids = tables.households["household_id"].to_numpy() - for start in range(0, n_households, batch_size): - end = min(start + batch_size, n_households) - chunk_ids = household_ids[start:end] - chunk_tables = subset_policyengine_tables_by_households( - tables, chunk_ids - ) - chunk_result, chunk_binding = materialize_policyengine_us_variables( - chunk_tables, - variables=variables, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - microsimulation_kwargs=microsimulation_kwargs, - temp_dir=temp_dir, - direct_override_variables=direct_override_variables, - batch_size=None, - ) - chunk_bundles.append(chunk_result) - chunk_bindings.update(chunk_binding) - return _concat_bundles(chunk_bundles), chunk_bindings - requested_variables = tuple(dict.fromkeys(str(variable) for variable in variables)) - if not requested_variables: - return tables, {} - - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system( - simulation_cls=simulation_cls - ) - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=tax_benefit_system, - direct_override_variables=direct_override_variables, - ) - exported_inputs = sorted( - { - target - for variable_map in export_maps.values() - for target in variable_map.values() - } - ) - excluded_variables = resolve_policyengine_excluded_export_variables( - tax_benefit_system, - exported_inputs, - direct_override_variables=direct_override_variables, - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=period, - household_variable_map=export_maps["household"], - person_variable_map=export_maps["person"], - tax_unit_variable_map=export_maps["tax_unit"], - spm_unit_variable_map=export_maps["spm_unit"], - 
family_variable_map=export_maps["family"], - ) - - temp_parent = Path(temp_dir) if temp_dir is not None else None - with TemporaryDirectory(dir=temp_parent) as directory: - dataset_path = Path(directory) / "policyengine_us_materialize.h5" - write_policyengine_us_time_period_dataset( - arrays, - dataset_path, - excluded_variables=excluded_variables, - tax_benefit_system=tax_benefit_system, - ) - adapter = PolicyEngineUSMicrosimulationAdapter.from_dataset( - dataset_path, - dataset_year=dataset_year or period, - simulation_cls=simulation_cls, - **(microsimulation_kwargs or {}), - ) - return _attach_policyengine_variables_to_tables( - tables, - variables=requested_variables, - period=period, - adapter=adapter, - ) - - -def materialize_policyengine_us_variables_safely( - tables: PolicyEngineUSEntityTableBundle, - *, - variables: tuple[str, ...] | list[str], - period: int, - dataset_year: int | None = None, - simulation_cls: Any | None = None, - microsimulation_kwargs: dict[str, Any] | None = None, - temp_dir: str | Path | None = None, - direct_override_variables: tuple[str, ...] = (), - batch_size: int | None = None, -) -> PolicyEngineUSVariableMaterializationResult: - """Materialize PE variables, degrading to per-variable failures when needed. - - ``batch_size`` forwards to :func:`materialize_policyengine_us_variables`. - With a non-``None`` positive value, the full-dataset Microsimulation - (25–35 GB peak at 1.5M households) is replaced with N per-chunk - Microsims (each ~2–3 GB). Results are concatenated; output is - identical for per-household scalar variables. 
- """ - requested_variables = tuple(dict.fromkeys(str(variable) for variable in variables)) - if not requested_variables: - return PolicyEngineUSVariableMaterializationResult( - tables=tables, - bindings={}, - ) - - try: - materialized_tables, materialized_bindings = ( - materialize_policyengine_us_variables( - tables, - variables=requested_variables, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - microsimulation_kwargs=microsimulation_kwargs, - temp_dir=temp_dir, - direct_override_variables=direct_override_variables, - batch_size=batch_size, - ) - ) - except Exception: - return _materialize_policyengine_us_variables_one_by_one( - tables, - requested_variables, - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - microsimulation_kwargs=microsimulation_kwargs, - temp_dir=temp_dir, - direct_override_variables=direct_override_variables, - ) - - return PolicyEngineUSVariableMaterializationResult( - tables=materialized_tables, - bindings=materialized_bindings, - materialized_variables=requested_variables, - ) - - -def _materialize_policyengine_us_variables_one_by_one( - tables: PolicyEngineUSEntityTableBundle, - requested_variables: tuple[str, ...], - *, - period: int, - dataset_year: int | None, - simulation_cls: Any | None, - microsimulation_kwargs: dict[str, Any] | None, - temp_dir: str | Path | None, - direct_override_variables: tuple[str, ...], -) -> PolicyEngineUSVariableMaterializationResult: - working_tables = _copy_policyengine_us_entity_tables(tables) - bindings: dict[str, PolicyEngineUSVariableBinding] = {} - materialized_variables: list[str] = [] - failed_variables: dict[str, str] = {} - - for variable in requested_variables: - try: - materialized_tables, materialized_bindings = ( - materialize_policyengine_us_variables( - working_tables, - variables=(variable,), - period=period, - dataset_year=dataset_year, - simulation_cls=simulation_cls, - microsimulation_kwargs=microsimulation_kwargs, - 
temp_dir=temp_dir, - direct_override_variables=direct_override_variables, - ) - ) - except Exception as exc: - failed_variables[variable] = f"{type(exc).__name__}: {exc}" - continue - working_tables = _merge_materialized_policyengine_bindings( - working_tables, - source_tables=materialized_tables, - bindings=materialized_bindings, - ) - bindings.update(materialized_bindings) - materialized_variables.append(variable) - - return PolicyEngineUSVariableMaterializationResult( - tables=working_tables, - bindings=bindings, - materialized_variables=tuple(materialized_variables), - failed_variables=failed_variables, - ) - - -def _copy_policyengine_us_entity_tables( - tables: PolicyEngineUSEntityTableBundle, -) -> PolicyEngineUSEntityTableBundle: - return PolicyEngineUSEntityTableBundle( - households=tables.households.copy(), - persons=tables.persons.copy() if tables.persons is not None else None, - tax_units=tables.tax_units.copy() if tables.tax_units is not None else None, - spm_units=tables.spm_units.copy() if tables.spm_units is not None else None, - families=tables.families.copy() if tables.families is not None else None, - marital_units=( - tables.marital_units.copy() if tables.marital_units is not None else None - ), - ) - - -def _merge_materialized_policyengine_bindings( - destination_tables: PolicyEngineUSEntityTableBundle, - *, - source_tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], -) -> PolicyEngineUSEntityTableBundle: - merged_tables = _copy_policyengine_us_entity_tables(destination_tables) - for binding in bindings.values(): - if binding.column is None: - continue - source_table = source_tables.table_for(binding.entity) - destination_table = merged_tables.table_for(binding.entity) - destination_table[binding.column] = source_table[binding.column].to_numpy( - copy=True - ) - return merged_tables - - -def load_policyengine_us_entity_tables( - dataset: str | Path | Any, - *, - period: int | str, - variables: 
tuple[str, ...] | list[str] | None = None, -) -> PolicyEngineUSEntityTableBundle: - """Load a PE-US time-period dataset into a multientity table bundle.""" - period_key = str(period) - requested_variables = ( - None if variables is None else {str(variable) for variable in variables} - ) - try: - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system( - simulation_cls=None - ) - except (ImportError, ValueError): - tax_benefit_system = None - arrays = _load_policyengine_us_period_arrays( - dataset, - period_key=period_key, - variables=requested_variables, - ) - - required_structural = { - "household_id", - "person_id", - "person_household_id", - } - missing = sorted(required_structural - set(arrays)) - if missing: - raise ValueError( - "PolicyEngine US dataset is missing required structural arrays: " - + ", ".join(missing) - ) - - households = pd.DataFrame( - {"household_id": _normalize_id_value(arrays["household_id"])} - ) - household_weight = arrays.get("household_weight") - households["household_weight"] = ( - _normalize_weight_value(household_weight) - if household_weight is not None - else np.ones(len(households), dtype=float) - ) - - persons = pd.DataFrame( - { - "person_id": _normalize_id_value(arrays["person_id"]), - "household_id": _normalize_id_value(arrays["person_household_id"]), - } - ) - if "person_weight" in arrays: - persons["weight"] = _normalize_weight_value(arrays["person_weight"]) - - group_specs = ( - ("tax_unit", "tax_unit_id", "person_tax_unit_id"), - ("spm_unit", "spm_unit_id", "person_spm_unit_id"), - ("family", "family_id", "person_family_id"), - ("marital_unit", "marital_unit_id", "person_marital_unit_id"), - ) - group_tables: dict[str, pd.DataFrame | None] = {} - entity_lengths = { - EntityType.HOUSEHOLD: len(households), - EntityType.PERSON: len(persons), - } - excluded_variable_names = { - "household_id", - "household_weight", - "person_id", - "person_household_id", - "person_weight", - } - for group_name, id_column, 
membership_column in group_specs: - group_ids = arrays.get(id_column) - membership = arrays.get(membership_column) - if membership is not None: - persons[id_column] = _normalize_id_value(membership) - if group_ids is None: - group_tables[group_name] = None - continue - group_table = pd.DataFrame({id_column: _normalize_id_value(group_ids)}) - if membership is not None: - group_table["household_id"] = group_table[id_column].map( - _build_group_household_map( - group_name=group_name, - group_ids=pd.Series(_normalize_id_value(membership)), - household_ids=persons["household_id"], - ) - ) - group_tables[group_name] = group_table - entity_type = _policyengine_group_entity_type(group_name) - if entity_type is not None: - entity_lengths[entity_type] = len(group_table) - excluded_variable_names.add(id_column) - excluded_variable_names.add(membership_column) - - group_entity_to_table = { - EntityType.TAX_UNIT: group_tables["tax_unit"], - EntityType.SPM_UNIT: group_tables["spm_unit"], - EntityType.FAMILY: group_tables["family"], - } - for variable_name, values in arrays.items(): - if variable_name in excluded_variable_names: - continue - decoded = _decode_policyengine_array(values) - prefixed_table = _resolve_prefixed_policyengine_table( - variable_name=variable_name, - households=households, - persons=persons, - group_tables=group_tables, - ) - if prefixed_table is not None: - prefixed_table[variable_name] = decoded - continue - try: - entity = _infer_policyengine_array_entity( - variable_name=variable_name, - values=values, - entity_lengths=entity_lengths, - tax_benefit_system=tax_benefit_system, - ) - except ValueError: - if requested_variables is None: - continue - raise - if entity is EntityType.HOUSEHOLD: - households[variable_name] = decoded - continue - if entity is EntityType.PERSON: - persons[variable_name] = decoded - continue - group_table = group_entity_to_table.get(entity) - if group_table is None: - raise ValueError( - f"Loaded variable '{variable_name}' for 
entity '{entity.value}' " - "but no structural table exists for that entity" - ) - group_table[variable_name] = decoded - - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=group_tables["tax_unit"], - spm_units=group_tables["spm_unit"], - families=group_tables["family"], - marital_units=group_tables["marital_unit"], - ) - - -def infer_policyengine_us_variable_bindings( - tables: PolicyEngineUSEntityTableBundle, -) -> dict[str, PolicyEngineUSVariableBinding]: - """Infer variable bindings from currently materialized PE-style tables.""" - bindings: dict[str, PolicyEngineUSVariableBinding] = {} - table_specs = ( - ( - tables.households, - EntityType.HOUSEHOLD, - "household_id", - {"household_id", "household_weight", "weight"}, - ), - ( - tables.persons, - EntityType.PERSON, - "household_id", - {"person_id", "household_id", "weight"}, - ), - ( - tables.tax_units, - EntityType.TAX_UNIT, - "household_id", - {"tax_unit_id", "household_id"}, - ), - ( - tables.spm_units, - EntityType.SPM_UNIT, - "household_id", - {"spm_unit_id", "household_id"}, - ), - ( - tables.families, - EntityType.FAMILY, - "household_id", - {"family_id", "household_id"}, - ), - ) - for table, entity, household_id_column, excluded_columns in table_specs: - if table is None: - continue - for column in table.columns: - if column in excluded_columns or column.endswith("_id"): - continue - bindings.setdefault( - column, - PolicyEngineUSVariableBinding( - entity=entity, - column=column, - household_id_column=household_id_column, - ), - ) - return bindings - - -def filter_supported_policyengine_us_targets( - targets: list[TargetSpec], - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], -) -> list[TargetSpec]: - """Return the targets that can be evaluated with current tables/bindings.""" - supported: list[TargetSpec] = [] - for target in targets: - if not _has_policyengine_entity_table(target.entity, tables): - continue 
- if any(feature not in bindings for feature in target.required_features): - continue - supported.append(target) - return supported - - -def is_unsupported_policyengine_us_target_error(error: ValueError) -> bool: - """Return whether a target-compilation failure indicates unsupported structure.""" - message = str(error) - return ( - "Cross-entity constraints are only supported against household targets " - "or household metadata" in message - ) - - -def compile_supported_policyengine_us_household_linear_constraints( - targets: list[TargetSpec], - tables: PolicyEngineUSEntityTableBundle, - *, - variable_bindings: dict[str, PolicyEngineUSVariableBinding], - household_id_column: str = "household_id", -) -> tuple[list[TargetSpec], list[TargetSpec], tuple[LinearConstraint, ...]]: - """Compile the subset of targets that the current household compiler can handle.""" - filtered_targets = filter_supported_policyengine_us_targets( - targets, - tables, - variable_bindings, - ) - if not filtered_targets: - return [], [], () - - try: - batched_constraints = compile_policyengine_us_household_linear_constraints( - filtered_targets, - tables, - variable_bindings=variable_bindings, - household_id_column=household_id_column, - ) - except ValueError as error: - if not is_unsupported_policyengine_us_target_error(error): - raise - else: - return filtered_targets, [], batched_constraints - - supported_targets: list[TargetSpec] = [] - unsupported_targets: list[TargetSpec] = [] - constraints: list[LinearConstraint] = [] - for target in filtered_targets: - try: - constraint = compile_policyengine_us_household_linear_constraints( - [target], - tables, - variable_bindings=variable_bindings, - household_id_column=household_id_column, - )[0] - except ValueError as error: - if is_unsupported_policyengine_us_target_error(error): - unsupported_targets.append(target) - continue - raise - supported_targets.append(target) - constraints.append(constraint) - return supported_targets, 
unsupported_targets, tuple(constraints) - - -def _policyengine_us_target_required_variables(targets: list[TargetSpec]) -> set[str]: - return {feature for target in targets for feature in target.required_features} - - -def policyengine_us_formula_variables_for_targets( - targets: list[TargetSpec], - *, - simulation_cls: Any | None = None, - tax_benefit_system: Any | None = None, - direct_override_variables: tuple[str, ...] = (), -) -> set[str]: - """Return target features that should be recalculated by PolicyEngine.""" - required_variables = _policyengine_us_target_required_variables(targets) - if not required_variables: - return set() - if tax_benefit_system is None: - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system(simulation_cls) - variables = getattr(tax_benefit_system, "variables", {}) - direct_overrides = set(direct_override_variables) - formula_variables: set[str] = set() - for variable in required_variables: - if variable in direct_overrides: - continue - variable_metadata = variables.get(variable) - if variable_metadata is None: - continue - if _policyengine_us_variable_is_calculated(variable_metadata): - formula_variables.add(variable) - return formula_variables - - -def _policyengine_us_variable_is_calculated(variable_metadata: Any) -> bool: - if getattr(variable_metadata, "formulas", {}): - return True - if getattr(variable_metadata, "adds", ()) or getattr( - variable_metadata, "subtracts", () - ): - return True - is_input_variable = getattr(variable_metadata, "is_input_variable", None) - if callable(is_input_variable): - try: - return not bool(is_input_variable()) - except TypeError: - return False - return False - - -def policyengine_us_variables_to_materialize( - targets: list[TargetSpec], - bindings: dict[str, PolicyEngineUSVariableBinding], - *, - force_materialize_variables: set[str] | tuple[str, ...] 
| None = None, -) -> set[str]: - """Compute the missing features required to score the given targets.""" - requested_variables = _policyengine_us_target_required_variables(targets) - force_variables = set(force_materialize_variables or ()) - return { - variable - for variable in requested_variables - if variable not in bindings or variable in force_variables - } - - -def _load_policyengine_us_period_arrays( - dataset: str | Path | Any, - *, - period_key: str, - variables: set[str] | None, -) -> dict[str, np.ndarray]: - source = _resolve_policyengine_us_dataset_source(dataset) - structural_variables = { - "household_id", - "household_weight", - "person_id", - "person_household_id", - "person_weight", - "tax_unit_id", - "person_tax_unit_id", - "spm_unit_id", - "person_spm_unit_id", - "family_id", - "person_family_id", - "marital_unit_id", - "person_marital_unit_id", - } - if isinstance(source, Path): - if not source.exists(): - raise FileNotFoundError(f"PolicyEngine dataset not found: {source}") - with h5py.File(source, "r") as handle: - requested = ( - set(handle.keys()) - if variables is None - else structural_variables | variables - ) - return { - variable: np.asarray(handle[variable][period_key]) - for variable in requested - if variable in handle and period_key in handle[variable] - } - - loaded = source.load_dataset() - requested = set(loaded) if variables is None else structural_variables | variables - arrays: dict[str, np.ndarray] = {} - for variable in requested: - variable_periods = loaded.get(variable) - if variable_periods is None: - continue - value = variable_periods.get(period_key) - if value is None: - value = variable_periods.get(int(period_key)) - if value is None: - continue - arrays[variable] = np.asarray(value) - return arrays - - -def _resolve_policyengine_us_dataset_source(dataset: str | Path | Any) -> Path | Any: - if isinstance(dataset, (str, Path)): - return Path(dataset) - - file_path = getattr(dataset, "file_path", None) - if file_path is 
not None: - return Path(file_path) - - if hasattr(dataset, "load_dataset"): - return dataset - - raise TypeError( - "dataset must be a path, a dataset-like object with file_path, " - "or an object exposing load_dataset()" - ) - - -def _infer_policyengine_array_entity( - *, - variable_name: str, - values: np.ndarray, - entity_lengths: dict[EntityType, int], - tax_benefit_system: Any | None, -) -> EntityType: - legacy_entity_key = POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES.get( - variable_name - ) - if legacy_entity_key in POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE: - return POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE[legacy_entity_key] - if tax_benefit_system is not None: - try: - return _resolve_policyengine_variable_entity( - variable_name, - tax_benefit_system=tax_benefit_system, - ) - except (KeyError, ValueError): - pass - matching_entities = [ - entity for entity, length in entity_lengths.items() if len(values) == length - ] - if len(matching_entities) == 1: - return matching_entities[0] - if not matching_entities: - raise ValueError( - f"Cannot infer PolicyEngine entity for variable '{variable_name}' " - f"with length {len(values)}" - ) - raise ValueError( - f"Ambiguous PolicyEngine entity for variable '{variable_name}' " - f"with length {len(values)}: {[entity.value for entity in matching_entities]}" - ) - - -def _decode_policyengine_array(values: np.ndarray) -> np.ndarray: - if values.dtype.kind != "S": - return values - return values.astype(str) - - -def _resolve_prefixed_policyengine_table( - *, - variable_name: str, - households: pd.DataFrame, - persons: pd.DataFrame, - group_tables: dict[str, pd.DataFrame | None], -) -> pd.DataFrame | None: - if variable_name.startswith("household_"): - return households - if variable_name.startswith("person_"): - return persons - for prefix, group_name in ( - ("tax_unit_", "tax_unit"), - ("spm_unit_", "spm_unit"), - ("family_", "family"), - ("marital_unit_", "marital_unit"), - ): - if 
variable_name.startswith(prefix): - return group_tables.get(group_name) - return None - - -def _policyengine_group_entity_type(group_name: str) -> EntityType | None: - mapping = { - "tax_unit": EntityType.TAX_UNIT, - "spm_unit": EntityType.SPM_UNIT, - "family": EntityType.FAMILY, - } - return mapping.get(group_name) - - -def _has_policyengine_entity_table( - entity: EntityType, - tables: PolicyEngineUSEntityTableBundle, -) -> bool: - entity_tables = { - EntityType.HOUSEHOLD: tables.households, - EntityType.PERSON: tables.persons, - EntityType.TAX_UNIT: tables.tax_units, - EntityType.SPM_UNIT: tables.spm_units, - EntityType.FAMILY: tables.families, - } - return entity_tables.get(entity) is not None - - -def _compile_household_coefficients( - *, - target: PolicyEngineUSDBTarget | TargetSpec, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - household_ids: pd.Index, - household_id_column: str, -) -> np.ndarray: - target_binding = _resolve_target_binding(target, bindings, tables) - target_table = tables.table_for(target_binding.entity) - target_household_ids = _household_ids_for_entity_table( - target_table, - target_binding, - household_id_column, - ) - - if _target_aggregation(target) is TargetAggregation.COUNT: - mask = pd.Series(True, index=target_table.index, dtype=bool) - for constraint in _target_constraints(target): - mask &= _evaluate_constraint_mask( - target_rows=target_table, - target_binding=target_binding, - target_household_ids=target_household_ids, - constraint=constraint, - tables=tables, - bindings=bindings, - household_id_column=household_id_column, - ) - values = np.ones(len(target_table), dtype=float) - contributions = pd.Series( - np.where(mask.to_numpy(), values, 0.0), - index=target_household_ids.to_numpy(), - dtype=float, - ) - grouped = contributions.groupby(level=0).sum() - return grouped.reindex(household_ids, fill_value=0.0).to_numpy(dtype=float) - - target_measure = _target_measure(target) - 
if target_binding.column is None or target_measure is None: - raise ValueError( - f"Target '{_policyengine_target_name(target)}' has no source column" - ) - - target_values = pd.to_numeric( - target_table[target_binding.column], errors="coerce" - ).fillna(0.0) - row_mask = pd.Series(True, index=target_table.index, dtype=bool) - household_constraints: list[PolicyEngineUSConstraint | TargetFilter] = [] - for constraint in _target_constraints(target): - constraint_binding = _resolve_binding( - _constraint_feature(constraint), - bindings, - tables, - ) - if constraint_binding.entity in { - target_binding.entity, - EntityType.HOUSEHOLD, - } or _can_align_constraint_to_target_rows( - target_rows=target_table, - constraint_binding=constraint_binding, - ): - row_mask &= _evaluate_constraint_mask( - target_rows=target_table, - target_binding=target_binding, - target_household_ids=target_household_ids, - constraint=constraint, - tables=tables, - bindings=bindings, - household_id_column=household_id_column, - ) - else: - household_constraints.append(constraint) - - household_totals = ( - target_values.where(row_mask, 0.0).groupby(target_household_ids).sum() - ) - household_mask = pd.Series(True, index=household_ids, dtype=bool) - for constraint in household_constraints: - household_mask &= _evaluate_constraint_on_households( - constraint=constraint, - tables=tables, - bindings=bindings, - household_ids=household_ids, - household_id_column=household_id_column, - ) - - return ( - household_totals.reindex(household_ids, fill_value=0.0).astype(float) - * household_mask.astype(float) - ).to_numpy(dtype=float) - - -def _entity_primary_id_column(entity: EntityType) -> str: - return { - EntityType.HOUSEHOLD: "household_id", - EntityType.PERSON: "person_id", - EntityType.TAX_UNIT: "tax_unit_id", - EntityType.SPM_UNIT: "spm_unit_id", - EntityType.FAMILY: "family_id", - }[entity] - - -def _can_align_constraint_to_target_rows( - *, - target_rows: pd.DataFrame, - constraint_binding: 
PolicyEngineUSVariableBinding, -) -> bool: - return _entity_primary_id_column(constraint_binding.entity) in target_rows.columns - - -def _resolve_binding( - variable: str, - bindings: dict[str, PolicyEngineUSVariableBinding], - tables: PolicyEngineUSEntityTableBundle, -) -> PolicyEngineUSVariableBinding: - if variable in bindings: - return bindings[variable] - if variable in tables.households.columns: - return PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, column=variable - ) - raise KeyError(f"No PolicyEngine binding configured for variable '{variable}'") - - -def _resolve_target_binding( - target: PolicyEngineUSDBTarget | TargetSpec, - bindings: dict[str, PolicyEngineUSVariableBinding], - tables: PolicyEngineUSEntityTableBundle, -) -> PolicyEngineUSVariableBinding: - if isinstance(target, TargetSpec): - if target.aggregation is TargetAggregation.COUNT: - return PolicyEngineUSVariableBinding(entity=target.entity) - if target.measure is None: - raise ValueError(f"Target '{target.name}' is missing a measure") - binding = _resolve_binding(target.measure, bindings, tables) - return PolicyEngineUSVariableBinding( - entity=binding.entity, - column=binding.column, - household_id_column=binding.household_id_column, - ) - - return _resolve_binding(target.variable, bindings, tables) - - -def _require_binding_column( - binding: PolicyEngineUSVariableBinding, - *, - feature: str, -) -> str: - if binding.column is None: - raise ValueError( - f"Constraint variable '{feature}' does not map to a source column" - ) - return binding.column - - -def _household_ids_for_entity_table( - table: pd.DataFrame, - binding: PolicyEngineUSVariableBinding, - household_id_column: str, -) -> pd.Series: - if binding.entity is EntityType.HOUSEHOLD: - if household_id_column not in table.columns: - raise ValueError( - f"Household table is missing household id column '{household_id_column}'" - ) - return table[household_id_column] - - if binding.household_id_column not in 
table.columns: - raise ValueError( - f"Entity table for '{binding.entity.value}' is missing " - f"household link column '{binding.household_id_column}'" - ) - return table[binding.household_id_column] - - -def _apply_constraint_filter( - values: pd.Series, - constraint: PolicyEngineUSConstraint | TargetFilter, -) -> pd.Series: - return _apply_constraint( - values, - _constraint_operator(constraint), - _constraint_value(constraint), - ) - - -def _evaluate_constraint_mask( - *, - target_rows: pd.DataFrame, - target_binding: PolicyEngineUSVariableBinding, - target_household_ids: pd.Series, - constraint: PolicyEngineUSConstraint | TargetFilter, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - household_id_column: str, -) -> pd.Series: - constraint_binding = _resolve_binding( - _constraint_feature(constraint), bindings, tables - ) - constraint_column = _require_binding_column( - constraint_binding, - feature=_constraint_feature(constraint), - ) - - if constraint_binding.entity is target_binding.entity: - return _apply_constraint_filter(target_rows[constraint_column], constraint) - - if constraint_binding.entity is EntityType.HOUSEHOLD: - household_values = tables.households.set_index(household_id_column)[ - constraint_column - ] - aligned = target_household_ids.map(household_values) - return _apply_constraint_filter(aligned, constraint) - - aligned = _align_related_entity_constraint_values( - target_rows=target_rows, - constraint_binding=constraint_binding, - feature=_constraint_feature(constraint), - tables=tables, - ) - if aligned is not None: - return _apply_constraint_filter(aligned, constraint) - - aligned_mask = _align_related_entity_constraint_mask( - target_rows=target_rows, - target_binding=target_binding, - constraint_binding=constraint_binding, - constraint=constraint, - tables=tables, - ) - if aligned_mask is not None: - return aligned_mask - - if target_binding.entity is EntityType.HOUSEHOLD: - aligned = 
_evaluate_constraint_on_households( - constraint=constraint, - tables=tables, - bindings=bindings, - household_ids=pd.Index(target_household_ids), - household_id_column=household_id_column, - ).reindex(pd.Index(target_household_ids), fill_value=False) - return pd.Series(aligned.to_numpy(), index=target_rows.index, dtype=bool) - - raise ValueError( - "Cross-entity constraints are only supported against household targets " - "or household metadata" - ) - - -def _align_related_entity_constraint_values( - *, - target_rows: pd.DataFrame, - constraint_binding: PolicyEngineUSVariableBinding, - feature: str, - tables: PolicyEngineUSEntityTableBundle, -) -> pd.Series | None: - related_id_column = _entity_primary_id_column(constraint_binding.entity) - if related_id_column not in target_rows.columns: - return None - - related_table = tables.table_for(constraint_binding.entity) - if related_id_column not in related_table.columns: - raise ValueError( - f"Entity table for '{constraint_binding.entity.value}' is missing " - f"primary id column '{related_id_column}'" - ) - - constraint_column = _require_binding_column( - constraint_binding, - feature=feature, - ) - related_values = related_table.set_index(related_id_column)[constraint_column] - return target_rows[related_id_column].map(related_values) - - -def _align_related_entity_constraint_mask( - *, - target_rows: pd.DataFrame, - target_binding: PolicyEngineUSVariableBinding, - constraint_binding: PolicyEngineUSVariableBinding, - constraint: PolicyEngineUSConstraint | TargetFilter, - tables: PolicyEngineUSEntityTableBundle, -) -> pd.Series | None: - if constraint_binding.entity is not EntityType.PERSON: - return None - if target_binding.entity not in { - EntityType.TAX_UNIT, - EntityType.SPM_UNIT, - EntityType.FAMILY, - }: - return None - persons = tables.persons - if persons is None: - return None - related_id_column = _entity_primary_id_column(target_binding.entity) - if ( - related_id_column not in target_rows.columns - or 
related_id_column not in persons.columns - ): - return None - constraint_column = _require_binding_column( - constraint_binding, - feature=_constraint_feature(constraint), - ) - person_matches = _apply_constraint_filter(persons[constraint_column], constraint) - group_matches = person_matches.groupby(persons[related_id_column]).any() - aligned = target_rows[related_id_column].map(group_matches).fillna(False) - return pd.Series(aligned.to_numpy(dtype=bool), index=target_rows.index, dtype=bool) - - -def _evaluate_constraint_on_households( - *, - constraint: PolicyEngineUSConstraint | TargetFilter, - tables: PolicyEngineUSEntityTableBundle, - bindings: dict[str, PolicyEngineUSVariableBinding], - household_ids: pd.Index, - household_id_column: str, -) -> pd.Series: - binding = _resolve_binding(_constraint_feature(constraint), bindings, tables) - binding_column = _require_binding_column( - binding, - feature=_constraint_feature(constraint), - ) - if binding.entity is EntityType.HOUSEHOLD: - values = tables.households.set_index(household_id_column)[binding_column] - return _apply_constraint_filter(values, constraint).reindex( - household_ids, - fill_value=False, - ) - - table = tables.table_for(binding.entity) - related_household_ids = _household_ids_for_entity_table( - table, binding, household_id_column - ) - row_matches = _apply_constraint_filter(table[binding_column], constraint) - return ( - row_matches.groupby(related_household_ids) - .any() - .reindex( - household_ids, - fill_value=False, - ) - ) - - -def _apply_constraint(series: pd.Series, operation: str, raw_value: Any) -> pd.Series: - operation = "==" if operation == "=" else operation - if operation not in {"==", "!=", ">", ">=", "<", "<=", "in", "not_in"}: - raise ValueError(f"Unsupported PolicyEngine constraint operation: {operation}") - - if operation in {"in", "not_in"}: - if not isinstance(raw_value, (list, tuple, set, frozenset)): - raw_values = [raw_value] - else: - raw_values = list(raw_value) - value 
= [_coerce_constraint_value(series, item) for item in raw_values] - else: - value = _coerce_constraint_value(series, raw_value) - if operation == "==": - return series == value - if operation == "!=": - return series != value - if operation == "in": - return series.isin(value) - if operation == "not_in": - return ~series.isin(value) - if operation == ">": - return series > value - if operation == ">=": - return series >= value - if operation == "<": - return series < value - return series <= value - - -def _coerce_constraint_value(series: pd.Series, raw_value: Any) -> Any: - if pd.api.types.is_bool_dtype(series): - return str(raw_value).strip().lower() in {"1", "true", "t", "yes"} - if pd.api.types.is_numeric_dtype(series): - return pd.to_numeric(pd.Series([raw_value]), errors="raise").iloc[0] - return str(raw_value) - - -def _policyengine_target_name(target: PolicyEngineUSDBTarget | TargetSpec) -> str: - if isinstance(target, TargetSpec): - return target.name - return f"target_{target.target_id}_{target.variable}" - - -def _target_value(target: PolicyEngineUSDBTarget | TargetSpec) -> float: - return float(target.value) - - -def _target_aggregation( - target: PolicyEngineUSDBTarget | TargetSpec, -) -> TargetAggregation: - if isinstance(target, TargetSpec): - return target.aggregation - if target.variable in DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS: - return TargetAggregation.COUNT - return TargetAggregation.SUM - - -def _target_measure(target: PolicyEngineUSDBTarget | TargetSpec) -> str | None: - if isinstance(target, TargetSpec): - return target.measure - if target.variable in DEFAULT_POLICYENGINE_US_VARIABLE_BINDINGS: - return None - return target.variable - - -def _target_constraints( - target: PolicyEngineUSDBTarget | TargetSpec, -) -> tuple[PolicyEngineUSConstraint | TargetFilter, ...]: - if isinstance(target, TargetSpec): - return target.filters - return target.constraints - - -def _constraint_feature(constraint: PolicyEngineUSConstraint | TargetFilter) -> 
str: - return ( - constraint.feature - if isinstance(constraint, TargetFilter) - else constraint.variable - ) - - -def _constraint_operator(constraint: PolicyEngineUSConstraint | TargetFilter) -> str: - if isinstance(constraint, TargetFilter): - return str(constraint.operator.value) - return str(constraint.operation) - - -def _constraint_value(constraint: PolicyEngineUSConstraint | TargetFilter) -> Any: - return constraint.value - - -def build_policyengine_us_export_variable_maps( - tables: PolicyEngineUSEntityTableBundle, - *, - tax_benefit_system: Any, - direct_override_variables: tuple[str, ...] = (), -) -> dict[str, dict[str, str]]: - """Infer PE export variable maps from entity-table columns.""" - variable_metadata = getattr(tax_benefit_system, "variables", {}) - allowed_variables_by_entity = _group_policyengine_us_export_variables_by_entity( - variable_metadata, - direct_override_variables=direct_override_variables, - ) - household_table = _with_policyengine_household_export_derivatives(tables.households) - person_table = _with_policyengine_person_export_derivatives(tables.persons) - tax_unit_table = _with_policyengine_tax_unit_export_derivatives(tables.tax_units) - table_specs = ( - ( - "household", - household_table, - {"household_id", "household_weight", "weight"}, - ), - ("person", person_table, {"person_id", "household_id"}), - ("tax_unit", tax_unit_table, {"tax_unit_id", "household_id"}), - ("spm_unit", tables.spm_units, {"spm_unit_id", "household_id"}), - ("family", tables.families, {"family_id", "household_id"}), - ) - export_maps: dict[str, dict[str, str]] = {} - for entity_key, table, structural_columns in table_specs: - export_maps[entity_key] = _infer_policyengine_us_table_variable_map( - table=table, - allowed_variables=allowed_variables_by_entity.get(entity_key, set()), - excluded_columns=structural_columns, - ) - return export_maps - - -def build_policyengine_us_export_column_names( - tables: PolicyEngineUSEntityTableBundle, - *, - 
tax_benefit_system: Any | None = None, - simulation_cls: Any | None = None, - direct_override_variables: tuple[str, ...] = (), -) -> set[str]: - """Return the final PE-US H5 column names without materializing arrays. - - This is the schema-only counterpart to - :func:`build_policyengine_us_time_period_arrays` + - :func:`write_policyengine_us_time_period_dataset`. It lets slow builds - verify the eCPS export contract from the saved post-imputation entity - tables, before microsimulation/calibration changes weights. - """ - if tables.persons is None: - raise ValueError("PolicyEngine US export requires a person table") - if tax_benefit_system is None: - tax_benefit_system = _resolve_policyengine_us_tax_benefit_system( - simulation_cls=simulation_cls - ) - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=tax_benefit_system, - direct_override_variables=direct_override_variables, - ) - exported_inputs = { - target - for variable_map in export_maps.values() - for target in variable_map.values() - } - excluded_variables = resolve_policyengine_excluded_export_variables( - tax_benefit_system, - sorted(exported_inputs), - direct_override_variables=direct_override_variables, - ) - return ( - exported_inputs - excluded_variables - ) | POLICYENGINE_US_STRUCTURAL_EXPORT_COLUMNS - - -def build_policyengine_us_time_period_arrays( - tables: PolicyEngineUSEntityTableBundle, - *, - period: int, - household_variable_map: dict[str, str] | None = None, - person_variable_map: dict[str, str] | None = None, - tax_unit_variable_map: dict[str, str] | None = None, - spm_unit_variable_map: dict[str, str] | None = None, - family_variable_map: dict[str, str] | None = None, - marital_unit_variable_map: dict[str, str] | None = None, - household_id_column: str = "household_id", - person_id_column: str = "person_id", - household_weight_column: str = "household_weight", -) -> dict[str, dict[str, np.ndarray]]: - """Build a PE-US TIME_PERIOD_ARRAYS payload from 
multientity tables.""" - if tables.persons is None: - raise ValueError("PolicyEngine US export requires a person table") - - period_key = str(period) - household_table = _with_policyengine_household_export_derivatives(tables.households) - households = _prepare_household_export_table( - household_table, - household_id_column=household_id_column, - household_weight_column=household_weight_column, - ) - person_table = _with_policyengine_person_export_derivatives( - tables.persons, - period=int(period), - ) - tax_unit_table = _with_policyengine_tax_unit_export_derivatives(tables.tax_units) - persons = _prepare_person_export_table( - person_table, - person_id_column=person_id_column, - household_id_column=household_id_column, - household_ids=pd.Index(households[household_id_column]), - ) - - arrays: dict[str, dict[str, np.ndarray]] = { - "household_id": { - period_key: _normalize_id_value(households[household_id_column]), - }, - "person_id": { - period_key: _normalize_id_value(persons[person_id_column]), - }, - "person_household_id": { - period_key: _normalize_id_value(persons[household_id_column]), - }, - "household_weight": { - period_key: _normalize_weight_value(households[household_weight_column]), - }, - } - - arrays.update( - _project_table_to_time_period_arrays( - households, - period_key=period_key, - column_map=household_variable_map, - excluded_columns={household_id_column, household_weight_column}, - ) - ) - arrays.update( - _project_table_to_time_period_arrays( - persons, - period_key=period_key, - column_map=person_variable_map, - excluded_columns={person_id_column, household_id_column}, - ) - ) - - group_specs = ( - ( - "tax_unit", - "tax_unit_id", - tax_unit_table, - tax_unit_variable_map, - "household", - ), - ( - "spm_unit", - "spm_unit_id", - tables.spm_units, - spm_unit_variable_map, - "household", - ), - ("family", "family_id", tables.families, family_variable_map, "household"), - ( - "marital_unit", - "marital_unit_id", - tables.marital_units, - 
marital_unit_variable_map, - "tax_unit", - ), - ) - for group_name, id_column, provided_table, variable_map, fallback in group_specs: - person_group_ids = _resolve_person_group_ids( - group_name=group_name, - id_column=id_column, - persons=persons, - provided_table=provided_table, - person_id_column=person_id_column, - household_id_column=household_id_column, - fallback=fallback, - ) - group_table = _resolve_group_export_table( - group_name=group_name, - id_column=id_column, - provided_table=provided_table, - person_group_ids=person_group_ids, - person_household_ids=persons[household_id_column], - household_id_column=household_id_column, - ) - arrays[f"{group_name}_id"] = { - period_key: _normalize_id_value(group_table[id_column]), - } - arrays[f"person_{group_name}_id"] = { - period_key: _normalize_id_value(person_group_ids), - } - arrays.update( - _project_table_to_time_period_arrays( - group_table, - period_key=period_key, - column_map=variable_map, - excluded_columns={id_column, household_id_column}, - ) - ) - - return arrays - - -def _resolve_policyengine_us_tax_benefit_system(simulation_cls: Any | None) -> Any: - if simulation_cls is None: - try: - import policyengine_us - except ImportError as exc: - raise ImportError( - "policyengine_us is required to materialize PolicyEngine US variables" - ) from exc - return getattr(policyengine_us.system, "system", policyengine_us.system) - - tax_benefit_system = getattr(simulation_cls, "tax_benefit_system", None) - if tax_benefit_system is None: - tax_benefit_system = getattr(simulation_cls, "system", None) - if tax_benefit_system is not None: - tax_benefit_system = getattr(tax_benefit_system, "system", tax_benefit_system) - if tax_benefit_system is None: - raise ValueError( - "simulation_cls must expose a 'tax_benefit_system' attribute to materialize variables" - ) - return tax_benefit_system - - -def _resolve_policyengine_variable_entity( - variable: str, - *, - tax_benefit_system: Any, -) -> EntityType: - variables 
= getattr(tax_benefit_system, "variables", {}) - variable_metadata = variables.get(variable) - if variable_metadata is None: - raise KeyError( - f"PolicyEngine variable '{variable}' not found in tax-benefit system" - ) - entity_key = getattr(getattr(variable_metadata, "entity", None), "key", None) - if entity_key not in POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE: - raise ValueError( - f"Unsupported PolicyEngine entity '{entity_key}' for variable '{variable}'" - ) - return POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE[entity_key] - - -def _infer_policyengine_us_table_variable_map( - *, - table: pd.DataFrame | None, - allowed_variables: set[str], - excluded_columns: set[str], -) -> dict[str, str]: - if table is None: - return {} - variable_map = { - column: column - for column in table.columns - if column in allowed_variables - and column not in excluded_columns - and not column.endswith("_id") - } - exported_targets = set(variable_map.values()) - available_columns = set(table.columns) - for source_column, target_variable in POLICYENGINE_US_EXPORT_COLUMN_ALIASES.items(): - if target_variable not in allowed_variables: - continue - if source_column not in available_columns or source_column in excluded_columns: - continue - if source_column.endswith("_id") or target_variable in exported_targets: - continue - variable_map[source_column] = target_variable - exported_targets.add(target_variable) - for target_variable in sorted( - set(POLICYENGINE_US_EXPORT_DEFAULTS) & allowed_variables - ): - if target_variable in exported_targets: - continue - variable_map[target_variable] = target_variable - exported_targets.add(target_variable) - return variable_map - - -def _group_policyengine_us_export_variables_by_entity( - variable_metadata: dict[str, Any], - *, - direct_override_variables: tuple[str, ...] 
= (), -) -> dict[str, set[str]]: - allowed_variable_names = ( - SAFE_POLICYENGINE_US_EXPORT_VARIABLES - | set(POLICYENGINE_US_EXPORT_DEFAULTS) - | set(direct_override_variables) - ) - allowed_variables_by_entity: dict[str, set[str]] = { - entity_key: set() for entity_key in POLICYENGINE_US_ENTITY_KEY_TO_ENTITY_TYPE - } - for variable_name, metadata in variable_metadata.items(): - if variable_name not in allowed_variable_names: - continue - if ( - variable_name not in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES - and _policyengine_us_variable_is_computed_export(metadata) - ): - continue - entity_key = getattr(getattr(metadata, "entity", None), "key", None) - if entity_key not in allowed_variables_by_entity: - continue - allowed_variables_by_entity[entity_key].add(variable_name) - for ( - variable_name, - entity_key, - ) in POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES.items(): - if entity_key in allowed_variables_by_entity: - allowed_variables_by_entity[entity_key].add(variable_name) - return allowed_variables_by_entity - - -def _attach_policyengine_variables_to_tables( - tables: PolicyEngineUSEntityTableBundle, - *, - variables: tuple[str, ...], - period: int, - adapter: PolicyEngineUSMicrosimulationAdapter, -) -> tuple[PolicyEngineUSEntityTableBundle, dict[str, PolicyEngineUSVariableBinding]]: - households = tables.households.copy() - persons = tables.persons.copy() if tables.persons is not None else None - tax_units = tables.tax_units.copy() if tables.tax_units is not None else None - spm_units = tables.spm_units.copy() if tables.spm_units is not None else None - families = tables.families.copy() if tables.families is not None else None - marital_units = ( - tables.marital_units.copy() if tables.marital_units is not None else None - ) - bindings: dict[str, PolicyEngineUSVariableBinding] = {} - - for variable in variables: - entity = adapter.variable_entity(variable) - entity_key = ENTITY_TYPE_TO_POLICYENGINE_US_ENTITY_KEY[entity] - table = 
_table_for_policyengine_entity( - entity=entity, - households=households, - persons=persons, - tax_units=tax_units, - spm_units=spm_units, - families=families, - ) - values = _coerce_policyengine_calculation_values( - adapter.calculate(variable, period=period) - ) - if len(values) != len(table): - values = _coerce_policyengine_calculation_values( - adapter.calculate(variable, period=period, map_to=entity_key) - ) - if len(values) != len(table): - raise ValueError( - f"PolicyEngine variable '{variable}' returned {len(values)} values for " - f"{entity.value}, expected {len(table)}" - ) - table[variable] = values - bindings[variable] = PolicyEngineUSVariableBinding( - entity=entity, column=variable - ) - - return ( - PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=tax_units, - spm_units=spm_units, - families=families, - marital_units=marital_units, - ), - bindings, - ) - - -def _table_for_policyengine_entity( - *, - entity: EntityType, - households: pd.DataFrame, - persons: pd.DataFrame | None, - tax_units: pd.DataFrame | None, - spm_units: pd.DataFrame | None, - families: pd.DataFrame | None, -) -> pd.DataFrame: - if entity is EntityType.HOUSEHOLD: - return households - if entity is EntityType.PERSON and persons is not None: - return persons - if entity is EntityType.TAX_UNIT and tax_units is not None: - return tax_units - if entity is EntityType.SPM_UNIT and spm_units is not None: - return spm_units - if entity is EntityType.FAMILY and families is not None: - return families - raise ValueError(f"No table available to materialize '{entity.value}' variables") - - -def _coerce_policyengine_calculation_values(values: Any) -> np.ndarray: - if hasattr(values, "values"): - return np.asarray(values.values) - return np.asarray(values) - - -def project_frame_to_time_period_arrays( - frame: pd.DataFrame, - *, - period: int, - column_map: dict[str, str], -) -> dict[str, dict[str, np.ndarray]]: - """Project a richer Microplex frame into 
PE-style time-period arrays.""" - arrays: dict[str, dict[str, np.ndarray]] = {} - for source_column, target_variable in column_map.items(): - if source_column not in frame.columns: - raise ValueError(f"Projection source column not found: {source_column}") - arrays[target_variable] = { - str(period): _normalize_h5_value(frame[source_column]) - } - return arrays - - -def write_policyengine_us_time_period_dataset( - data: dict[str, dict[str, np.ndarray]], - path: str | Path, - *, - excluded_variables: set[str] | None = None, - tax_benefit_system: Any | None = None, -) -> Path: - """Write PolicyEngine-readable time-period arrays to HDF5.""" - output_path = Path(path) - output_path.parent.mkdir(parents=True, exist_ok=True) - excluded = excluded_variables or set() - written_variables = [variable for variable in data if variable not in excluded] - if tax_benefit_system is not None: - computed_exports = detect_policyengine_computed_export_variables( - tax_benefit_system, - written_variables, - ) - if computed_exports: - formatted = ", ".join(sorted(computed_exports)[:10]) - suffix = "" if len(computed_exports) <= 10 else ", ..." - raise ValueError( - "PolicyEngine US export contains computed variables: " - f"{formatted}{suffix}. Drop construction-only intermediates " - "or export the underlying leaf input instead." 
- ) - - with h5py.File(output_path, "w") as handle: - for variable, periods in data.items(): - if variable in excluded: - continue - group = handle.create_group(variable) - for period, values in periods.items(): - group.create_dataset(str(period), data=_normalize_h5_value(values)) - - return output_path - - -def _normalize_h5_value(values: Any) -> np.ndarray: - """Normalize values so h5py can persist them predictably.""" - array = np.asarray(values) - if array.dtype.kind in {"U", "O"}: - return array.astype("S") - return array - - -def _with_policyengine_household_export_derivatives( - households: pd.DataFrame | None, -) -> pd.DataFrame | None: - """Attach eCPS persisted household fields derivable from table columns.""" - if households is None or "in_nyc" in households.columns: - return households - if "county_fips" not in households.columns: - return households - - household_table = households.copy() - county_numeric = pd.to_numeric(household_table["county_fips"], errors="coerce") - if "state_fips" in household_table.columns: - state_numeric = pd.to_numeric(household_table["state_fips"], errors="coerce") - else: - state_numeric = pd.Series(np.nan, index=household_table.index) - full_county_fips = county_numeric.copy() - county_fragment = ( - county_numeric.notna() - & county_numeric.gt(0) - & county_numeric.lt(1000) - & state_numeric.notna() - ) - full_county_fips.loc[county_fragment] = state_numeric.loc[ - county_fragment - ].round().astype(int) * 1000 + county_numeric.loc[county_fragment].round().astype( - int - ) - household_table["in_nyc"] = ( - full_county_fips.round() - .astype("Int64") - .isin(NYC_FULL_COUNTY_FIPS) - .fillna(False) - ) - return household_table - - -def _prepare_household_export_table( - households: pd.DataFrame, - *, - household_id_column: str, - household_weight_column: str, -) -> pd.DataFrame: - household_table = households.copy() - if household_id_column not in household_table.columns: - raise ValueError( - f"Household table must contain 
'{household_id_column}' for export" - ) - if household_weight_column not in household_table.columns: - if "weight" not in household_table.columns: - raise ValueError( - f"Household table must contain '{household_weight_column}' or 'weight'" - ) - household_table[household_weight_column] = household_table["weight"] - household_table[household_id_column] = _normalize_id_value( - household_table[household_id_column] - ) - if pd.Index(household_table[household_id_column]).duplicated().any(): - raise ValueError("Household export table must have unique household ids") - household_table[household_weight_column] = _normalize_weight_value( - household_table[household_weight_column] - ) - return household_table - - -def _prepare_person_export_table( - persons: pd.DataFrame, - *, - person_id_column: str, - household_id_column: str, - household_ids: pd.Index, -) -> pd.DataFrame: - person_table = persons.copy() - missing_columns = { - column - for column in (person_id_column, household_id_column) - if column not in person_table.columns - } - if missing_columns: - missing = ", ".join(sorted(missing_columns)) - raise ValueError(f"Person table is missing required export columns: {missing}") - person_table[person_id_column] = _normalize_id_value(person_table[person_id_column]) - person_table[household_id_column] = _normalize_id_value( - person_table[household_id_column] - ) - if pd.Index(person_table[person_id_column]).duplicated().any(): - raise ValueError("Person export table must have unique person ids") - if not pd.Index(person_table[household_id_column]).isin(household_ids).all(): - raise ValueError("Every exported person must belong to an exported household") - return person_table - - -def _with_policyengine_person_export_derivatives( - persons: pd.DataFrame | None, - *, - period: int | None = None, -) -> pd.DataFrame | None: - if persons is None: - return persons - - person_table = persons.copy() - if ( - "is_household_head" not in person_table.columns - and 
"relationship_to_head" in person_table.columns - ): - relationship = pd.to_numeric( - person_table["relationship_to_head"], - errors="coerce", - ) - person_table["is_household_head"] = relationship.eq(0).fillna(False) - - if ( - "hours_worked_last_week" not in person_table.columns - and "hours_worked" in person_table.columns - ): - person_table["hours_worked_last_week"] = pd.to_numeric( - person_table["hours_worked"], - errors="coerce", - ).fillna(0.0) - if ( - "weekly_hours_worked_before_lsr" not in person_table.columns - and "hours_worked_last_week" in person_table.columns - ): - person_table["weekly_hours_worked_before_lsr"] = pd.to_numeric( - person_table["hours_worked_last_week"], - errors="coerce", - ).fillna(0.0) - elif ( - "weekly_hours_worked_before_lsr" not in person_table.columns - and "hours_worked" in person_table.columns - ): - person_table["weekly_hours_worked_before_lsr"] = pd.to_numeric( - person_table["hours_worked"], - errors="coerce", - ).fillna(0.0) - if "has_tin" not in person_table.columns: - person_table["has_tin"] = _derive_has_tin_for_export(person_table) - if "has_itin" not in person_table.columns: - person_table["has_itin"] = person_table["has_tin"].astype(bool) - if "meets_ssi_disability_criteria" not in person_table.columns: - person_table["meets_ssi_disability_criteria"] = ( - _derive_meets_ssi_disability_criteria_for_export(person_table) - ) - if "fsla_overtime_premium" not in person_table.columns: - fsla_premium = _derive_flsa_overtime_premium_for_export( - person_table, - period=period, - ) - if fsla_premium is not None: - person_table["fsla_overtime_premium"] = fsla_premium - return person_table - - -def _with_policyengine_tax_unit_export_derivatives( - tax_units: pd.DataFrame | None, -) -> pd.DataFrame | None: - if tax_units is None: - return tax_units - - if not MARKETPLACE_PLAN_BENCHMARK_RATIO_SOURCE_COLUMNS.issubset(tax_units.columns): - return tax_units - - tax_unit_table = tax_units.copy() - 
tax_unit_table[MARKETPLACE_PLAN_BENCHMARK_RATIO_COLUMN] = ( - compute_marketplace_plan_benchmark_ratio( - reported_premium=pd.to_numeric( - tax_unit_table["health_insurance_premiums_without_medicare_part_b"], - errors="coerce", - ).fillna(0.0), - aca_ptc=pd.to_numeric(tax_unit_table["aca_ptc"], errors="coerce").fillna( - 0.0 - ), - slcsp=pd.to_numeric(tax_unit_table["slcsp"], errors="coerce").fillna(0.0), - takes_up_aca=_truthy_series( - tax_unit_table["takes_up_aca_if_eligible"], - index=tax_unit_table.index, - ), - ) - ) - return tax_unit_table - - -def _derive_has_tin_for_export(persons: pd.DataFrame) -> pd.Series: - """Mirror PE-US has_tin default while honoring MP's SSN-card type signal.""" - if "ssn_card_type" not in persons.columns: - return pd.Series(True, index=persons.index, dtype=bool) - ssn_card_type = persons["ssn_card_type"].astype("string").str.upper() - return ssn_card_type.ne("NONE").fillna(True) - - -def _derive_meets_ssi_disability_criteria_for_export( - persons: pd.DataFrame, -) -> pd.Series: - """Approximate eCPS's persisted SSI disability input from available signals.""" - signals: list[pd.Series] = [] - bool_columns = ( - "is_disabled", - "difficulty_seeing", - "difficulty_hearing", - "difficulty_walking_or_climbing_stairs", - "difficulty_dressing_or_bathing", - "difficulty_doing_errands", - "difficulty_remembering_or_making_decisions", - ) - for column in bool_columns: - if column in persons.columns: - signals.append(_truthy_series(persons[column], index=persons.index)) - amount_columns = ( - "ssi", - "ssi_reported", - "disability_benefits", - "social_security_disability", - ) - for column in amount_columns: - if column in persons.columns: - signals.append( - pd.to_numeric(persons[column], errors="coerce").fillna(0.0).gt(0) - ) - if not signals: - return pd.Series(False, index=persons.index, dtype=bool) - result = signals[0].copy() - for signal in signals[1:]: - result |= signal - return result.fillna(False).astype(bool) - - -def 
_truthy_series(values: pd.Series, *, index: pd.Index) -> pd.Series: - if values.dtype == bool: - return values.fillna(False) - if pd.api.types.is_numeric_dtype(values): - return pd.to_numeric(values, errors="coerce").fillna(0.0).gt(0) - return ( - values.astype("string") - .str.upper() - .isin({"TRUE", "T", "YES", "Y", "1"}) - .reindex(index, fill_value=False) - ) - - -def _derive_flsa_overtime_premium_for_export( - persons: pd.DataFrame, - *, - period: int | None, -) -> pd.Series | None: - """Derive the data-backed FLSA overtime proxy when ORG inputs are present.""" - required_columns = { - "employment_income", - "hours_worked_last_week", - "weeks_worked", - "is_paid_hourly", - "has_never_worked", - "is_military", - "is_executive_administrative_professional", - "is_farmer_fisher", - "is_computer_scientist", - } - if not required_columns.issubset(persons.columns): - return None - - ( - hce_salary_threshold, - salary_basis_threshold, - computer_salary_threshold, - hours_threshold, - rate_multiplier, - ) = _flsa_overtime_policy_for_export(period or 2024) - - employment_income = ( - pd.to_numeric(persons["employment_income"], errors="coerce") - .fillna(0.0) - .clip(lower=0) - ) - hours_worked_last_week = ( - pd.to_numeric(persons["hours_worked_last_week"], errors="coerce") - .fillna(0.0) - .clip(lower=0) - ) - weeks_worked = ( - pd.to_numeric(persons["weeks_worked"], errors="coerce") - .fillna(0.0) - .clip(lower=0) - ) - overtime_hours = (hours_worked_last_week - hours_threshold).clip(lower=0) - straight_time_equivalent_hours = ( - hours_worked_last_week.clip(upper=hours_threshold) - + overtime_hours * rate_multiplier - ) - premium_share = pd.Series(0.0, index=persons.index, dtype=float) - positive_hours = straight_time_equivalent_hours.gt(0) - premium_share.loc[positive_hours] = ( - (rate_multiplier - 1) * overtime_hours.loc[positive_hours] - ) / straight_time_equivalent_hours.loc[positive_hours] - - is_paid_hourly = _truthy_series(persons["is_paid_hourly"], 
index=persons.index) - has_never_worked = _truthy_series(persons["has_never_worked"], index=persons.index) - is_military = _truthy_series(persons["is_military"], index=persons.index) - is_eap = _truthy_series( - persons["is_executive_administrative_professional"], - index=persons.index, - ) - is_farmer_fisher = _truthy_series(persons["is_farmer_fisher"], index=persons.index) - is_computer_scientist = _truthy_series( - persons["is_computer_scientist"], - index=persons.index, - ) - - salary_threshold = pd.Series( - hce_salary_threshold, - index=persons.index, - dtype=float, - ) - salary_threshold.loc[is_computer_scientist] = min( - computer_salary_threshold, - hce_salary_threshold, - ) - salary_threshold.loc[is_eap | is_farmer_fisher] = min( - salary_basis_threshold, - hce_salary_threshold, - ) - always_exempt = has_never_worked | is_military - salary_threshold.loc[always_exempt] = 0 - is_exempt = always_exempt | ( - employment_income.ge(salary_threshold) & ~is_paid_hourly - ) - eligible = ~is_exempt & weeks_worked.gt(0) - premium = pd.Series(0.0, index=persons.index, dtype=float) - premium.loc[eligible] = ( - employment_income.loc[eligible] * premium_share.loc[eligible] - ) - return premium.clip(lower=0, upper=employment_income).astype(np.float32) - - -@lru_cache(maxsize=8) -def _flsa_overtime_policy_for_export( - period: int, -) -> tuple[float, float, float, float, float]: - """Return eCPS-compatible FLSA overtime thresholds for one year.""" - try: - import policyengine_us - from policyengine_us.model_api import WEEKS_IN_YEAR - - system = getattr(policyengine_us.system, "system", policyengine_us.system) - overtime = system.parameters( - f"{int(period)}-01-01" - ).gov.irs.income.exemption.overtime - hours_threshold = float(overtime.hours_threshold) - return ( - float(overtime.hce_salary_threshold), - float(overtime.salary_basis_threshold) * float(WEEKS_IN_YEAR), - float(overtime.computer_salary_threshold) - * hours_threshold - * float(WEEKS_IN_YEAR), - 
hours_threshold, - float(overtime.rate_multiplier), - ) - except Exception: - return (132_964.0, 35_568.0, 57_470.4, 40.0, 1.5) - - -def _resolve_person_group_ids( - *, - group_name: str, - id_column: str, - persons: pd.DataFrame, - provided_table: pd.DataFrame | None, - person_id_column: str, - household_id_column: str, - fallback: Literal["household", "tax_unit"], -) -> pd.Series: - if id_column in persons.columns: - return pd.Series( - _normalize_id_value(persons[id_column]), - index=persons.index, - name=id_column, - ) - - if provided_table is not None: - resolved = _extract_membership_ids_from_group_table( - group_name=group_name, - id_column=id_column, - provided_table=provided_table, - persons=persons, - person_id_column=person_id_column, - household_id_column=household_id_column, - ) - if resolved is not None: - return resolved - - if fallback == "tax_unit" and "tax_unit_id" in persons.columns: - return pd.Series( - _normalize_id_value(persons["tax_unit_id"]), - index=persons.index, - name=id_column, - ) - - return pd.Series( - _normalize_id_value(persons[household_id_column]), - index=persons.index, - name=id_column, - ) - - -def _extract_membership_ids_from_group_table( - *, - group_name: str, - id_column: str, - provided_table: pd.DataFrame, - persons: pd.DataFrame, - person_id_column: str, - household_id_column: str, -) -> pd.Series | None: - if id_column not in provided_table.columns: - return None - - mapping: dict[int, int] = {} - for member_column in ("member_ids", "filer_ids", "dependent_ids"): - if member_column not in provided_table.columns: - continue - for _, row in provided_table[[id_column, member_column]].iterrows(): - group_id = int(_normalize_id_value([row[id_column]])[0]) - members = row[member_column] - if not isinstance(members, (list, tuple, np.ndarray, pd.Series)): - continue - for member_id in members: - if pd.isna(member_id): - continue - mapping[int(member_id)] = group_id - if mapping: - membership = 
persons[person_id_column].map(mapping) - if membership.isna().any(): - missing = persons.loc[membership.isna(), person_id_column].tolist() - raise ValueError( - f"Could not derive '{group_name}' membership for persons: {missing}" - ) - return pd.Series( - _normalize_id_value(membership), - index=persons.index, - name=id_column, - ) - - if ( - household_id_column in provided_table.columns - and not provided_table[household_id_column].duplicated().any() - ): - household_map = ( - provided_table[[id_column, household_id_column]] - .assign( - **{ - id_column: _normalize_id_value(provided_table[id_column]), - household_id_column: _normalize_id_value( - provided_table[household_id_column] - ), - } - ) - .set_index(household_id_column)[id_column] - ) - membership = persons[household_id_column].map(household_map) - if membership.notna().all(): - return pd.Series( - _normalize_id_value(membership), - index=persons.index, - name=id_column, - ) - - return None - - -def _resolve_group_export_table( - *, - group_name: str, - id_column: str, - provided_table: pd.DataFrame | None, - person_group_ids: pd.Series, - person_household_ids: pd.Series, - household_id_column: str, -) -> pd.DataFrame: - if provided_table is None: - group_table = pd.DataFrame({id_column: pd.unique(person_group_ids)}) - else: - group_table = provided_table.copy() - if id_column not in group_table.columns: - group_table[id_column] = pd.unique(person_group_ids) - - group_table[id_column] = _normalize_id_value(group_table[id_column]) - if pd.Index(group_table[id_column]).duplicated().any(): - raise ValueError(f"{group_name} export table must have unique ids") - - household_map = _build_group_household_map( - group_name=group_name, - group_ids=person_group_ids, - household_ids=person_household_ids, - ) - if household_id_column in group_table.columns: - normalized_households = _normalize_id_value(group_table[household_id_column]) - group_table[household_id_column] = normalized_households - expected = 
group_table[id_column].map(household_map) - mismatch = expected.notna() & pd.Series( - normalized_households, index=group_table.index - ).ne(expected) - if mismatch.any(): - raise ValueError( - f"{group_name} export table household links are inconsistent with person memberships" - ) - else: - group_table[household_id_column] = group_table[id_column].map(household_map) - - if group_table[household_id_column].isna().any(): - missing = group_table.loc[ - group_table[household_id_column].isna(), id_column - ].tolist() - raise ValueError( - f"Could not derive household links for {group_name} ids: {missing}" - ) - return group_table - - -def _build_group_household_map( - *, - group_name: str, - group_ids: pd.Series, - household_ids: pd.Series, -) -> pd.Series: - mapping = pd.DataFrame( - { - "group_id": _normalize_id_value(group_ids), - "household_id": _normalize_id_value(household_ids), - } - ).drop_duplicates() - if mapping.groupby("group_id")["household_id"].nunique().gt(1).any(): - raise ValueError( - f"{group_name} members must all belong to the same household for PE export" - ) - return mapping.set_index("group_id")["household_id"] - - -def _project_table_to_time_period_arrays( - table: pd.DataFrame, - *, - period_key: str, - column_map: dict[str, str] | None, - excluded_columns: set[str], -) -> dict[str, dict[str, np.ndarray]]: - if not column_map: - return {} - - arrays: dict[str, dict[str, np.ndarray]] = {} - for source_column, target_variable in column_map.items(): - if source_column in excluded_columns: - continue - has_default = target_variable in POLICYENGINE_US_EXPORT_DEFAULTS - if source_column not in table.columns and not has_default: - raise ValueError(f"Projection source column not found: {source_column}") - values = ( - pd.Series(table[source_column]) - if source_column in table.columns - else pd.Series( - POLICYENGINE_US_EXPORT_DEFAULTS[target_variable], - index=table.index, - ) - ) - if has_default: - default_value = 
POLICYENGINE_US_EXPORT_DEFAULTS[target_variable] - values = values.where(values.notna(), other=default_value) - if isinstance(default_value, str): - string_values = values.astype("string") - values = string_values.where( - string_values.notna() & string_values.ne(""), - other=default_value, - ) - values = _normalize_policyengine_us_export_enum_values(target_variable, values) - arrays[target_variable] = { - period_key: _normalize_h5_value(values), - } - return arrays - - -def _normalize_policyengine_us_export_enum_values( - target_variable: str, - values: pd.Series, -) -> pd.Series: - enum_map = POLICYENGINE_US_NUMERIC_ENUM_EXPORT_MAPS.get(target_variable) - if enum_map is None: - return values - - numeric = pd.to_numeric(values, errors="coerce") - numeric_mask = numeric.notna() - if not numeric_mask.any(): - return values - - normalized = values.copy() - mapped = numeric.map(enum_map) - if mapped[numeric_mask].isna().any(): - bad_values = sorted( - { - str(value) - for value in values[numeric_mask & mapped.isna()].dropna().unique() - } - ) - raise ValueError( - f"Unsupported numeric {target_variable} value(s) for PE-US export: " - + ", ".join(bad_values) - ) - normalized.loc[numeric_mask] = mapped[numeric_mask] - return normalized - - -def _normalize_id_value(values: Any) -> np.ndarray: - return pd.to_numeric(pd.Series(values), errors="raise").astype(np.int64).to_numpy() - - -def _normalize_weight_value(values: Any) -> np.ndarray: - return ( - pd.to_numeric(pd.Series(values), errors="coerce") - .fillna(0.0) - .astype(np.float32) - .to_numpy() - ) diff --git a/src/microplex_us/source_manifests.py b/src/microplex_us/source_manifests.py deleted file mode 100644 index 12284f2b..00000000 --- a/src/microplex_us/source_manifests.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Typed source-manifest accessors for microplex-us.""" - -from __future__ import annotations - -from functools import cache -from pathlib import Path - -from microplex.core import SourceManifest, 
load_source_manifest - - -def _manifest_dir() -> Path: - return Path(__file__).resolve().parent / "manifests" - - -@cache -def load_us_source_manifest(name: str) -> SourceManifest: - """Load one US source manifest by name.""" - return load_source_manifest(_manifest_dir() / f"{name}.json") diff --git a/src/microplex_us/source_registry.py b/src/microplex_us/source_registry.py deleted file mode 100644 index 79ede109..00000000 --- a/src/microplex_us/source_registry.py +++ /dev/null @@ -1,131 +0,0 @@ -"""Declarative source-variable capability registry for US source providers.""" - -from __future__ import annotations - -from collections.abc import Iterable, Mapping, Sequence -from dataclasses import dataclass - -from microplex.core import SourceVariableCapability - -from microplex_us.variables import resolve_variable_semantic_capabilities - - -@dataclass(frozen=True) -class SourceVariablePolicy: - """Declarative overrides for how one source variable should be used.""" - - authoritative: bool | None = None - usable_as_condition: bool | None = None - notes: str | None = None - - def apply(self, base: SourceVariableCapability | None = None) -> SourceVariableCapability: - """Resolve this policy against an optional base capability.""" - base = base or SourceVariableCapability() - return SourceVariableCapability( - authoritative=( - base.authoritative - if self.authoritative is None - else self.authoritative - ), - usable_as_condition=( - base.usable_as_condition - if self.usable_as_condition is None - else self.usable_as_condition - ), - notes=self.notes if self.notes is not None else base.notes, - ) - - -@dataclass(frozen=True) -class SourceVariablePolicySpec: - """Variable-usage policy for one source family.""" - - source_prefixes: tuple[str, ...] 
- variable_policies: Mapping[str, SourceVariablePolicy] - - def matches(self, source_name: str) -> bool: - return any( - source_name == prefix or source_name.startswith(f"{prefix}_") - for prefix in self.source_prefixes - ) - - -PUF_SOURCE_VARIABLE_POLICY = SourceVariablePolicySpec( - source_prefixes=("irs_soi_puf",), - variable_policies={ - "state_fips": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="PUF does not carry usable state geography in the current microdata build.", - ), - "tenure": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="PUF tenure is scaffold filler rather than a native source attribute.", - ), - "income": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="PUF income is a derived convenience column, not an atomic donor target.", - ), - "employment_status": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="PUF employment status is derived from tax-line amounts, not observed directly.", - ), - "employment_income": SourceVariablePolicy( - authoritative=True, - usable_as_condition=False, - notes="PUF wage income is source-native but should not be used as a shared donor condition.", - ), - "filing_status_code": SourceVariablePolicy( - authoritative=True, - usable_as_condition=False, - notes="PUF filing status is source-native tax-unit structure and should survive rebuild donor integration.", - ), - }, -) - -SURVEY_DONOR_FILLER_POLICY = SourceVariablePolicySpec( - source_prefixes=("sipp", "scf"), - variable_policies={ - "state_fips": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="SIPP/SCF donor survey adapters do not carry real state geography in the rebuild path.", - ), - "tenure": SourceVariablePolicy( - authoritative=False, - usable_as_condition=False, - notes="SIPP/SCF donor survey adapters use filler tenure only to satisfy the household schema.", - ), - }, -) - 
-DEFAULT_SOURCE_VARIABLE_POLICIES: tuple[SourceVariablePolicySpec, ...] = ( - PUF_SOURCE_VARIABLE_POLICY, - SURVEY_DONOR_FILLER_POLICY, -) - - -def resolve_source_variable_capabilities( - source_name: str, - variable_names: Iterable[str], - *, - policy_specs: Sequence[SourceVariablePolicySpec] = DEFAULT_SOURCE_VARIABLE_POLICIES, -) -> dict[str, SourceVariableCapability]: - """Build per-variable capabilities for a source from declarative policy specs.""" - variables = tuple(dict.fromkeys(variable_names)) - resolved = resolve_variable_semantic_capabilities(variables) - matching_specs = [spec for spec in policy_specs if spec.matches(source_name)] - - for variable in variables: - capability = resolved.get(variable, SourceVariableCapability()) - for spec in matching_specs: - policy = spec.variable_policies.get(variable) - if policy is None: - continue - capability = policy.apply(capability) - if capability != SourceVariableCapability(): - resolved[variable] = capability - return resolved diff --git a/src/microplex_us/specs/__init__.py b/src/microplex_us/specs/__init__.py deleted file mode 100644 index 42b391c3..00000000 --- a/src/microplex_us/specs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Packaged Microplex-US declarative specs.""" diff --git a/src/microplex_us/specs/us-2024.yaml b/src/microplex_us/specs/us-2024.yaml index 22eea5a6..cd03b1a3 100644 --- a/src/microplex_us/specs/us-2024.yaml +++ b/src/microplex_us/specs/us-2024.yaml @@ -12,8 +12,8 @@ sources: spine: base: cps_asec - method: clone - clone: { seed: 20260529 } + method: support_spine + support: { seed: 20260529 } halves: - { name: cps_keep, keep: all } - { name: synthetic_puf, strip_to: [demographics] } @@ -519,14 +519,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. 
mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: encode_geoid source: geography encoding: S15 code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. bond_assets: entity: person role: sipp_asset_source_imputed @@ -773,14 +773,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: encode_geoid source: geography encoding: S4 code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. count_under_18: entity: person role: cps_passthrough_or_constructed @@ -876,14 +876,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: encode_geoid source: geography encoding: S5 code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. cps_race: entity: person role: cps_passthrough_or_constructed @@ -2598,14 +2598,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. 
mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: derive transform: geography_in_nyc depends_on: [county_fips] code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. investment_income_elected_form_4952: entity: person role: puf_imputed_overridden @@ -6718,14 +6718,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: encode_geoid source: geography encoding: S2 code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. stock_assets: entity: person role: sipp_asset_source_imputed @@ -7671,14 +7671,14 @@ variables: symbol: BlockGeography notes: Manifest flagged possible encoding/export divergence; exporter must preserve eCPS-compatible widths. mp_spec: - method: GeoCloner/exporter derived geography with eCPS-compatible encoding + method: geography assignment/exporter derived geography with eCPS-compatible encoding operation: kind: encode_geoid source: geography encoding: S11 code: - path: src/microplex_us/specs/us-2024.yaml - summary: Temporary declaration for future GeoCloner/exporter rules. + summary: Temporary declaration for future geography assignment/exporter rules. 
traditional_401k_contributions: entity: person role: cps_passthrough_or_constructed diff --git a/src/microplex_us/supabase_targets.py b/src/microplex_us/supabase_targets.py deleted file mode 100644 index 0a5c79bd..00000000 --- a/src/microplex_us/supabase_targets.py +++ /dev/null @@ -1,594 +0,0 @@ -"""US Supabase calibration target loader.""" - -from __future__ import annotations - -import os -from typing import Any - -import requests -from microplex.core import EntityType -from microplex.targets import ( - FilterOperator, - TargetAggregation, - TargetFilter, - TargetQuery, - TargetSet, - TargetSpec, - apply_target_query, -) - -from microplex_us.target_registry import ( - US_TARGET_AVAILABLE_KEY, - US_TARGET_CATEGORY_KEY, - US_TARGET_GROUP_KEY, - US_TARGET_IMPUTATION_KEY, - US_TARGET_LEVEL_KEY, - TargetCategory, - TargetLevel, -) - -SUPABASE_TARGET_ID_KEY = "supabase_target_id" -SUPABASE_VARIABLE_KEY = "supabase_variable" -SUPABASE_TARGET_TYPE_KEY = "supabase_target_type" -SUPABASE_JURISDICTION_KEY = "supabase_jurisdiction" -SUPABASE_STRATUM_NAME_KEY = "supabase_stratum_name" -SUPABASE_SOURCE_INSTITUTION_KEY = "supabase_source_institution" -SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY = "supabase_supported_by_column_map" - -_COUNT_ALL_VARIABLES = { - "family_count", - "household_count", - "person_count", - "spm_unit_count", - "tax_unit_count", -} - -_COUNT_ENTITY_MAP = { - "family_count": EntityType.FAMILY, - "household_count": EntityType.HOUSEHOLD, - "person_count": EntityType.PERSON, - "spm_unit_count": EntityType.SPM_UNIT, - "tax_unit_count": EntityType.TAX_UNIT, -} - -_INCOME_VARIABLES = { - "alimony_income", - "dividend_income", - "employment_income", - "farm_income", - "interest_income", - "long_term_capital_gains", - "partnership_s_corp_income", - "rental_income", - "self_employment_income", - "short_term_capital_gains", - "social_security", - "tax_exempt_pension_income", - "taxable_pension_income", - "unemployment_compensation", -} - -_BENEFIT_VARIABLES = { - 
"eitc_spending", - "snap_households", - "snap_spending", - "social_security_spending", - "ssi_spending", - "unemployment_spending", -} - -_HEALTH_VARIABLES = { - "aca_enrollment", - "health_insurance_premiums", - "medicaid_enrollment", - "other_medical_expenses", -} - -_TAX_UNIT_VARIABLES = { - "eitc_spending", -} - -_HOUSEHOLD_VARIABLES = { - "snap_households", - "snap_spending", -} - - -class SupabaseTargetLoader: - """Load US calibration targets from the microplex Supabase schema.""" - - # Mapping from Supabase variable names to CPS column names. - CPS_COLUMN_MAP = { - "employment_income": "employment_income", - "self_employment_income": "self_employment_income", - "dividend_income": "dividend_income", - "interest_income": "interest_income", - "rental_income": "rental_income", - "social_security": "social_security", - "unemployment_compensation": "unemployment_compensation", - "taxable_pension_income": "taxable_pension_income", - "tax_exempt_pension_income": "tax_exempt_pension_income", - "long_term_capital_gains": "long_term_capital_gains", - "short_term_capital_gains": "short_term_capital_gains", - "partnership_s_corp_income": "partnership_s_corp_income", - "farm_income": "farm_income", - "alimony_income": "alimony_income", - "snap_spending": "snap", - "ssi_spending": "ssi", - "eitc_spending": "eitc", - "social_security_spending": "social_security", - "unemployment_spending": "unemployment_compensation", - "medicaid_enrollment": "medicaid", - "aca_enrollment": "aca", - "snap_households": "snap", - "health_insurance_premiums": "health_insurance_premiums", - "other_medical_expenses": "medical_expenses", - } - - STATE_FIPS = { - "01": "al", - "02": "ak", - "04": "az", - "05": "ar", - "06": "ca", - "08": "co", - "09": "ct", - "10": "de", - "11": "dc", - "12": "fl", - "13": "ga", - "15": "hi", - "16": "id", - "17": "il", - "18": "in", - "19": "ia", - "20": "ks", - "21": "ky", - "22": "la", - "23": "me", - "24": "md", - "25": "ma", - "26": "mi", - "27": "mn", - 
"28": "ms", - "29": "mo", - "30": "mt", - "31": "ne", - "32": "nv", - "33": "nh", - "34": "nj", - "35": "nm", - "36": "ny", - "37": "nc", - "38": "nd", - "39": "oh", - "40": "ok", - "41": "or", - "42": "pa", - "44": "ri", - "45": "sc", - "46": "sd", - "47": "tn", - "48": "tx", - "49": "ut", - "50": "vt", - "51": "va", - "53": "wa", - "54": "wv", - "55": "wi", - "56": "wy", - } - - def __init__( - self, - url: str | None = None, - key: str | None = None, - schema: str = "microplex", - ) -> None: - """Initialize the loader. - - Args: - url: Supabase URL. Defaults to SUPABASE_URL env var. - key: Supabase key. Defaults to POLICYENGINE_SUPABASE_SERVICE_KEY env var. - schema: Schema to use. Defaults to 'microplex'. - """ - self.url = url or os.environ.get( - "SUPABASE_URL", - "https://nsupqhfchdtqclomlrgs.supabase.co", - ) - self.key = key or os.environ.get("POLICYENGINE_SUPABASE_SERVICE_KEY") - if not self.key: - raise ValueError( - "Supabase service key must be provided via the key argument or " - "POLICYENGINE_SUPABASE_SERVICE_KEY." 
- ) - self.base_url = f"{self.url}/rest/v1" - self.headers = { - "apikey": self.key, - "Authorization": f"Bearer {self.key}", - "Content-Type": "application/json", - "Accept-Profile": schema, - "Content-Profile": schema, - } - self._cache = {} - - def _get( - self, - endpoint: str, - params: dict[str, Any] | None = None, - paginate: bool = True, - ) -> list[dict[str, Any]]: - """Make a GET request to Supabase with optional pagination.""" - url = f"{self.base_url}/{endpoint}" - params = params or {} - - if not paginate: - response = requests.get( - url, - headers=self.headers, - params=params, - timeout=30, - ) - response.raise_for_status() - return response.json() - - all_results = [] - offset = 0 - limit = 1000 - - while True: - page_params = {**params, "limit": limit, "offset": offset} - response = requests.get( - url, - headers=self.headers, - params=page_params, - timeout=30, - ) - response.raise_for_status() - results = response.json() - - if not results: - break - - all_results.extend(results) - offset += limit - - if len(results) < limit: - break - - return all_results - - def load_all(self, period: int | None = None) -> list[dict[str, Any]]: - """Load all targets with source and stratum info.""" - params = { - "select": "id,variable,value,target_type,period,notes,source:sources(id,name,institution),stratum:strata(id,name,jurisdiction)", - } - if period: - params["period"] = f"eq.{period}" - - return self._get("targets", params) - - def load_by_institution( - self, - institution: str, - period: int | None = None, - ) -> list[dict[str, Any]]: - """Load targets from a specific source institution.""" - sources = self._get("sources", {"institution": f"eq.{institution}"}) - source_ids = [source["id"] for source in sources] - - if not source_ids: - return [] - - params = { - "select": "id,variable,value,target_type,period,notes,source:sources(id,name,institution),stratum:strata(id,name,jurisdiction)", - "source_id": f"in.({','.join(source_ids)})", - } - if period: 
- params["period"] = f"eq.{period}" - - return self._get("targets", params) - - def load_by_period(self, period: int) -> list[dict[str, Any]]: - """Load targets for a specific year.""" - return self.load_all(period=period) - - def get_cps_column_map(self) -> dict[str, str]: - """Get the mapping from Supabase variable names to CPS columns.""" - return self.CPS_COLUMN_MAP.copy() - - def _parse_jurisdiction(self, jurisdiction: str) -> str | None: - """Parse jurisdiction to get the state code when applicable.""" - if jurisdiction in {"us", "us-national"}: - return None - - if jurisdiction.startswith("us-") and len(jurisdiction) == 5: - suffix = jurisdiction[3:].lower() - if suffix in self.STATE_FIPS: - return self.STATE_FIPS[suffix] - if suffix in _state_abbr_to_fips(self.STATE_FIPS): - return suffix - - return None - - def build_calibration_constraints( - self, - period: int = 2024, - include_states: bool = False, - target_types: list[str] | None = None, - ) -> dict[str, float]: - """Build a CPS-column calibration constraint dict from Supabase targets.""" - targets = self.load_all(period=period) - constraints = {} - - for target in targets: - variable = target["variable"] - value = target["value"] - target_type = target.get("target_type", "amount") - stratum = target.get("stratum", {}) - jurisdiction = stratum.get("jurisdiction", "us") - - if target_types and target_type not in target_types: - continue - - cps_col = self.CPS_COLUMN_MAP.get(variable) - if not cps_col: - continue - - state = self._parse_jurisdiction(jurisdiction) - - if state and include_states: - constraints[f"{cps_col}_{state}"] = value - elif not state and cps_col not in constraints: - constraints[cps_col] = value - - return constraints - - def get_summary(self) -> dict[str, Any]: - """Get summary counts for available targets in Supabase.""" - targets = self.load_all() - - by_institution = {} - by_variable = {} - by_type = {} - - for target in targets: - institution = target.get("source", 
{}).get("institution", "Unknown") - by_institution[institution] = by_institution.get(institution, 0) + 1 - - variable = target["variable"] - by_variable[variable] = by_variable.get(variable, 0) + 1 - - target_type = target.get("target_type", "amount") - by_type[target_type] = by_type.get(target_type, 0) + 1 - - return { - "total": len(targets), - "by_institution": by_institution, - "by_variable": by_variable, - "by_type": by_type, - } - - -class SupabaseTargetProvider(SupabaseTargetLoader): - """Load Supabase targets as canonical core target specs.""" - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load a canonical target set through the core provider protocol.""" - query = query or TargetQuery() - provider_filters = query.provider_filters - period = _query_period(query.period) - institution = provider_filters.get("institution") - target_types = _as_string_set(provider_filters.get("target_types")) - include_unsupported = bool(provider_filters.get("include_unsupported", True)) - include_states = bool(provider_filters.get("include_states", True)) - - if institution: - rows = self.load_by_institution(str(institution), period=period) - else: - rows = self.load_all(period=period) - - specs: list[TargetSpec] = [] - for row in rows: - target_type = _target_type(row) - if target_types and target_type not in target_types: - continue - - spec = self.target_from_row(row) - if ( - not include_states - and spec.metadata.get(US_TARGET_LEVEL_KEY) == TargetLevel.STATE.value - ): - continue - if ( - not include_unsupported - and not spec.metadata[SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY] - ): - continue - specs.append(spec) - - return apply_target_query( - TargetSet(specs), - TargetQuery( - period=period if period is not None else query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def target_from_row(self, row: dict[str, Any]) -> TargetSpec: - """Translate one Supabase target row into the 
canonical target IR.""" - variable = str(row["variable"]) - jurisdiction = _target_jurisdiction(row) - state_fips, state_abbr = _jurisdiction_state(jurisdiction, self.STATE_FIPS) - target_type = _target_type(row) - aggregation = _aggregation_for_target_type(target_type) - measure = self.CPS_COLUMN_MAP.get(variable, variable) - supported = variable in self.CPS_COLUMN_MAP - source = row.get("source") if isinstance(row.get("source"), dict) else {} - source_name = source.get("name") or source.get("institution") - source_institution = source.get("institution") - stratum = row.get("stratum") if isinstance(row.get("stratum"), dict) else {} - category = _category_for_variable(variable) - level = TargetLevel.STATE if state_fips is not None else TargetLevel.NATIONAL - - filters: list[TargetFilter] = [] - if aggregation is TargetAggregation.COUNT and variable not in _COUNT_ALL_VARIABLES: - filters.append( - TargetFilter( - feature=measure, - operator=FilterOperator.GT, - value=0, - ) - ) - - if state_fips is not None: - filters.append( - TargetFilter( - feature="state_fips", - operator=FilterOperator.EQ, - value=state_fips, - ) - ) - - metadata: dict[str, Any] = { - SUPABASE_TARGET_ID_KEY: row.get("id"), - SUPABASE_VARIABLE_KEY: variable, - SUPABASE_TARGET_TYPE_KEY: target_type, - SUPABASE_JURISDICTION_KEY: jurisdiction, - SUPABASE_STRATUM_NAME_KEY: stratum.get("name"), - SUPABASE_SOURCE_INSTITUTION_KEY: source_institution, - SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY: supported, - US_TARGET_LEVEL_KEY: level.value, - US_TARGET_GROUP_KEY: _group_for_category(category), - US_TARGET_AVAILABLE_KEY: supported, - US_TARGET_IMPUTATION_KEY: not supported, - } - if category is not None: - metadata[US_TARGET_CATEGORY_KEY] = category.value - if state_fips is not None: - metadata["state_fips"] = state_fips - metadata["state_abbr"] = state_abbr - - return TargetSpec( - name=_target_name(variable, jurisdiction), - entity=_entity_for_variable(variable), - value=float(row["value"]), - 
period=int(row["period"]), - measure=None if aggregation is TargetAggregation.COUNT else measure, - aggregation=aggregation, - filters=tuple(filters), - source=source_name, - units=_units_for_target_type(target_type), - description=row.get("notes"), - metadata=metadata, - ) - - -def _target_type(row: dict[str, Any]) -> str: - return str(row.get("target_type") or "amount").lower() - - -def _aggregation_for_target_type(target_type: str) -> TargetAggregation: - if target_type == "count": - return TargetAggregation.COUNT - if target_type == "mean": - return TargetAggregation.MEAN - return TargetAggregation.SUM - - -def _target_jurisdiction(row: dict[str, Any]) -> str: - stratum = row.get("stratum") if isinstance(row.get("stratum"), dict) else {} - return str(stratum.get("jurisdiction") or "us") - - -def _target_name(variable: str, jurisdiction: str) -> str: - if jurisdiction in {"us", "us-national"}: - return variable - return f"{variable}_{jurisdiction.replace('-', '_')}" - - -def _query_period(period: int | str | None) -> int | None: - if isinstance(period, int): - return period - if isinstance(period, str) and period.isdigit(): - return int(period) - return None - - -def _as_string_set(value: Any) -> set[str]: - if value is None: - return set() - if isinstance(value, str): - return {value} - return {str(item) for item in value} - - -def _state_abbr_to_fips(state_fips: dict[str, str]) -> dict[str, str]: - return {abbr: fips for fips, abbr in state_fips.items()} - - -def _jurisdiction_state( - jurisdiction: str, - state_fips: dict[str, str], -) -> tuple[str | None, str | None]: - if not jurisdiction.startswith("us-") or len(jurisdiction) != 5: - return None, None - - suffix = jurisdiction[3:].lower() - if suffix in state_fips: - return suffix, state_fips[suffix] - - abbr_to_fips = _state_abbr_to_fips(state_fips) - if suffix in abbr_to_fips: - return abbr_to_fips[suffix], suffix - - return None, None - - -def _category_for_variable(variable: str) -> TargetCategory | 
None: - if variable in _INCOME_VARIABLES: - return TargetCategory.INCOME - if variable in _BENEFIT_VARIABLES: - return TargetCategory.BENEFITS - if variable in _HEALTH_VARIABLES: - return TargetCategory.HEALTH - if variable.endswith("_tax") or variable.endswith("_credit"): - return TargetCategory.TAX - if variable in _COUNT_ALL_VARIABLES: - return TargetCategory.DEMOGRAPHICS - return None - - -def _entity_for_variable(variable: str) -> EntityType: - if variable in _COUNT_ENTITY_MAP: - return _COUNT_ENTITY_MAP[variable] - if variable in _TAX_UNIT_VARIABLES: - return EntityType.TAX_UNIT - if variable in _HOUSEHOLD_VARIABLES: - return EntityType.HOUSEHOLD - return EntityType.PERSON - - -def _group_for_category(category: TargetCategory | None) -> str: - if category is None: - return "supabase_targets" - return f"supabase_{category.value}" - - -def _units_for_target_type(target_type: str) -> str | None: - return "USD" if target_type == "amount" else None - - -__all__ = [ - "SUPABASE_JURISDICTION_KEY", - "SUPABASE_SOURCE_INSTITUTION_KEY", - "SUPABASE_STRATUM_NAME_KEY", - "SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY", - "SUPABASE_TARGET_ID_KEY", - "SUPABASE_TARGET_TYPE_KEY", - "SUPABASE_VARIABLE_KEY", - "SupabaseTargetLoader", - "SupabaseTargetProvider", -] diff --git a/src/microplex_us/target_registry.py b/src/microplex_us/target_registry.py deleted file mode 100644 index 92919b09..00000000 --- a/src/microplex_us/target_registry.py +++ /dev/null @@ -1,863 +0,0 @@ -"""Registry of US calibration targets expressed in the core microplex target IR.""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from enum import Enum -from typing import Any - -import pandas as pd -from microplex.core import EntityType -from microplex.targets import ( - FilterOperator, - TargetAggregation, - TargetFilter, - TargetQuery, - TargetSet, - TargetSpec, - apply_target_query, -) - -US_TARGET_CATEGORY_KEY = "us_category" -US_TARGET_LEVEL_KEY = "us_level" 
-US_TARGET_GROUP_KEY = "us_group" -US_TARGET_AVAILABLE_KEY = "available_in_cps" -US_TARGET_IMPUTATION_KEY = "requires_imputation" -US_TARGET_NOTES_KEY = "notes" - - -class TargetCategory(str, Enum): - """High-level US calibration target categories.""" - - GEOGRAPHY = "geography" - INCOME = "income" - BENEFITS = "benefits" - DEMOGRAPHICS = "demographics" - HEALTH = "health" - TAX = "tax" - - -class TargetLevel(str, Enum): - """Geographic level of a US target slice.""" - - NATIONAL = "national" - STATE = "state" - CD = "cd" - COUNTY = "county" - TRACT = "tract" - - -def target_category(target: TargetSpec) -> TargetCategory | None: - """Return the US category metadata for a target.""" - value = target.metadata.get(US_TARGET_CATEGORY_KEY) - return TargetCategory(value) if value is not None else None - - -def target_level(target: TargetSpec) -> TargetLevel | None: - """Return the US level metadata for a target.""" - value = target.metadata.get(US_TARGET_LEVEL_KEY) - return TargetLevel(value) if value is not None else None - - -def target_group_name(target: TargetSpec) -> str | None: - """Return the US group metadata for a target.""" - value = target.metadata.get(US_TARGET_GROUP_KEY) - return str(value) if value is not None else None - - -def target_available_in_cps(target: TargetSpec) -> bool: - """Whether the target is directly available in CPS-like source data.""" - return bool(target.metadata.get(US_TARGET_AVAILABLE_KEY, False)) - - -def target_requires_imputation(target: TargetSpec) -> bool: - """Whether the target depends on imputation or external modeling.""" - return bool(target.metadata.get(US_TARGET_IMPUTATION_KEY, False)) - - -def target_notes(target: TargetSpec) -> str: - """Free-form US notes metadata.""" - value = target.metadata.get(US_TARGET_NOTES_KEY) - return str(value) if value is not None else "" - - -@dataclass -class TargetGroup: - """A named US target family backed by canonical core targets.""" - - name: str - category: TargetCategory - targets: 
list[TargetSpec] = field(default_factory=list) - - def add(self, target: TargetSpec) -> TargetGroup: - self.targets.append(target) - return self - - def __len__(self) -> int: - return len(self.targets) - - -class TargetRegistry: - """US target registry that emits canonical microplex targets.""" - - def __init__( - self, - groups: dict[str, TargetGroup] | None = None, - *, - build_defaults: bool = True, - ): - self.groups: dict[str, TargetGroup] = dict(groups or {}) - if build_defaults: - self._build_registry() - - def _build_registry(self) -> None: - self._add_geography_targets() - self._add_income_targets() - self._add_benefit_targets() - self._add_health_targets() - self._add_tax_targets() - self._add_demographic_targets() - - def _get_or_create_group( - self, - name: str, - category: TargetCategory, - ) -> TargetGroup: - group = self.groups.get(name) - if group is None: - group = TargetGroup(name=name, category=category) - self.groups[name] = group - return group - - def _add_target( - self, - *, - group_name: str, - category: TargetCategory, - level: TargetLevel, - name: str, - value: float, - entity: EntityType, - period: int = 2024, - aggregation: TargetAggregation | str = TargetAggregation.SUM, - measure: str | None = None, - filters: tuple[TargetFilter, ...] 
= (), - source: str = "", - units: str = "", - description: str = "", - available_in_cps: bool = True, - requires_imputation: bool = False, - notes: str = "", - ) -> TargetSpec: - target = TargetSpec( - name=name, - entity=entity, - value=value, - period=period, - measure=measure, - aggregation=aggregation, - filters=filters, - source=source or None, - units=units or None, - description=description or None, - metadata={ - US_TARGET_CATEGORY_KEY: category.value, - US_TARGET_LEVEL_KEY: level.value, - US_TARGET_GROUP_KEY: group_name, - US_TARGET_AVAILABLE_KEY: available_in_cps, - US_TARGET_IMPUTATION_KEY: requires_imputation, - US_TARGET_NOTES_KEY: notes, - }, - ) - self._get_or_create_group(group_name, category).add(target) - return target - - def _add_geography_targets(self) -> None: - census_2020 = { - "01": 5024279, - "02": 733391, - "04": 7151502, - "05": 3011524, - "06": 39538223, - "08": 5773714, - "09": 3605944, - "10": 989948, - "11": 689545, - "12": 21538187, - "13": 10711908, - "15": 1455271, - "16": 1839106, - "17": 12812508, - "18": 6785528, - "19": 3190369, - "20": 2937880, - "21": 4505836, - "22": 4657757, - "23": 1362359, - "24": 6177224, - "25": 7029917, - "26": 10077331, - "27": 5706494, - "28": 2961279, - "29": 6154913, - "30": 1084225, - "31": 1961504, - "32": 3104614, - "33": 1377529, - "34": 9288994, - "35": 2117522, - "36": 20201249, - "37": 10439388, - "38": 779094, - "39": 11799448, - "40": 3959353, - "41": 4237256, - "42": 13002700, - "44": 1097379, - "45": 5118425, - "46": 886667, - "47": 6910840, - "48": 29145505, - "49": 3271616, - "50": 643077, - "51": 8631393, - "53": 7705281, - "54": 1793716, - "55": 5893718, - "56": 576851, - } - - for fips, population in census_2020.items(): - self._add_target( - group_name="state_population", - category=TargetCategory.GEOGRAPHY, - level=TargetLevel.STATE, - name=f"population_{fips}", - value=population, - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - 
feature="state_fips", - operator=FilterOperator.EQ, - value=fips, - ), - ), - source="Census 2020", - units="persons", - ) - - fips_to_abbr = { - "01": "AL", - "02": "AK", - "04": "AZ", - "05": "AR", - "06": "CA", - "08": "CO", - "09": "CT", - "10": "DE", - "11": "DC", - "12": "FL", - "13": "GA", - "15": "HI", - "16": "ID", - "17": "IL", - "18": "IN", - "19": "IA", - "20": "KS", - "21": "KY", - "22": "LA", - "23": "ME", - "24": "MD", - "25": "MA", - "26": "MI", - "27": "MN", - "28": "MS", - "29": "MO", - "30": "MT", - "31": "NE", - "32": "NV", - "33": "NH", - "34": "NJ", - "35": "NM", - "36": "NY", - "37": "NC", - "38": "ND", - "39": "OH", - "40": "OK", - "41": "OR", - "42": "PA", - "44": "RI", - "45": "SC", - "46": "SD", - "47": "TN", - "48": "TX", - "49": "UT", - "50": "VT", - "51": "VA", - "53": "WA", - "54": "WV", - "55": "WI", - "56": "WY", - } - - try: - cd_df = pd.read_parquet("data/district_targets.parquet") - except FileNotFoundError: - cd_df = None - - if cd_df is not None: - for _, row in cd_df.iterrows(): - fips_id = row["district_id"] - state_fips, district_num = fips_id.split("-") - state_abbr = fips_to_abbr.get(state_fips, state_fips) - cd_id = f"{state_abbr}-AL" if district_num == "00" else f"{state_abbr}-{district_num}" - self._add_target( - group_name="cd_population", - category=TargetCategory.GEOGRAPHY, - level=TargetLevel.CD, - name=f"cd_{fips_id}", - value=float(row["population"]), - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="cd_id", - operator=FilterOperator.EQ, - value=cd_id, - ), - ), - source="Census ACS", - units="persons", - ) - - try: - blocks = pd.read_parquet("data/block_probabilities.parquet") - except FileNotFoundError: - blocks = None - - if blocks is None: - return - - sldu_col = "sldu_id" if "sldu_id" in blocks.columns else "sldu_geoid" - if sldu_col in blocks.columns: - sldu_pop = blocks.groupby(sldu_col)["population"].sum() - for sldu_id, population in 
sldu_pop.items(): - if pd.notna(sldu_id) and population > 0: - self._add_target( - group_name="sldu_population", - category=TargetCategory.GEOGRAPHY, - level=TargetLevel.STATE, - name=f"sldu_{sldu_id}", - value=float(population), - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="sldu_id", - operator=FilterOperator.EQ, - value=sldu_id, - ), - ), - source="Census", - units="persons", - ) - - sldl_col = "sldl_id" if "sldl_id" in blocks.columns else "sldl_geoid" - if sldl_col in blocks.columns: - sldl_pop = blocks.groupby(sldl_col)["population"].sum() - for sldl_id, population in sldl_pop.items(): - if pd.notna(sldl_id) and population > 0: - self._add_target( - group_name="sldl_population", - category=TargetCategory.GEOGRAPHY, - level=TargetLevel.STATE, - name=f"sldl_{sldl_id}", - value=float(population), - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="sldl_id", - operator=FilterOperator.EQ, - value=sldl_id, - ), - ), - source="Census", - units="persons", - ) - - def _add_income_targets(self) -> None: - soi_income = { - "employment_income": (9_022_352_941_000, "employment_income", True), - "self_employment_income": (436_400_000_000, "self_employment_income", True), - "social_security": (774_000_000_000, "social_security", True), - "taxable_pension_income": (827_600_000_000, "taxable_pension_income", True), - "tax_exempt_pension_income": (580_400_000_000, "tax_exempt_pension_income", True), - "unemployment_compensation": (208_000_000_000, "unemployment_compensation", True), - "dividend_income": (260_200_000_000, "dividend_income", False), - "interest_income": (127_400_000_000, "interest_income", False), - "rental_income": (46_000_000_000, "rental_income", True), - "long_term_capital_gains": (1_137_000_000_000, "long_term_capital_gains", False), - "short_term_capital_gains": (-72_000_000_000, "short_term_capital_gains", False), - "partnership_s_corp_income": 
(976_000_000_000, "partnership_s_corp_income", False), - "farm_income": (-26_141_944_000, "farm_income", False), - "alimony_income": (8_500_000_000, "alimony_income", True), - } - - for name, (value, measure, in_cps) in soi_income.items(): - self._add_target( - group_name="irs_soi_income", - category=TargetCategory.INCOME, - level=TargetLevel.NATIONAL, - name=name, - value=value, - entity=EntityType.PERSON, - aggregation=TargetAggregation.SUM, - measure=measure, - source="IRS SOI", - units="USD", - available_in_cps=in_cps, - requires_imputation=not in_cps, - notes="" if in_cps else "Underreported in CPS, requires imputation", - ) - - def _add_benefit_targets(self) -> None: - benefit_totals = { - "snap_spending": ( - 103_100_000_000, - EntityType.HOUSEHOLD, - "snap", - TargetAggregation.SUM, - (), - "USD", - "CBO", - ), - "snap_participation": ( - 41_209_000, - EntityType.PERSON, - None, - TargetAggregation.COUNT, - ( - TargetFilter( - feature="snap", - operator=FilterOperator.GT, - value=0, - ), - ), - "persons", - "USDA", - ), - "ssi_spending": ( - 78_500_000_000, - EntityType.PERSON, - "ssi", - TargetAggregation.SUM, - (), - "USD", - "CBO", - ), - "ssi_participation": ( - 7_400_000, - EntityType.PERSON, - None, - TargetAggregation.COUNT, - ( - TargetFilter( - feature="ssi", - operator=FilterOperator.GT, - value=0, - ), - ), - "persons", - "SSA", - ), - "social_security_spending": ( - 2_623_800_000_000, - EntityType.PERSON, - "social_security", - TargetAggregation.SUM, - (), - "USD", - "CBO", - ), - "social_security_participation": ( - 66_000_000, - EntityType.PERSON, - None, - TargetAggregation.COUNT, - ( - TargetFilter( - feature="social_security", - operator=FilterOperator.GT, - value=0, - ), - ), - "persons", - "SSA", - ), - "eitc_spending": ( - 72_700_000_000, - EntityType.TAX_UNIT, - "eitc", - TargetAggregation.SUM, - (), - "USD", - "Treasury", - ), - "unemployment_spending": ( - 59_100_000_000, - EntityType.PERSON, - "unemployment_compensation", - 
TargetAggregation.SUM, - (), - "USD", - "CBO", - ), - } - - for name, ( - value, - entity, - measure, - aggregation, - filters, - units, - source, - ) in benefit_totals.items(): - self._add_target( - group_name="benefit_programs", - category=TargetCategory.BENEFITS, - level=TargetLevel.NATIONAL, - name=name, - value=value, - entity=entity, - aggregation=aggregation, - measure=measure, - filters=filters, - source=source, - units=units, - available_in_cps=True, - ) - - def _add_health_targets(self) -> None: - medicaid_categories = [ - "child", - "aged", - "disabled", - "expansion_adults", - "non_expansion_adults", - ] - - for category in medicaid_categories: - self._add_target( - group_name="health_insurance", - category=TargetCategory.HEALTH, - level=TargetLevel.NATIONAL, - name=f"medicaid_{category}_national", - value=0, - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="medicaid", - operator=FilterOperator.GT, - value=0, - ), - ), - source="HHS/CMS", - units="persons", - available_in_cps=False, - requires_imputation=True, - notes="Requires eligibility modeling", - ) - - self._add_target( - group_name="health_insurance", - category=TargetCategory.HEALTH, - level=TargetLevel.NATIONAL, - name="chip_enrollment_national", - value=0, - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="chip", - operator=FilterOperator.GT, - value=0, - ), - ), - source="CMS", - units="persons", - available_in_cps=False, - requires_imputation=True, - ) - - self._add_target( - group_name="health_insurance", - category=TargetCategory.HEALTH, - level=TargetLevel.NATIONAL, - name="aca_enrollment_national", - value=0, - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="aca_enrolled", - operator=FilterOperator.GT, - value=0, - ), - ), - source="CMS", - units="persons", - available_in_cps=False, - requires_imputation=True, - ) - - def 
_add_tax_targets(self) -> None: - tax_targets = { - "income_tax_total": ( - 4_412_800_000_000, - TargetAggregation.SUM, - "income_tax", - (), - "USD", - ), - "payroll_tax_total": ( - 2_605_200_000_000, - TargetAggregation.SUM, - "payroll_tax", - (), - "USD", - ), - "eitc_claims": ( - 25_000_000, - TargetAggregation.COUNT, - None, - ( - TargetFilter( - feature="eitc", - operator=FilterOperator.GT, - value=0, - ), - ), - "returns", - ), - "ctc_claims": ( - 35_000_000, - TargetAggregation.COUNT, - None, - ( - TargetFilter( - feature="ctc", - operator=FilterOperator.GT, - value=0, - ), - ), - "returns", - ), - } - - for name, (value, aggregation, measure, filters, units) in tax_targets.items(): - self._add_target( - group_name="tax_aggregates", - category=TargetCategory.TAX, - level=TargetLevel.NATIONAL, - name=name, - value=value, - entity=EntityType.TAX_UNIT, - aggregation=aggregation, - measure=measure, - filters=filters, - source="CBO/IRS", - units=units, - available_in_cps=False, - requires_imputation=True, - notes="Requires tax calculation", - ) - - def _add_demographic_targets(self) -> None: - filing_status = { - "single": 75_000_000, - "married_joint": 55_000_000, - "married_separate": 3_000_000, - "head_of_household": 22_000_000, - } - - for status, count in filing_status.items(): - self._add_target( - group_name="demographics", - category=TargetCategory.DEMOGRAPHICS, - level=TargetLevel.NATIONAL, - name=f"filing_status_{status}", - value=count, - entity=EntityType.TAX_UNIT, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="filing_status", - operator=FilterOperator.EQ, - value=status, - ), - ), - source="IRS SOI", - units="returns", - available_in_cps=False, - requires_imputation=True, - notes="Requires tax unit modeling", - ) - - def get_group(self, name: str) -> TargetGroup | None: - """Get a target group by name.""" - return self.groups.get(name) - - def get_all_targets(self) -> list[TargetSpec]: - """Get all targets as a flat 
list.""" - all_targets: list[TargetSpec] = [] - for group in self.groups.values(): - all_targets.extend(group.targets) - return all_targets - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load a canonical target set through the core provider protocol.""" - query = query or TargetQuery() - provider_filters = query.provider_filters - targets = self.select_targets( - categories=provider_filters.get("categories"), - levels=provider_filters.get("levels"), - groups=provider_filters.get("groups"), - only_available=bool(provider_filters.get("only_available", False)), - entity=query.entity, - ) - return apply_target_query( - TargetSet(targets), - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def select_targets( - self, - *, - categories: list[TargetCategory] | None = None, - levels: list[TargetLevel] | None = None, - groups: list[str] | None = None, - only_available: bool = False, - entity: EntityType | str | None = None, - ) -> list[TargetSpec]: - """Select canonical targets by US metadata and entity.""" - resolved_entity = ( - entity - if entity is None or isinstance(entity, EntityType) - else EntityType(entity) - ) - - selected: list[TargetSpec] = [] - for target in self.get_all_targets(): - if categories and target_category(target) not in categories: - continue - if levels and target_level(target) not in levels: - continue - if groups and target_group_name(target) not in groups: - continue - if only_available and not target_available_in_cps(target): - continue - if resolved_entity is not None and target.entity is not resolved_entity: - continue - selected.append(target) - return selected - - def get_available_targets(self) -> list[TargetSpec]: - """Get targets that are available in CPS data.""" - return [target for target in self.get_all_targets() if target_available_in_cps(target)] - - def get_targets_by_category(self, category: TargetCategory) 
-> list[TargetSpec]: - """Get targets by US category metadata.""" - return [target for target in self.get_all_targets() if target_category(target) is category] - - def get_targets_by_level(self, level: TargetLevel) -> list[TargetSpec]: - """Get targets by US level metadata.""" - return [target for target in self.get_all_targets() if target_level(target) is level] - - def summary(self) -> dict[str, Any]: - """Get summary of registry contents.""" - all_targets = self.get_all_targets() - available = self.get_available_targets() - - by_category = {category.value: len(self.get_targets_by_category(category)) for category in TargetCategory} - by_level = {level.value: len(self.get_targets_by_level(level)) for level in TargetLevel} - - return { - "total_targets": len(all_targets), - "available_in_cps": len(available), - "requires_imputation": sum( - 1 for target in all_targets if target_requires_imputation(target) - ), - "by_category": by_category, - "by_level": by_level, - "groups": {name: len(group) for name, group in self.groups.items()}, - } - - def to_dataframe(self) -> pd.DataFrame: - """Convert the registry to a tabular summary.""" - records = [] - for target in self.get_all_targets(): - records.append( - { - "name": target.name, - "entity": target.entity.value, - "category": target_category(target).value if target_category(target) else None, - "level": target_level(target).value if target_level(target) else None, - "group": target_group_name(target), - "value": target.value, - "measure": target.measure, - "aggregation": target.aggregation.value, - "source": target.source, - "units": target.units, - "available_in_cps": target_available_in_cps(target), - "requires_imputation": target_requires_imputation(target), - "notes": target_notes(target), - "filters": [ - { - "feature": target_filter.feature, - "operator": target_filter.operator.value, - "value": target_filter.value, - } - for target_filter in target.filters - ], - } - ) - return pd.DataFrame(records) - - -def 
get_registry() -> TargetRegistry: - """Get the default US target registry.""" - return TargetRegistry() - - -def print_registry_summary() -> None: - """Print a summary of available US targets.""" - registry = get_registry() - summary = registry.summary() - - print("=" * 70) - print("MICROPLEX TARGET REGISTRY") - print("=" * 70) - print(f"\nTotal targets: {summary['total_targets']}") - print(f"Available in CPS: {summary['available_in_cps']}") - print(f"Requires imputation: {summary['requires_imputation']}") - - print("\nBy category:") - for category, count in summary["by_category"].items(): - print(f" {category}: {count}") - - print("\nBy level:") - for level, count in summary["by_level"].items(): - print(f" {level}: {count}") - - print("\nBy group:") - for name, count in summary["groups"].items(): - print(f" {name}: {count}") diff --git a/src/microplex_us/targets/__init__.py b/src/microplex_us/targets/__init__.py deleted file mode 100644 index 33769e19..00000000 --- a/src/microplex_us/targets/__init__.py +++ /dev/null @@ -1,111 +0,0 @@ -"""US-specific target mappings.""" - -from microplex_us.targets.aca_ptc import ( - ACA_AVERAGE_MONTHLY_APTC_CONCEPT, - ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT, - ACAPTCBaseAPTCPolicy, - ACAPTCMultiplierInput, - ACAPTCMultiplierRow, - aca_ptc_multiplier_inputs_from_arch_consumer_facts, - build_aca_ptc_multiplier_rows, - load_arch_consumer_fact_jsonl_rows, - write_policyengine_aca_ptc_multiplier_csv, -) -from microplex_us.targets.adapters import ( - POLICYENGINE_US_COUNT_ENTITIES, - policyengine_db_target_to_canonical_spec, - policyengine_db_targets_to_canonical_set, -) -from microplex_us.targets.arch import ( - ArchCompositeSQLiteTargetProvider, - ArchConsumerFactJSONLTargetProvider, - ArchFactSQLiteTargetProvider, - ArchSQLiteTargetProvider, - ArchTargetCellCoverage, - ArchTargetGapQueueReport, - ArchTargetGapQueueRow, - ArchTargetParityReport, - ArchTargetParityRow, - ArchTargetProfileCoverageReport, - ArchTargetRecord, - 
SOIAgingFactors, - arch_target_record_to_canonical_spec, - resolve_arch_sqlite_target_provider, - summarize_arch_target_gap_queue, - summarize_arch_target_parity, - summarize_arch_target_profile_coverage, -) -from microplex_us.targets.census_blocks import ( - CENSUS_BLOCK_GEOGRAPHY_YEAR, - CENSUS_BLOCK_POPULATION_GEO_LEVELS, - CENSUS_BLOCK_POPULATION_ROLLUPS, - CENSUS_BLOCK_POPULATION_SOURCE, - CENSUS_BLOCK_POPULATION_UNITS, - CENSUS_BLOCK_POPULATION_VARIABLE, - CENSUS_BLOCK_SOURCE_YEAR, - CENSUS_BLOCK_TARGET_PERIOD, - DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS, - CensusBlockPopulationRollup, - CensusBlockPopulationTargetProvider, - build_census_block_population_targets, -) -from microplex_us.targets.rac_mapping import ( - MICRODATA_TO_RAC, - POLICYENGINE_TO_RAC, - RAC_VARIABLE_MAP, - RACVariable, - get_rac_for_microdata_column, - get_rac_for_pe_variable, - get_rac_for_target, -) - -__all__ = [ - "ArchTargetCellCoverage", - "ArchTargetProfileCoverageReport", - "ArchSQLiteTargetProvider", - "ArchCompositeSQLiteTargetProvider", - "ArchConsumerFactJSONLTargetProvider", - "ArchFactSQLiteTargetProvider", - "ArchTargetRecord", - "ArchTargetGapQueueReport", - "ArchTargetGapQueueRow", - "ArchTargetParityReport", - "ArchTargetParityRow", - "POLICYENGINE_US_COUNT_ENTITIES", - "CENSUS_BLOCK_GEOGRAPHY_YEAR", - "CENSUS_BLOCK_POPULATION_GEO_LEVELS", - "CENSUS_BLOCK_POPULATION_ROLLUPS", - "CENSUS_BLOCK_POPULATION_SOURCE", - "CENSUS_BLOCK_POPULATION_UNITS", - "CENSUS_BLOCK_POPULATION_VARIABLE", - "CENSUS_BLOCK_SOURCE_YEAR", - "CENSUS_BLOCK_TARGET_PERIOD", - "DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS", - "CensusBlockPopulationRollup", - "CensusBlockPopulationTargetProvider", - "SOIAgingFactors", - "arch_target_record_to_canonical_spec", - "build_census_block_population_targets", - "summarize_arch_target_gap_queue", - "summarize_arch_target_parity", - "summarize_arch_target_profile_coverage", - "policyengine_db_target_to_canonical_spec", - 
"policyengine_db_targets_to_canonical_set", - "resolve_arch_sqlite_target_provider", - "ACA_AVERAGE_MONTHLY_APTC_CONCEPT", - "ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT", - "ACAPTCBaseAPTCPolicy", - "ACAPTCMultiplierInput", - "ACAPTCMultiplierRow", - "aca_ptc_multiplier_inputs_from_arch_consumer_facts", - "build_aca_ptc_multiplier_rows", - "load_arch_consumer_fact_jsonl_rows", - "write_policyengine_aca_ptc_multiplier_csv", - "RACVariable", - "RAC_VARIABLE_MAP", - "POLICYENGINE_TO_RAC", - "MICRODATA_TO_RAC", - "get_rac_for_target", - "get_rac_for_pe_variable", - "get_rac_for_microdata_column", -] diff --git a/src/microplex_us/targets/aca_ptc.py b/src/microplex_us/targets/aca_ptc.py deleted file mode 100644 index 9f61623a..00000000 --- a/src/microplex_us/targets/aca_ptc.py +++ /dev/null @@ -1,465 +0,0 @@ -"""ACA PTC target-construction helpers for US target sources.""" - -from __future__ import annotations - -import argparse -import csv -from collections.abc import Iterable, Mapping -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Literal - -from microplex.targets import ( - arch_consumer_fact_concept, - arch_consumer_fact_numeric_value, - arch_consumer_fact_period, - arch_consumer_fact_source_record_id, - load_arch_consumer_fact_jsonl_rows, -) - -ACAPTCBaseAPTCPolicy = Literal[ - "oep", - "effectuated", - "oep_with_effectuated_fallback", -] - -ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT = ( - "cms_aca.marketplace_effectuated_enrollment" -) -ACA_AVERAGE_MONTHLY_APTC_CONCEPT = "cms_aca.average_monthly_aptc" - - -@dataclass(frozen=True) -class ACAPTCMultiplierInput: - """Publisher-source inputs for one state's ACA PTC multiplier row.""" - - state: str - enroll_base: float - enroll_target: float - aptc_base: float - aptc_target: float - base_year: int = 2022 - target_year: int = 2024 - enroll_base_source_record_id: str | None = None - enroll_target_source_record_id: str | None = None - aptc_base_source_record_id: str | None = 
None - aptc_target_source_record_id: str | None = None - aptc_base_source_kind: str | None = None - aptc_target_source_kind: str | None = None - - -@dataclass(frozen=True) -class ACAPTCMultiplierRow: - """PE-compatible ACA PTC multiplier row for one state.""" - - state: str - enroll_base: float - enroll_target: float - vol_mult: float - aptc_base: float - aptc_target: float - val_mult: float - base_year: int = 2022 - target_year: int = 2024 - enroll_base_source_record_id: str | None = None - enroll_target_source_record_id: str | None = None - aptc_base_source_record_id: str | None = None - aptc_target_source_record_id: str | None = None - aptc_base_source_kind: str | None = None - aptc_target_source_kind: str | None = None - - @property - def amount_mult(self) -> float: - """Multiplier PE applies to the ACA PTC amount target.""" - - return self.vol_mult * self.val_mult - - def target_factors(self) -> dict[str, float]: - """Return the variable factors consumed by PE's state uprating path.""" - - return { - "tax_unit_count": self.vol_mult, - "aca_ptc": self.amount_mult, - } - - def to_policyengine_csv_row(self) -> dict[str, float | int | str]: - """Return a row with PE's incumbent ACA multiplier CSV column names.""" - - return { - "state": self.state, - f"enroll_{self.base_year}": _source_csv_number(self.enroll_base), - f"enroll_{self.target_year}": _source_csv_number(self.enroll_target), - "vol_mult": self.vol_mult, - f"aptc_{self.base_year}": _source_csv_number(self.aptc_base), - f"aptc_{self.target_year}": _source_csv_number(self.aptc_target), - "val_mult": self.val_mult, - } - - -@dataclass(frozen=True) -class _ACAStateFact: - state: str - period: int - value: float - concept: str - source_record_id: str | None - source_kind: str | None - - -def build_aca_ptc_multiplier_rows( - inputs: Iterable[ACAPTCMultiplierInput], -) -> tuple[ACAPTCMultiplierRow, ...]: - """Build state ACA PTC multiplier rows from explicit source inputs.""" - - rows = [] - for item in inputs: 
- _validate_positive_source_value(item.enroll_base, "enroll_base", item.state) - _validate_positive_source_value(item.enroll_target, "enroll_target", item.state) - _validate_positive_source_value(item.aptc_base, "aptc_base", item.state) - _validate_positive_source_value(item.aptc_target, "aptc_target", item.state) - rows.append( - ACAPTCMultiplierRow( - state=item.state, - base_year=item.base_year, - target_year=item.target_year, - enroll_base=item.enroll_base, - enroll_target=item.enroll_target, - vol_mult=item.enroll_target / item.enroll_base, - aptc_base=item.aptc_base, - aptc_target=item.aptc_target, - val_mult=item.aptc_target / item.aptc_base, - enroll_base_source_record_id=item.enroll_base_source_record_id, - enroll_target_source_record_id=item.enroll_target_source_record_id, - aptc_base_source_record_id=item.aptc_base_source_record_id, - aptc_target_source_record_id=item.aptc_target_source_record_id, - aptc_base_source_kind=item.aptc_base_source_kind, - aptc_target_source_kind=item.aptc_target_source_kind, - ) - ) - return tuple(sorted(rows, key=lambda row: row.state)) - - -def aca_ptc_multiplier_inputs_from_arch_consumer_facts( - rows: Iterable[Mapping[str, Any]], - *, - base_year: int = 2022, - target_year: int = 2024, - base_aptc_policy: ACAPTCBaseAPTCPolicy = "oep_with_effectuated_fallback", -) -> tuple[ACAPTCMultiplierInput, ...]: - """Collect PE-style ACA PTC multiplier inputs from Arch consumer facts. - - The publisher-source recipe uses KFF full-year effectuated enrollment for - the volume ratio, CMS OEP average APTC where available for the base-year - value ratio base, CMS full-year 2022 APTC as the fallback for missing OEP - state values, and CMS OEP average APTC for the target-year value ratio. 
- """ - - enrollment: dict[tuple[int, str], _ACAStateFact] = {} - oep_aptc: dict[tuple[int, str], _ACAStateFact] = {} - effectuated_aptc: dict[tuple[int, str], _ACAStateFact] = {} - - for row in rows: - fact = _aca_state_fact_from_arch_consumer_fact(row) - if fact is None: - continue - key = (fact.period, fact.state) - if fact.concept == ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT: - enrollment[key] = fact - elif fact.concept == ACA_AVERAGE_MONTHLY_APTC_CONCEPT: - if fact.source_kind == "oep": - oep_aptc[key] = fact - elif fact.source_kind == "effectuated": - effectuated_aptc[key] = fact - - states = sorted( - { - state - for period, state in enrollment - if period == base_year and (target_year, state) in enrollment - } - ) - inputs = [] - missing: list[str] = [] - for state in states: - enroll_base = enrollment[(base_year, state)] - enroll_target = enrollment[(target_year, state)] - aptc_base = _select_base_aptc_fact( - state, - base_year=base_year, - policy=base_aptc_policy, - oep_aptc=oep_aptc, - effectuated_aptc=effectuated_aptc, - ) - aptc_target = oep_aptc.get((target_year, state)) - if aptc_base is None: - missing.append(f"{state} {base_year} average APTC") - continue - if aptc_target is None: - missing.append(f"{state} {target_year} OEP average APTC") - continue - inputs.append( - ACAPTCMultiplierInput( - state=state, - base_year=base_year, - target_year=target_year, - enroll_base=enroll_base.value, - enroll_target=enroll_target.value, - aptc_base=aptc_base.value, - aptc_target=aptc_target.value, - enroll_base_source_record_id=enroll_base.source_record_id, - enroll_target_source_record_id=enroll_target.source_record_id, - aptc_base_source_record_id=aptc_base.source_record_id, - aptc_target_source_record_id=aptc_target.source_record_id, - aptc_base_source_kind=aptc_base.source_kind, - aptc_target_source_kind=aptc_target.source_kind, - ) - ) - - if missing: - preview = ", ".join(missing[:5]) - suffix = "" if len(missing) <= 5 else f", and {len(missing) - 
5} more" - raise ValueError(f"Missing ACA PTC source facts: {preview}{suffix}") - return tuple(inputs) - - -def write_policyengine_aca_ptc_multiplier_csv( - rows: Iterable[ACAPTCMultiplierRow], - path: str | Path, -) -> None: - """Write PE-compatible ACA PTC multiplier rows.""" - - rows = tuple(rows) - if not rows: - raise ValueError("Cannot write ACA PTC multiplier CSV with no rows.") - year_pairs = {(row.base_year, row.target_year) for row in rows} - if len(year_pairs) != 1: - raise ValueError("ACA PTC multiplier CSV rows must use one year pair.") - base_year, target_year = next(iter(year_pairs)) - fieldnames = [ - "state", - f"enroll_{base_year}", - f"enroll_{target_year}", - "vol_mult", - f"aptc_{base_year}", - f"aptc_{target_year}", - "val_mult", - ] - with Path(path).open("w", newline="") as file: - writer = csv.DictWriter(file, fieldnames=fieldnames) - writer.writeheader() - for row in rows: - writer.writerow(row.to_policyengine_csv_row()) - - -def main(argv: list[str] | None = None) -> int: - """Build a PE-compatible ACA PTC multiplier CSV from Arch consumer facts.""" - - parser = argparse.ArgumentParser( - description=( - "Build a PE-compatible ACA PTC multiplier CSV from Arch " - "consumer_facts.jsonl files." 
- ) - ) - parser.add_argument( - "consumer_facts", - nargs="+", - help="Arch consumer_facts.jsonl path(s) containing ACA source facts.", - ) - parser.add_argument( - "--out", - required=True, - help="Output CSV path.", - ) - parser.add_argument( - "--base-year", - type=int, - default=2022, - help="Source year for the multiplier denominator.", - ) - parser.add_argument( - "--target-year", - type=int, - default=2024, - help="Target year for the multiplier numerator.", - ) - parser.add_argument( - "--base-aptc-policy", - choices=("oep", "effectuated", "oep_with_effectuated_fallback"), - default="oep_with_effectuated_fallback", - help="Source selection policy for base-year average monthly APTC.", - ) - args = parser.parse_args(argv) - - consumer_fact_rows = load_arch_consumer_fact_jsonl_rows(args.consumer_facts) - inputs = aca_ptc_multiplier_inputs_from_arch_consumer_facts( - consumer_fact_rows, - base_year=args.base_year, - target_year=args.target_year, - base_aptc_policy=args.base_aptc_policy, - ) - rows = build_aca_ptc_multiplier_rows(inputs) - write_policyengine_aca_ptc_multiplier_csv(rows, args.out) - print(f"Wrote {len(rows)} ACA PTC multiplier rows to {args.out}") - return 0 - - -def _select_base_aptc_fact( - state: str, - *, - base_year: int, - policy: ACAPTCBaseAPTCPolicy, - oep_aptc: Mapping[tuple[int, str], _ACAStateFact], - effectuated_aptc: Mapping[tuple[int, str], _ACAStateFact], -) -> _ACAStateFact | None: - key = (base_year, state) - if policy == "oep": - return oep_aptc.get(key) - if policy == "effectuated": - return effectuated_aptc.get(key) - if policy == "oep_with_effectuated_fallback": - return oep_aptc.get(key) or effectuated_aptc.get(key) - raise ValueError(f"Unsupported ACA PTC base APTC policy: {policy}") - - -def _aca_state_fact_from_arch_consumer_fact( - row: Mapping[str, Any], -) -> _ACAStateFact | None: - concept = _arch_consumer_fact_concept(row) - if concept not in { - ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT, - 
ACA_AVERAGE_MONTHLY_APTC_CONCEPT, - }: - return None - geography = _mapping(row.get("geography")) - if str(geography.get("level") or "").lower() != "state": - return None - state = _arch_consumer_fact_state(row, geography) - if not state: - return None - return _ACAStateFact( - state=state, - period=_arch_consumer_fact_period(row), - value=_json_numeric_value(row.get("value")), - concept=concept, - source_record_id=_arch_consumer_fact_source_record_id(row), - source_kind=_aca_source_kind(row), - ) - - -def _arch_consumer_fact_concept(row: Mapping[str, Any]) -> str | None: - return arch_consumer_fact_concept(row) - - -def _arch_consumer_fact_period(row: Mapping[str, Any]) -> int: - return arch_consumer_fact_period(row) - - -def _arch_consumer_fact_state( - row: Mapping[str, Any], - geography: Mapping[str, Any], -) -> str | None: - name = geography.get("name") - if name: - return str(name) - source_record_id = _arch_consumer_fact_source_record_id(row) or "" - for token in source_record_id.split("."): - state = _STATE_ABBR_TO_NAME.get(token.lower()) - if state is not None: - return state - return None - - -def _arch_consumer_fact_source_record_id(row: Mapping[str, Any]) -> str | None: - source_record_id = arch_consumer_fact_source_record_id(row) - if source_record_id is not None: - return source_record_id - fallback = row.get("source_record_id") - return str(fallback) if fallback else None - - -def _aca_source_kind(row: Mapping[str, Any]) -> str | None: - source_record_id = (_arch_consumer_fact_source_record_id(row) or "").lower() - if ".oep" in source_record_id: - return "oep" - if ".effectuated_enrollment." 
in source_record_id: - return "effectuated" - source = _mapping(row.get("source")) - source_table = str(source.get("source_table") or "").lower() - if "open enrollment" in source_table or "oep" in source_table: - return "oep" - if "effectuated enrollment" in source_table: - return "effectuated" - return None - - -def _validate_positive_source_value(value: float, label: str, state: str) -> None: - if value <= 0: - raise ValueError(f"{state} {label} must be positive; got {value}.") - - -def _json_numeric_value(value: Any) -> float: - return arch_consumer_fact_numeric_value(value) - - -def _source_csv_number(value: float) -> float | int: - numeric = float(value) - return int(numeric) if numeric.is_integer() else numeric - - -def _mapping(value: Any) -> Mapping[str, Any]: - return value if isinstance(value, Mapping) else {} - - -_STATE_ABBR_TO_NAME = { - "ak": "Alaska", - "al": "Alabama", - "ar": "Arkansas", - "az": "Arizona", - "ca": "California", - "co": "Colorado", - "ct": "Connecticut", - "dc": "District of Columbia", - "de": "Delaware", - "fl": "Florida", - "ga": "Georgia", - "hi": "Hawaii", - "ia": "Iowa", - "id": "Idaho", - "il": "Illinois", - "in": "Indiana", - "ks": "Kansas", - "ky": "Kentucky", - "la": "Louisiana", - "ma": "Massachusetts", - "md": "Maryland", - "me": "Maine", - "mi": "Michigan", - "mn": "Minnesota", - "mo": "Missouri", - "ms": "Mississippi", - "mt": "Montana", - "nc": "North Carolina", - "nd": "North Dakota", - "ne": "Nebraska", - "nh": "New Hampshire", - "nj": "New Jersey", - "nm": "New Mexico", - "nv": "Nevada", - "ny": "New York", - "oh": "Ohio", - "ok": "Oklahoma", - "or": "Oregon", - "pa": "Pennsylvania", - "ri": "Rhode Island", - "sc": "South Carolina", - "sd": "South Dakota", - "tn": "Tennessee", - "tx": "Texas", - "ut": "Utah", - "va": "Virginia", - "vt": "Vermont", - "wa": "Washington", - "wi": "Wisconsin", - "wv": "West Virginia", - "wy": "Wyoming", -} diff --git a/src/microplex_us/targets/adapters.py 
b/src/microplex_us/targets/adapters.py deleted file mode 100644 index e9183b8e..00000000 --- a/src/microplex_us/targets/adapters.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Adapters from US-specific target representations to core microplex target specs.""" - -from __future__ import annotations - -from collections.abc import Iterable - -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetFilter, - TargetSet, - TargetSimulationModifier, -) -from microplex.targets import ( - TargetSpec as CanonicalTargetSpec, -) - -from microplex_us.microdata_roles import ( - PolicyEngineUSVariableRole, - policyengine_us_variable_role, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSConstraint, - PolicyEngineUSDBTarget, -) - -POLICYENGINE_US_COUNT_ENTITIES: dict[str, EntityType] = { - "household_count": EntityType.HOUSEHOLD, - "person_count": EntityType.PERSON, - "tax_unit_count": EntityType.TAX_UNIT, - "spm_unit_count": EntityType.SPM_UNIT, - "family_count": EntityType.FAMILY, -} - -POLICYENGINE_US_ACTUAL_ACA_PTC_VARIABLE = "assigned_aca_ptc" - - -def policyengine_db_target_to_canonical_spec( - target: PolicyEngineUSDBTarget, - *, - default_entity: EntityType | str = EntityType.HOUSEHOLD, - entity_overrides: dict[str, EntityType] | None = None, -) -> CanonicalTargetSpec: - """Translate a PolicyEngine US DB target row into the canonical core spec.""" - resolved_default_entity = ( - default_entity - if isinstance(default_entity, EntityType) - else EntityType(default_entity) - ) - resolved_entity = ( - (entity_overrides or {}).get(target.variable) - or POLICYENGINE_US_COUNT_ENTITIES.get(target.variable) - or resolved_default_entity - ) - aggregation = ( - TargetAggregation.COUNT - if target.variable.endswith("_count") - else TargetAggregation.SUM - ) - measure_variable = _policyengine_db_target_measure_variable(target) - measure = None if aggregation is TargetAggregation.COUNT else measure_variable - model_variable = 
measure_variable if measure is not None else target.variable - filters = tuple( - _policyengine_db_constraint_to_target_filter(target, constraint) - for constraint in target.constraints - ) - - return CanonicalTargetSpec( - name=f"policyengine_us_target_{target.target_id}", - entity=resolved_entity, - value=target.value, - period=target.period, - measure=measure, - aggregation=aggregation, - filters=filters, - tolerance=target.tolerance, - source=target.source, - description=target.notes, - sim_modifiers=_policyengine_db_target_sim_modifiers( - target=target, - model_variable=model_variable, - ), - metadata={ - "target_id": target.target_id, - "variable": target.variable, - "stratum_id": target.stratum_id, - "stratum_definition_hash": target.definition_hash, - "parent_stratum_id": target.parent_stratum_id, - "reform_id": target.reform_id, - "active": target.active, - "geo_level": target.geo_level, - "geographic_id": target.geographic_id, - "domain_variable": target.domain_variable, - "domain_variables": target.domain_variables, - "model_variable_role": policyengine_us_variable_role(model_variable).value, - "target_semantic": ( - "count" if aggregation is TargetAggregation.COUNT else "amount" - ), - "constraint_count": len(target.constraints), - }, - ) - - -def _policyengine_db_target_uses_aca_ptc(target: PolicyEngineUSDBTarget) -> bool: - return ( - target.variable == "aca_ptc" - or "aca_ptc" in target.domain_variables - or any(constraint.variable == "aca_ptc" for constraint in target.constraints) - ) - - -def _policyengine_db_target_measure_variable(target: PolicyEngineUSDBTarget) -> str: - if target.variable == "aca_ptc": - return POLICYENGINE_US_ACTUAL_ACA_PTC_VARIABLE - return target.variable - - -def _policyengine_db_target_sim_modifiers( - *, - target: PolicyEngineUSDBTarget, - model_variable: str, -) -> tuple[TargetSimulationModifier, ...]: - features = tuple( - dict.fromkeys( - ( - model_variable, - *( - _policyengine_db_constraint_feature(target, 
constraint) - for constraint in target.constraints - ), - ) - ) - ) - calculated_features = sorted( - feature - for feature in features - if policyengine_us_variable_role(feature) - is PolicyEngineUSVariableRole.CALCULATED_OUTPUT - ) - takeup_features = sorted( - feature - for feature in features - if policyengine_us_variable_role(feature) is PolicyEngineUSVariableRole.TAKEUP_INPUT - ) - - modifiers: list[TargetSimulationModifier] = [] - if calculated_features: - modifiers.append( - TargetSimulationModifier( - "policyengine_us_materialize", - parameters={"features": calculated_features}, - ) - ) - if takeup_features: - modifiers.append( - TargetSimulationModifier( - "rerandomize_takeup", - parameters={"features": takeup_features}, - ) - ) - return tuple(modifiers) - - -def _policyengine_db_constraint_to_target_filter( - target: PolicyEngineUSDBTarget, - constraint: PolicyEngineUSConstraint, -) -> TargetFilter: - return TargetFilter( - feature=_policyengine_db_constraint_feature(target, constraint), - operator=constraint.operation, - value=constraint.value, - ) - - -def _policyengine_db_constraint_feature( - target: PolicyEngineUSDBTarget, - constraint: PolicyEngineUSConstraint, -) -> str: - feature = constraint.variable - if feature == "aca_ptc" and _policyengine_db_target_uses_aca_ptc(target): - return POLICYENGINE_US_ACTUAL_ACA_PTC_VARIABLE - return feature - - -def policyengine_db_targets_to_canonical_set( - targets: Iterable[PolicyEngineUSDBTarget], - *, - default_entity: EntityType | str = EntityType.HOUSEHOLD, - entity_overrides: dict[str, EntityType] | None = None, -) -> TargetSet: - """Translate a sequence of PolicyEngine US DB targets into a canonical target set.""" - return TargetSet( - [ - policyengine_db_target_to_canonical_spec( - target, - default_entity=default_entity, - entity_overrides=entity_overrides, - ) - for target in targets - ] - ) diff --git a/src/microplex_us/targets/arch.py b/src/microplex_us/targets/arch.py deleted file mode 100644 index 
34a9d341..00000000 --- a/src/microplex_us/targets/arch.py +++ /dev/null @@ -1,7133 +0,0 @@ -"""Adapters from Arch target records to core Microplex target specs.""" - -from __future__ import annotations - -import json -import sqlite3 -from collections import Counter -from dataclasses import dataclass, replace -from hashlib import sha1 -from pathlib import Path -from typing import Any - -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetFilter, - TargetQuery, - TargetSet, - apply_target_query, - arch_consumer_fact_concept, - arch_consumer_fact_numeric_value, - arch_consumer_fact_period, - arch_consumer_fact_source_record_id, - load_arch_consumer_fact_jsonl_rows, -) -from microplex.targets import ( - TargetSpec as CanonicalTargetSpec, -) - -from microplex_us.geography import ( - US_STATE_ABBR_BY_FIPS, - normalize_state_legislative_district_id, -) -from microplex_us.microdata_roles import policyengine_us_variable_role -from microplex_us.policyengine.target_profiles import ( - PolicyEngineUSTargetCell, - resolve_policyengine_us_target_profile, -) - -ARCH_SOURCE_ALIASES = { - "bea": "BEA", - "bea-nipa": "BEA", - "bea-regional": "BEA", - "cbo": "CBO", - "census-decennial": "CENSUS_DECENNIAL", - "census_population_projections": "CENSUS_POPULATION_PROJECTIONS", - "irs-soi": "IRS_SOI", - "census-acs": "CENSUS_ACS", - "census-pep": "CENSUS_PEP", - "census-stc": "CENSUS_STC", - "usda-snap": "USDA_SNAP", - "cms-aca": "CMS_ACA", - "cms-medicare": "CMS_MEDICARE", - "cms-medicaid": "CMS_MEDICAID", - "federal-reserve": "FEDERAL_RESERVE", - "hhs-acf-liheap": "HHS_ACF_LIHEAP", - "hhs-acf-tanf": "HHS_ACF_TANF", -} - -ARCH_CONSTRAINT_VARIABLE_ALIASES = { - "eitc_qualifying_children": "eitc_child_count", - "is_tax_filer": "tax_unit_is_filer", -} - -ARCH_POSITIVE_CONSTRAINT_ALIASES = { - "aca": "aca_ptc", - "aca_marketplace": "aca_ptc", - "aca_ptc": "aca_ptc", - "is_aca_ptc_eligible": "aca_ptc", - 
"selected_marketplace_plan_benchmark_ratio": "aca_ptc", - "total_self_employment_income": "self_employment_income", - "used_aca_ptc": "aca_ptc", - "is_medicaid": "medicaid_enrolled", - "medicaid": "medicaid_enrolled", - "medicaid_enrolled": "medicaid_enrolled", - "snap": "snap", - "ssi": "ssi", -} - -ARCH_CONSTRAINT_OPERATOR_ALIASES = { - "=": "==", - "eq": "==", - "<>": "!=", - "ne": "!=", - "neq": "!=", -} - -ARCH_AMOUNT_VARIABLE_ALIASES = { - "adjusted_gross_income": "adjusted_gross_income", - "income_tax_liability": "income_tax", - "income_tax_before_credits_amount": "income_tax_before_credits", - "eitc_amount": "eitc", - "ctc_amount": "non_refundable_ctc", - "actc_amount": "refundable_ctc", - "taxable_interest_amount": "taxable_interest_income", - "tax_exempt_interest_amount": "tax_exempt_interest_income", - "alimony_received_amount": "alimony_income", - "alimony_paid_amount": "alimony_expense", - "personal_dividend_income_amount": "dividend_income", - "ordinary_dividends_amount": "dividend_income", - "qualified_dividends_amount": "qualified_dividend_income", - "long_term_capital_gains_amount": "long_term_capital_gains", - "short_term_capital_gains_amount": "short_term_capital_gains", - "employment_income_before_lsr_amount": "employment_income_before_lsr", - "wages_salaries_amount": "employment_income", - "net_capital_gains_amount": "net_capital_gains", - "taxable_ira_distributions_amount": "taxable_ira_distributions", - "traditional_ira_contributions": "traditional_ira_contributions", - "roth_ira_contributions": "roth_ira_contributions", - "taxable_pension_income_amount": "taxable_pension_income", - "taxable_social_security_amount": "taxable_social_security", - "unemployment_insurance_benefits": "unemployment_compensation", - "unemployment_compensation_amount": "unemployment_compensation", - "tip_income": "tip_income", - "rental_income_amount": "rental_income", - "rental_royalty_income_amount": "rental_income", - "partnership_scorp_income_amount": 
"tax_unit_partnership_s_corp_income", - "schedule_c_income_amount": "self_employment_income", - "state_local_refunds_amount": "salt_refund_income", - "qbi_amount": "qualified_business_income_deduction", - "salt_amount": "salt", - "limited_state_local_taxes_amount": "salt_deduction", - "state_local_income_or_sales_tax_amount": ( - "state_and_local_sales_or_income_tax" - ), - "charitable_amount": "charitable_deduction", - "mortgage_interest_amount": "deductible_mortgage_interest", - "mortgage_interest_paid_amount": "deductible_mortgage_interest", - "home_mortgage_personal_seller_amount": "deductible_mortgage_interest", - "deductible_points_amount": "deductible_mortgage_interest", - "investment_interest_paid_amount": "investment_interest_expense", - "interest_paid_deduction_amount": "interest_deduction", - "medical_amount": "medical_expense_deduction", - "medical_dental_expense_amount": "medical_expense_deduction", - "net_worth_amount": "net_worth", - "real_estate_taxes_amount": "real_estate_taxes", - "aca_aptc_amount": "aca_ptc", - "medicaid_benefits": "medicaid", - "medicare_part_b_premiums": "medicare_part_b_premiums", - "social_security_benefits": "social_security", - "social_security_dependents_benefits": "social_security_dependents", - "social_security_disability_benefits": "social_security_disability", - "social_security_retirement_benefits": "social_security_retirement", - "social_security_survivors_benefits": "social_security_survivors", - "snap_benefits": "snap", - "state_individual_income_tax_collections": "state_income_tax", - "ssi_payments": "ssi", - "ssi_total_payments": "ssi", - "tanf_cash_assistance": "tanf", - "net_worth": "net_worth", -} - -ARCH_SELF_DOMAIN_AMOUNT_VARIABLES = frozenset( - set(ARCH_AMOUNT_VARIABLE_ALIASES.values()) - {"adjusted_gross_income"} -) - -ARCH_SSA_PAYMENT_TYPE_AMOUNT_VARIABLES = frozenset( - { - "social_security_benefits", - "social_security_dependents_benefits", - "social_security_disability_benefits", - 
"social_security_retirement_benefits", - "social_security_survivors_benefits", - "ssi_payments", - } -) - -ARCH_IRS_SOI_ITEMIZED_DEDUCTION_AMOUNT_VARIABLES = frozenset( - { - "charitable_amount", - "deductible_points_amount", - "home_mortgage_personal_seller_amount", - "interest_paid_deduction_amount", - "investment_interest_paid_amount", - "limited_state_local_taxes_amount", - "medical_amount", - "medical_dental_expense_amount", - "mortgage_interest_paid_amount", - "real_estate_taxes_amount", - "salt_amount", - "state_local_income_or_sales_tax_amount", - } -) - -ARCH_IRS_SOI_ITEMIZED_DEDUCTION_COUNT_VARIABLES = frozenset( - { - "charitable_returns", - "deductible_points_returns", - "home_mortgage_personal_seller_returns", - "interest_paid_deduction_returns", - "investment_interest_paid_returns", - "limited_state_local_taxes_returns", - "medical_claims", - "mortgage_interest_paid_returns", - "real_estate_taxes_claims", - "salt_claims", - "state_local_income_or_sales_tax_returns", - } -) - -ARCH_IRS_SOI_ITEMIZED_DEDUCTION_TABLE_MARKERS = ( - "itemized", - "historic table 2", - "table 2.", -) - -ARCH_IRS_SOI_CREDIT_AGI_DOMAIN_VARIABLES = frozenset( - { - "actc_amount", - "actc_claims", - "ctc_amount", - "ctc_claims", - } -) - -ARCH_STATE_TO_NATIONAL_ROLLUP_VARIABLES = frozenset( - { - "aca_aptc_amount", - "actc_amount", - "actc_claims", - "charitable_amount", - "charitable_claims", - "ctc_amount", - "ctc_claims", - "medical_amount", - "medical_claims", - "mortgage_interest_amount", - "mortgage_interest_claims", - "qbi_amount", - "qbi_claims", - "salt_amount", - "salt_claims", - } -) - -ARCH_COMPONENT_SUM_TARGETS = { - "salt_amount": ( - "state_local_income_or_sales_tax_amount", - "real_estate_taxes_amount", - ), -} - -ARCH_NATIONAL_ROLLUP_STATE_FIPS = frozenset( - state_fips for state_fips in US_STATE_ABBR_BY_FIPS if state_fips != "72" -) - -ARCH_BEA_REGIONAL_WAGE_COMPONENTS = { - "bea_regional_wages_salaries_place_of_work_amount": "wages", - 
"regional_supplements_to_wages_and_salaries": "supplements", - "regional_contributions_for_government_social_insurance": "contributions", - "regional_residence_adjustment": "residence_adjustment", -} - -ARCH_BEA_REGIONAL_WAGE_COMPONENT_VARIABLES = frozenset( - ARCH_BEA_REGIONAL_WAGE_COMPONENTS -) - -ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE = ( - "employment_income_before_lsr_amount" -) - -ARCH_POSITIVE_AMOUNT_FILTER_VARIABLES = frozenset( - { - # SOI Table 1.4's taxable net capital gains amount is paired with - # returns with taxable net capital gains; PolicyEngine's variable can be - # negative, so the amount target must use the same positive domain. - "net_capital_gains", - } -) - -ARCH_TARGET_CELL_VARIABLE_ALIASES = { - "income_tax": frozenset({"income_tax_positive"}), - "self_employment_income": frozenset({"total_self_employment_income"}), -} - -ARCH_BROAD_BUSINESS_INCOME_SELF_EMPLOYMENT_BLOCKLIST = frozenset( - { - "bea_nipa.proprietors_income_with_inventory_valuation_and_capital_consumption_adjustments", - "bea_nipa.a041rc_proprietors_income_with_inventory_valuation_and_capital_consumption_adjustments", - "bea_regional.proprietors_income", - "bea_regional.sainc5n_line_70_proprietors_income", - "cbo.net_business_income", - "cbo.net_business_income_projection", - "cbo.income_source:net_business_income", - } -) - -ARCH_UNSUPPORTED_RATIO_OR_COMPONENT_VARIABLES = frozenset( - { - # BEA regional wage components are raw source ingredients. Microplex - # derives residence-adjusted state employment_income_before_lsr targets - # from the complete component panel instead of exposing the components. - *ARCH_BEA_REGIONAL_WAGE_COMPONENT_VARIABLES, - # Arch carries this as a convenient SSA diagnostic, but Microplex - # calibration constraints must be additive quantities. 
- "ssi_avg_monthly_payment", - # These are useful SSA components, but PolicyEngine exposes total SSI - # payments rather than separate federal and state supplementation - # payment variables in the exported calibration surface. - "ssi_federal_payments", - "ssi_state_supplementation", - } -) - -ARCH_SKIPPED_FACT_CONCEPTS = frozenset( - { - # CBO revenue-projection rows are useful benchmark facts, but these - # concepts are not one-to-one source-backed calibration targets. Keep - # them out of the active MP target surface until an explicit adapter - # handles each tax-return projection/composite definition. - "cbo.adjusted_gross_income_projection", - "cbo.wages_and_salaries_projection", - "cbo.taxable_interest_and_ordinary_dividends_excluding_qualified_dividends_projection", - "cbo.qualified_dividend_income_projection", - "cbo.net_capital_gain_projection", - "cbo.net_business_income_projection", - } -) - -ARCH_LATEST_CARRY_FORWARD_VARIABLES = frozenset( - { - # SSA publishes detailed SSI count/payment slices with a lag relative - # to the model year. Carry only these additive SSI controls forward; - # leave broader non-SOI sources to exact-year records unless they have - # source-specific aging logic. 
- "ssi_payments", - "ssi_recipients", - "ssi_total_payments", - } -) - -ARCH_COUNT_VARIABLE_ALIASES = { - "tax_unit_count": ("tax_unit_count", EntityType.TAX_UNIT, None), - "income_tax_liability_returns": ( - "tax_unit_count", - EntityType.TAX_UNIT, - "income_tax", - ), - "income_tax_before_credits_returns": ( - "tax_unit_count", - EntityType.TAX_UNIT, - "income_tax_before_credits", - ), - "household_count": ("household_count", EntityType.HOUSEHOLD, None), - "population": ("person_count", EntityType.PERSON, None), - "tax_filer_individual_count": ("person_count", EntityType.PERSON, None), - "snap_household_count": ("household_count", EntityType.HOUSEHOLD, "snap"), - "snap_participant_count": ("person_count", EntityType.PERSON, "snap"), - "aca_marketplace_enrollment": ( - "person_count", - EntityType.PERSON, - "aca_ptc", - ), - "aca_ptc_returns": ("tax_unit_count", EntityType.TAX_UNIT, "aca_ptc"), - "medicaid_total_enrollment": ( - "person_count", - EntityType.PERSON, - "medicaid_enrolled", - ), - "medicaid_enrollment": ("person_count", EntityType.PERSON, "medicaid_enrolled"), - "liheap_household_count": ( - "household_count", - EntityType.HOUSEHOLD, - "spm_unit_energy_subsidy_reported", - ), - "tanf_family_count": ("spm_unit_count", EntityType.SPM_UNIT, "tanf"), - "tanf_recipient_count": ("person_count", EntityType.PERSON, "tanf"), - "alimony_received_returns": ( - "tax_unit_count", - EntityType.TAX_UNIT, - "alimony_income", - ), - "alimony_paid_returns": ( - "tax_unit_count", - EntityType.TAX_UNIT, - "alimony_expense", - ), - "ssi_recipients": ("person_count", EntityType.PERSON, "ssi"), -} - -ARCH_FACT_CONCEPT_TO_TARGET = { - "irs_soi.individual_income_tax_returns": ("tax_unit_count", "COUNT"), - "irs_soi.returns_with_total_wages": ("wages_salaries_returns", "COUNT"), - "irs_soi.returns_with_taxable_net_capital_gains": ( - "net_capital_gains_returns", - "COUNT", - ), - "irs_soi.returns_with_taxable_ira_distributions": ( - "taxable_ira_distributions_returns", - 
"COUNT", - ), - "irs_soi.returns_with_taxable_pension_income": ( - "taxable_pension_income_returns", - "COUNT", - ), - "irs_soi.returns_with_unemployment_compensation": ( - "unemployment_compensation_returns", - "COUNT", - ), - "irs_soi.returns_with_taxable_social_security_benefits": ( - "taxable_social_security_returns", - "COUNT", - ), - "irs_soi.returns_with_alimony_received": ( - "alimony_received_returns", - "COUNT", - ), - "irs_soi.alimony_received": ("alimony_received_amount", "AMOUNT"), - "irs_soi.returns_with_alimony_paid": ( - "alimony_paid_returns", - "COUNT", - ), - "irs_soi.alimony_paid": ("alimony_paid_amount", "AMOUNT"), - "irs_soi.returns_with_income_tax_after_credits": ( - "income_tax_liability_returns", - "COUNT", - ), - "irs_soi.tax_filer_individuals": ( - "tax_filer_individual_count", - "COUNT", - ), - "irs_soi.returns_with_income_tax_before_credits": ( - "income_tax_before_credits_returns", - "COUNT", - ), - "irs_soi.income_tax_before_credits": ( - "income_tax_before_credits_amount", - "AMOUNT", - ), - "irs_soi.income_tax_after_credits": ("income_tax_liability", "AMOUNT"), - "irs_soi.returns_with_premium_tax_credit": ( - "aca_ptc_returns", - "COUNT", - ), - "irs_soi.premium_tax_credit": ("aca_aptc_amount", "AMOUNT"), - "irs_soi.returns_with_earned_income_credit": ("eitc_claims", "COUNT"), - "irs_soi.earned_income_credit": ("eitc_amount", "AMOUNT"), - "irs_soi.total_earned_income_credit": ("eitc_amount", "AMOUNT"), - "irs_soi.returns_with_total_earned_income_credit": ("eitc_claims", "COUNT"), - "irs_soi.returns_with_child_tax_credit": ("ctc_claims", "COUNT"), - "irs_soi.child_tax_credit": ("ctc_amount", "AMOUNT"), - "irs_soi.returns_with_additional_child_tax_credit": ( - "actc_claims", - "COUNT", - ), - "irs_soi.additional_child_tax_credit": ("actc_amount", "AMOUNT"), - "irs_soi.returns_with_real_estate_taxes": ( - "real_estate_taxes_claims", - "COUNT", - ), - "irs_soi.real_estate_taxes": ("real_estate_taxes_amount", "AMOUNT"), - 
"irs_soi.returns_with_limited_state_local_taxes": ( - "limited_state_local_taxes_returns", - "COUNT", - ), - "irs_soi.limited_state_local_taxes": ( - "limited_state_local_taxes_amount", - "AMOUNT", - ), - "irs_soi.returns_with_state_local_income_or_sales_taxes": ( - "state_local_income_or_sales_tax_returns", - "COUNT", - ), - "irs_soi.state_local_income_or_sales_taxes": ( - "state_local_income_or_sales_tax_amount", - "AMOUNT", - ), - "irs_soi.returns_with_interest_paid_deduction": ( - "interest_paid_deduction_returns", - "COUNT", - ), - "irs_soi.interest_paid_deduction": ( - "interest_paid_deduction_amount", - "AMOUNT", - ), - "irs_soi.returns_with_home_mortgage_interest_paid_to_financial_institutions": ( - "mortgage_interest_paid_returns", - "COUNT", - ), - "irs_soi.home_mortgage_interest_paid_to_financial_institutions": ( - "mortgage_interest_paid_amount", - "AMOUNT", - ), - "irs_soi.returns_with_home_mortgage_interest_paid_to_individuals": ( - "home_mortgage_personal_seller_returns", - "COUNT", - ), - "irs_soi.home_mortgage_interest_paid_to_individuals": ( - "home_mortgage_personal_seller_amount", - "AMOUNT", - ), - "irs_soi.returns_with_deductible_points": ( - "deductible_points_returns", - "COUNT", - ), - "irs_soi.deductible_points": ("deductible_points_amount", "AMOUNT"), - "irs_soi.returns_with_investment_interest_expense_deduction": ( - "investment_interest_paid_returns", - "COUNT", - ), - "irs_soi.investment_interest_expense_deduction": ( - "investment_interest_paid_amount", - "AMOUNT", - ), - "irs_soi.returns_with_contributions_deduction": ( - "charitable_returns", - "COUNT", - ), - "irs_soi.contributions_deduction": ("charitable_amount", "AMOUNT"), - "us:statutes/26/62#adjusted_gross_income": ( - "adjusted_gross_income", - "AMOUNT", - ), - "us:statutes/26/62#input.wages": ("wages_salaries_amount", "AMOUNT"), - "irs_soi.adjusted_gross_income": ("adjusted_gross_income", "AMOUNT"), - "irs_soi.total_income_tax": ("income_tax_liability", "AMOUNT"), - 
"irs_soi.total_wages": ("wages_salaries_amount", "AMOUNT"), - "irs_soi.returns_with_ordinary_dividends": ( - "ordinary_dividends_returns", - "COUNT", - ), - "irs_soi.ordinary_dividends": ("ordinary_dividends_amount", "AMOUNT"), - "irs_soi.returns_with_qualified_dividends": ( - "qualified_dividends_returns", - "COUNT", - ), - "irs_soi.qualified_dividends": ("qualified_dividends_amount", "AMOUNT"), - "irs_soi.returns_with_qualified_business_income_deduction": ( - "qbi_claims", - "COUNT", - ), - "irs_soi.qualified_business_income_deduction": ("qbi_amount", "AMOUNT"), - "irs_soi.returns_with_taxable_interest": ( - "taxable_interest_returns", - "COUNT", - ), - "irs_soi.taxable_interest": ("taxable_interest_amount", "AMOUNT"), - "irs_soi.returns_with_tax_exempt_interest": ( - "tax_exempt_interest_returns", - "COUNT", - ), - "irs_soi.tax_exempt_interest": ("tax_exempt_interest_amount", "AMOUNT"), - "irs_soi.returns_with_schedule_c_income": ( - "schedule_c_income_returns", - "COUNT", - ), - "irs_soi.schedule_c_income": ("schedule_c_income_amount", "AMOUNT"), - "irs_soi.taxable_net_capital_gains": ("net_capital_gains_amount", "AMOUNT"), - "irs_soi.returns_with_partnership_scorp_income": ( - "partnership_scorp_income_returns", - "COUNT", - ), - "irs_soi.partnership_scorp_income": ( - "partnership_scorp_income_amount", - "AMOUNT", - ), - "irs_soi.returns_with_rental_royalty_income": ( - "rental_royalty_income_returns", - "COUNT", - ), - "irs_soi.rental_royalty_income": ( - "rental_royalty_income_amount", - "AMOUNT", - ), - "irs_soi.taxable_ira_distributions": ( - "taxable_ira_distributions_amount", - "AMOUNT", - ), - "irs_soi.taxable_pension_income": ("taxable_pension_income_amount", "AMOUNT"), - "irs_soi.unemployment_compensation": ( - "unemployment_compensation_amount", - "AMOUNT", - ), - "irs_soi.taxable_social_security_benefits": ( - "taxable_social_security_amount", - "AMOUNT", - ), - "irs_soi.total_itemized_deductions": ("itemized_deductions", "AMOUNT"), - 
"irs_soi.returns_with_itemized_deductions": ( - "itemized_deductions_returns", - "COUNT", - ), - "irs_soi.returns_with_medical_dental_expense_deduction": ( - "medical_claims", - "COUNT", - ), - "irs_soi.medical_dental_expense_deduction": ( - "medical_dental_expense_amount", - "AMOUNT", - ), - "irs_soi.standard_deduction": ("standard_deduction", "AMOUNT"), - "irs_soi.taxable_income": ("taxable_income", "AMOUNT"), - "irs_soi.total_income": ("total_income", "AMOUNT"), - "irs_soi.returns_with_total_income": ("total_income_returns", "COUNT"), - "irs_soi.capital_asset_net_gain_less_loss": ( - "capital_asset_net_gain_less_loss", - "AMOUNT", - ), - "irs_soi.returns_with_capital_asset_net_gain_less_loss": ( - "capital_asset_net_gain_less_loss_returns", - "COUNT", - ), - "irs_soi.tax_credits": ("tax_credits", "AMOUNT"), - "irs_soi.returns_with_tax_credits": ("tax_credits_returns", "COUNT"), - "irs_soi.returns_with_taxable_income": ("taxable_income_returns", "COUNT"), - "irs_soi.returns_with_total_income_tax": ( - "income_tax_liability_returns", - "COUNT", - ), - "irs_soi.individual_income_tax_returns_excluding_dependents": ( - "tax_unit_count", - "COUNT", - ), - "irs_soi.eic_earned_income": ("eic_earned_income", "AMOUNT"), - "irs_soi.returns_with_eic_earned_income": ( - "eic_earned_income_returns", - "COUNT", - ), - "irs_soi.eic_refundable_portion": ("eitc_refundable_portion", "AMOUNT"), - "irs_soi.returns_with_eic_refundable_portion": ( - "eitc_refundable_portion_returns", - "COUNT", - ), - "irs_soi.roth_ira_contributions": ("roth_ira_contributions", "AMOUNT"), - "irs_soi.roth_ira_contributors": ("roth_ira_contributors", "COUNT"), - "irs_soi.traditional_ira_contributions": ( - "traditional_ira_contributions", - "AMOUNT", - ), - "irs_soi.traditional_ira_contributors": ( - "traditional_ira_contributors", - "COUNT", - ), - "irs_soi.form_w2_social_security_tip_income": ("tip_income", "AMOUNT"), - "irs_soi.form_w2_social_security_tip_returns": ( - "tip_income_returns", - 
"COUNT", - ), - "irs_soi.form_w2_social_security_tip_taxpayers": ( - "tip_income_taxpayers", - "COUNT", - ), - "irs_soi.form_w2_401k_elective_deferrals": ( - "traditional_401k_contributions", - "AMOUNT", - ), - "irs_soi.form_w2_designated_roth_401k_contributions": ( - "roth_401k_contributions", - "AMOUNT", - ), - "irs_soi.payments_to_keogh_plan": ( - "self_employed_pension_contribution_ald", - "AMOUNT", - ), - "cms_medicare.part_b_premium_income": ( - "medicare_part_b_premiums", - "AMOUNT", - ), - "census_acs.household_count": ("household_count", "COUNT"), - "census_acs.person_count": ("population", "COUNT"), - "census_decennial.resident_population": ("population", "COUNT"), - "census_decennial.occupied_housing_units": ("household_count", "COUNT"), - "census.population_projection": ("population", "COUNT"), - "census_pep.resident_population": ("population", "COUNT"), - "census_stc.individual_income_tax_collections": ( - "state_individual_income_tax_collections", - "AMOUNT", - ), - "cms_aca.marketplace_effectuated_enrollment": ( - "aca_marketplace_enrollment", - "COUNT", - ), - "cms_aca.marketplace_plan_selections": ( - "aca_marketplace_plan_selections", - "COUNT", - ), - "cms_aca.aptc_consumers": ("aca_aptc_consumers", "COUNT"), - "cms_aca.average_monthly_aptc": ("aca_average_monthly_aptc", "RATE"), - "cms_medicaid.total_medicaid_enrollment": ( - "medicaid_total_enrollment", - "COUNT", - ), - "cms_medicaid.total_medicaid_chip_enrollment": ( - "medicaid_chip_total_enrollment", - "COUNT", - ), - "cms_medicaid.total_chip_enrollment": ("chip_total_enrollment", "COUNT"), - "cms_medicaid.medicaid_chip_child_enrollment": ( - "medicaid_chip_child_enrollment", - "COUNT", - ), - "cms_medicaid.total_adult_medicaid_enrollment": ( - "adult_medicaid_enrollment", - "COUNT", - ), - "cms_nhe.medicaid_title_xix_expenditures": ( - "medicaid_benefits", - "AMOUNT", - ), - "ssa.ssi_recipient_count": ("ssi_recipients", "COUNT"), - "ssa.ssi_payment_amount": ("ssi_total_payments", 
"AMOUNT"), - "federal_reserve.z1.households_nonprofits_net_worth": ( - "net_worth_amount", - "AMOUNT", - ), - "hhs_acf_tanf.cash_assistance_expenditures": ( - "tanf_cash_assistance", - "AMOUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_total_recipients": ( - "tanf_recipient_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_adult_recipients": ( - "tanf_adult_recipient_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_child_recipients": ( - "tanf_child_recipient_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_total_families": ( - "tanf_family_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_one_parent_families": ( - "tanf_one_parent_family_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_two_parent_families": ( - "tanf_two_parent_family_count", - "COUNT", - ), - "hhs_acf_tanf.average_monthly_tanf_no_parent_families": ( - "tanf_no_parent_family_count", - "COUNT", - ), - "hhs_acf_liheap.households_served_by_state_programs": ( - "liheap_household_count", - "COUNT", - ), - "bea_nipa.wages_and_salaries": ( - ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE, - "AMOUNT", - ), - "bea_nipa.proprietors_income_with_inventory_valuation_and_capital_consumption_adjustments": ( - "proprietors_income_amount", - "AMOUNT", - ), - "bea_nipa.rental_income_of_persons_with_capital_consumption_adjustment": ( - "rental_income_amount", - "AMOUNT", - ), - "bea_nipa.personal_interest_income": ( - "personal_interest_income_amount", - "RATE", - ), - "bea_nipa.personal_dividend_income": ( - "personal_dividend_income_amount", - "AMOUNT", - ), - "bea_nipa.supplements_to_wages_and_salaries": ( - "supplements_to_wages_and_salaries", - "RATE", - ), - "bea_nipa.employer_contributions_for_employee_pension_and_insurance_funds": ( - "employer_pension_and_insurance_contributions", - "RATE", - ), - "bea_nipa.employer_contributions_for_government_social_insurance": ( - "employer_government_social_insurance_contributions", - "RATE", - ), - 
"bea_nipa.farm_proprietors_income": ("farm_proprietors_income", "RATE"), - "bea_nipa.nonfarm_proprietors_income": ("nonfarm_proprietors_income", "RATE"), - "bea_nipa.government_social_benefits_to_persons": ( - "government_social_benefits_to_persons", - "RATE", - ), - "bea_nipa.social_security_benefits": ("social_security_benefits", "AMOUNT"), - "bea_nipa.medicare_benefits": ("medicare_benefits", "RATE"), - "bea_nipa.medicaid_benefits": ("medicaid_benefits", "AMOUNT"), - "bea_nipa.unemployment_insurance_benefits": ( - "unemployment_insurance_benefits", - "AMOUNT", - ), - "bea_nipa.veterans_benefits": ("veterans_benefits", "RATE"), - "bea_nipa.other_government_social_benefits_to_persons": ( - "other_government_social_benefits_to_persons", - "RATE", - ), - "bea_nipa.other_current_transfer_receipts_from_business_net": ( - "other_current_transfer_receipts_from_business_net", - "RATE", - ), - "bea_nipa.personal_current_transfer_receipts": ( - "personal_current_transfer_receipts", - "RATE", - ), - "bea_nipa.personal_income": ("personal_income", "RATE"), - "bea_nipa.personal_current_taxes": ("personal_current_taxes", "RATE"), - "bea_nipa.disposable_personal_income": ("disposable_personal_income", "RATE"), - "bea_nipa.personal_outlays": ("personal_outlays", "RATE"), - "bea_nipa.personal_saving": ("personal_saving", "RATE"), - "bea_nipa.personal_saving_rate": ("personal_saving_rate", "RATE"), - "bea_regional.personal_income": ("regional_personal_income", "RATE"), - "bea_regional.dividends_interest_and_rent": ( - "regional_dividends_interest_and_rent", - "RATE", - ), - "bea_regional.personal_current_transfer_receipts": ( - "regional_personal_current_transfer_receipts", - "RATE", - ), - "bea_regional.wages_and_salaries": ( - "bea_regional_wages_salaries_place_of_work_amount", - "AMOUNT", - ), - "bea_regional.supplements_to_wages_and_salaries": ( - "regional_supplements_to_wages_and_salaries", - "RATE", - ), - "bea_regional.contributions_for_government_social_insurance": ( - 
"regional_contributions_for_government_social_insurance", - "RATE", - ), - "bea_regional.residence_adjustment": ( - "regional_residence_adjustment", - "RATE", - ), - "bea_regional.proprietors_income": ("proprietors_income_amount", "AMOUNT"), - "usda_snap.total_benefits": ("snap_benefits", "AMOUNT"), - "usda_snap.average_monthly_households": ("snap_household_count", "COUNT"), - "usda_snap.average_monthly_persons": ("snap_participant_count", "COUNT"), - "usda_snap.average_monthly_benefit_per_person": ( - "snap_average_monthly_benefit_per_person", - "RATE", - ), -} - -ARCH_FACT_CONCEPTS_TO_SKIP = frozenset( - { - # SOI Table 2.1 total state/local taxes includes personal property - # taxes. PolicyEngine's federal SALT input currently combines - # state/local income-or-sales taxes and real estate taxes, so the total - # source-native concept is not exposed as a Microplex target. - "irs_soi.returns_with_state_and_local_taxes", - "irs_soi.state_and_local_taxes", - # CBO revenue projections are useful Arch reference facts, but they are - # forecast concepts rather than current-year Microplex calibration - # targets. Map them only through an explicit adapter. 
- "cbo.adjusted_gross_income_projection", - "cbo.wages_and_salaries_projection", - "cbo.taxable_interest_and_ordinary_dividends_excluding_qualified_dividends_projection", - "cbo.qualified_dividend_income_projection", - "cbo.net_capital_gain_projection", - "cbo.net_business_income_projection", - } -) - -ARCH_FACT_DOMAIN_CONSTRAINTS = { - "all_individual_income_tax_returns": (("is_tax_filer", "==", "1"),), - "form_w2_items": (), - "household_balance_sheet": (), - "individual_income_tax_returns": (("is_tax_filer", "==", "1"),), - "individual_income_tax_returns_excluding_dependents": ( - ("is_dependent", "==", "0"), - ), - "individual_income_tax_returns_with_earned_income_credit": (("eitc", ">", "0"),), - "individual_income_tax_returns_with_itemized_deductions": ( - ("itemized_deductions", ">", "0"), - ), - "individual_retirement_arrangement_contributions": (), - "compensation_of_employees": (), - "households": (), - "aca_marketplace_effectuated_enrollment": (), - "aca_marketplace_qhp_selections": (), - "medicaid_chip_enrollment": (), - "medicare_financing": (), - "national_health_expenditures": (), - "personal_current_transfer_receipts": (), - "personal_income": (), - "population_projection": (), - "resident_population": (), - "total_population": (), - "social_security_and_ssi_payments": (), - "state_government_tax_collections": (), - "supplemental_nutrition_assistance_program": (("snap", "==", "1"),), - "tanf_cash_assistance": (), - "tanf_caseload": (), - "liheap_state_programs": (), -} - -ARCH_FACT_CONSTRAINT_VARIABLE_ALIASES = { - "age": "age", - "snap_receipt_status": "snap", - "ssi_category": "ssi_category", - "us.tax.earned_income_credit_qualifying_children": "eitc_child_count", - "us_social_security_and_ssi.program_payment_type": "program_payment_type", - "us:statutes/26/62#adjusted_gross_income": "adjusted_gross_income", - "irs_soi.adjusted_gross_income": "adjusted_gross_income", -} - -ARCH_IGNORED_FACT_CONSTRAINT_VARIABLES = frozenset( - { - "amount_basis", - 
"administering_entity", - "bea_nipa.series_code", - "bea_regional.geo_name", - "bea_regional.line_code", - "bea_regional.table_name", - "medicare.financing_component", - "medicare.part", - "program", - } -) - -ARCH_ENTITY_HINTS = { - "adjusted_gross_income": EntityType.TAX_UNIT, - "income_tax": EntityType.TAX_UNIT, - "income_tax_positive": EntityType.TAX_UNIT, - "income_tax_before_credits": EntityType.TAX_UNIT, - "eitc": EntityType.TAX_UNIT, - "non_refundable_ctc": EntityType.TAX_UNIT, - "refundable_ctc": EntityType.TAX_UNIT, - "qualified_business_income_deduction": EntityType.TAX_UNIT, - "salt": EntityType.TAX_UNIT, - "salt_deduction": EntityType.TAX_UNIT, - "charitable_deduction": EntityType.TAX_UNIT, - "deductible_mortgage_interest": EntityType.TAX_UNIT, - "interest_deduction": EntityType.TAX_UNIT, - "investment_interest_expense": EntityType.PERSON, - "medical_expense_deduction": EntityType.TAX_UNIT, - "real_estate_taxes": EntityType.TAX_UNIT, - "tax_unit_partnership_s_corp_income": EntityType.TAX_UNIT, - "dividend_income": EntityType.PERSON, - "employment_income": EntityType.PERSON, - "qualified_dividend_income": EntityType.PERSON, - "taxable_interest_income": EntityType.PERSON, - "tax_exempt_interest_income": EntityType.PERSON, - "long_term_capital_gains": EntityType.PERSON, - "short_term_capital_gains": EntityType.PERSON, - "proprietors_income_amount": EntityType.PERSON, - "rental_income": EntityType.PERSON, - "roth_401k_contributions": EntityType.PERSON, - "self_employment_income": EntityType.PERSON, - "self_employed_pension_contribution_ald": EntityType.TAX_UNIT, - "salt_refund_income": EntityType.PERSON, - "state_income_tax": EntityType.TAX_UNIT, - "taxable_ira_distributions": EntityType.PERSON, - "traditional_ira_contributions": EntityType.PERSON, - "roth_ira_contributions": EntityType.PERSON, - "taxable_pension_income": EntityType.PERSON, - "taxable_social_security": EntityType.PERSON, - "tip_income": EntityType.PERSON, - 
"traditional_401k_contributions": EntityType.PERSON, - "unemployment_compensation": EntityType.PERSON, - "medicare_part_b_premiums": EntityType.PERSON, - "medicaid": EntityType.PERSON, - "net_worth": EntityType.HOUSEHOLD, - "social_security": EntityType.PERSON, - "social_security_dependents": EntityType.PERSON, - "social_security_disability": EntityType.PERSON, - "social_security_retirement": EntityType.PERSON, - "social_security_survivors": EntityType.PERSON, - "snap": EntityType.HOUSEHOLD, - "ssi": EntityType.PERSON, - "tanf": EntityType.SPM_UNIT, -} - -ARCH_AGI_BRACKET_FILTERS = { - "under_1": (None, 1), - "1_to_10k": (1, 10_000), - "10k_to_25k": (10_000, 25_000), - "25k_to_50k": (25_000, 50_000), - "50k_to_75k": (50_000, 75_000), - "75k_to_100k": (75_000, 100_000), - "100k_to_200k": (100_000, 200_000), - "200k_to_500k": (200_000, 500_000), - "500k_to_1m": (500_000, 1_000_000), - "1m_plus": (1_000_000, None), -} - -ARCH_CURRENT_TAX_VARIABLES = frozenset( - { - "tax_unit_count", - "adjusted_gross_income", - "income_tax_liability", - } -) - -ARCH_LABEL_WORD_OVERRIDES = { - "aca": "ACA", - "actc": "ACTC", - "agi": "AGI", - "bls": "BLS", - "cbo": "CBO", - "cms": "CMS", - "ctc": "CTC", - "eitc": "EITC", - "irs": "IRS", - "qbi": "QBI", - "liheap": "LIHEAP", - "snap": "SNAP", - "soi": "SOI", - "ssi": "SSI", - "tanf": "TANF", - "usda": "USDA", -} - -ARCH_VARIABLE_LABEL_OVERRIDES = { - "adjusted_gross_income": "Adjusted gross income", - "income_tax_liability": "Income tax liability", - "income_tax_liability_returns": "Returns with income tax after credits", - "income_tax_before_credits_returns": ("Returns with income tax before credits"), - "income_tax_before_credits_amount": "Income tax before credits amount", - "tax_filer_individual_count": "Individuals on tax returns", - "aca_ptc_returns": "Returns with premium tax credit", - "aca_aptc_amount": "Premium tax credit amount", - "eitc_claims": "Returns with earned income credit", - "eitc_amount": "Earned income credit 
amount", - "real_estate_taxes_claims": "Returns with real estate taxes", - "real_estate_taxes_amount": "Real estate taxes amount", - "limited_state_local_taxes_returns": ("Returns with limited state and local taxes"), - "tax_exempt_interest_returns": "Tax-exempt interest returns", - "tax_exempt_interest_amount": "Tax-exempt interest amount", - "taxable_interest_amount": "Taxable interest amount", - "wages_salaries_returns": "Returns with total wages", - "wages_salaries_amount": "Total wages amount", - "personal_dividend_income_amount": "Personal dividend income amount", - "proprietors_income_amount": "Proprietors' income amount", - "rental_income_amount": "Rental income amount", - "net_capital_gains_returns": "Returns with taxable net capital gains", - "net_capital_gains_amount": "Taxable net capital gains amount", - "taxable_ira_distributions_returns": ("Returns with taxable IRA distributions"), - "taxable_ira_distributions_amount": "Taxable IRA distributions amount", - "taxable_pension_income_returns": "Returns with taxable pension income", - "taxable_pension_income_amount": "Taxable pension income amount", - "unemployment_compensation_returns": ("Returns with unemployment compensation"), - "unemployment_compensation_amount": "Unemployment compensation amount", - "unemployment_insurance_benefits": "Unemployment insurance benefits", - "taxable_social_security_returns": ( - "Returns with taxable Social Security benefits" - ), - "taxable_social_security_amount": "Taxable Social Security benefits amount", - "ordinary_dividends_amount": "Ordinary dividends amount", - "qualified_dividends_returns": "Returns with qualified dividends", - "qualified_dividends_amount": "Qualified dividends amount", - "long_term_capital_gains_amount": "Long-term capital gains amount", - "short_term_capital_gains_amount": "Short-term capital gains amount", - "partnership_scorp_income_returns": "Returns with partnership and S-corp income", - "employment_income_before_lsr_amount": ( - 
"Employment income before labor-supply responses amount" - ), - "partnership_scorp_income_amount": "Partnership and S-corp income amount", - "schedule_c_income_returns": "Returns with Schedule C income", - "schedule_c_income_amount": "Schedule C income amount", - "medical_claims": "Returns with medical expense deduction", - "medical_dental_expense_amount": "Medical and dental expense amount", - "tax_unit_count": "Tax unit count", - "household_count": "Household count", - "population": "Population count", - "snap_household_count": "SNAP household count", - "snap_participant_count": "SNAP participant count", - "aca_marketplace_enrollment": "ACA marketplace enrollment", - "state_individual_income_tax_collections": ( - "State individual income tax collections" - ), - "charitable_amount": "Contributions deduction amount", - "charitable_returns": "Returns with contributions deduction", - "deductible_points_returns": "Returns with deductible points", - "home_mortgage_personal_seller_returns": ( - "Returns with home mortgage interest paid to individuals" - ), - "interest_paid_deduction_returns": ("Returns with interest paid deduction"), - "investment_interest_paid_returns": ( - "Returns with investment interest expense deduction" - ), - "limited_state_local_taxes_amount": "Limited state and local taxes amount", - "state_local_income_or_sales_tax_amount": ( - "State and local income or sales taxes amount" - ), - "state_local_income_or_sales_tax_returns": ( - "Returns with state and local income or sales taxes" - ), - "mortgage_interest_paid_returns": ( - "Returns with home mortgage interest paid to financial institutions" - ), - "interest_paid_deduction_amount": "Interest paid deduction amount", - "mortgage_interest_paid_amount": "Mortgage interest paid amount", - "home_mortgage_personal_seller_amount": ( - "Home mortgage from personal seller amount" - ), - "deductible_points_amount": "Deductible points amount", - "investment_interest_paid_amount": "Investment interest paid 
amount", - "medicaid_benefits": "Medicaid benefits", - "medicaid_total_enrollment": "Medicaid enrollment", - "medicaid_enrollment": "Medicaid enrollment", - "liheap_household_count": "LIHEAP household count", - "social_security_benefits": "Social Security benefits", - "social_security_dependents_benefits": "Social Security dependent benefits", - "social_security_disability_benefits": "Social Security disability benefits", - "social_security_retirement_benefits": "Social Security retirement benefits", - "social_security_survivors_benefits": "Social Security survivor benefits", - "ssi_payments": "SSI payments", - "tanf_cash_assistance": "TANF cash assistance", - "tanf_family_count": "TANF family count", - "tanf_recipient_count": "TANF recipient count", - "tip_income": "Tip income", - "traditional_401k_contributions": "Traditional 401(k) contributions", - "traditional_ira_contributions": "Traditional IRA contributions", - "roth_401k_contributions": "Roth 401(k) contributions", - "roth_ira_contributions": "Roth IRA contributions", - "self_employed_pension_contribution_ald": ( - "Self-employed pension contribution ALD" - ), -} - -ARCH_AGI_BRACKET_LABELS = { - "under_1": "under $1", - "1_to_10k": "$1-$10k", - "10k_to_25k": "$10k-$25k", - "25k_to_50k": "$25k-$50k", - "50k_to_75k": "$50k-$75k", - "75k_to_100k": "$75k-$100k", - "100k_to_200k": "$100k-$200k", - "200k_to_500k": "$200k-$500k", - "500k_to_1m": "$500k-$1m", - "1m_plus": "$1m+", -} - -ARCH_MODEL_AMOUNT_VARIABLE_HINTS = { - **{ - model_variable: source_variable - for source_variable, model_variable in ARCH_AMOUNT_VARIABLE_ALIASES.items() - }, - "employment_income": "wages_salaries_amount", - "employment_income_before_lsr": ( - ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE - ), - "income_tax_positive": "income_tax_liability", - "income_tax_before_credits": "income_tax_before_credits_amount", - "interest_deduction": "interest_paid_deduction_amount", - "medicare_part_b_premiums": "medicare_part_b_premiums", - 
"net_capital_gains": "net_capital_gains_amount", - "net_worth": "net_worth", - "real_estate_taxes": "real_estate_taxes_amount", - "roth_401k_contributions": "roth_401k_contributions", - "self_employed_pension_contribution_ald": ( - "self_employed_pension_contribution_ald" - ), - "total_self_employment_income": "schedule_c_income_amount", - "taxable_ira_distributions": "taxable_ira_distributions_amount", - "taxable_pension_income": "taxable_pension_income_amount", - "taxable_social_security": "taxable_social_security_amount", - "tip_income": "tip_income", - "traditional_401k_contributions": "traditional_401k_contributions", - "unemployment_compensation": "unemployment_compensation_amount", -} - -ARCH_MODEL_COUNT_DOMAIN_VARIABLE_HINTS = { - "adjusted_gross_income": "tax_unit_count", - "dividend_income": "ordinary_dividends_returns", - "employment_income": "wages_salaries_returns", - "eitc": "eitc_claims", - "income_tax": "income_tax_liability_returns", - "income_tax_before_credits": "income_tax_before_credits_returns", - "medical_expense_deduction": "medical_claims", - "net_capital_gains": "net_capital_gains_returns", - "non_refundable_ctc": "ctc_claims", - "qualified_business_income_deduction": "qbi_claims", - "qualified_dividend_income": "qualified_dividends_returns", - "real_estate_taxes": "real_estate_taxes_claims", - "refundable_ctc": "actc_claims", - "rental_income": "rental_royalty_income_returns", - "salt": "salt_claims", - "self_employment_income": "schedule_c_income_returns", - "total_self_employment_income": "schedule_c_income_returns", - "tax_exempt_interest_income": "tax_exempt_interest_returns", - "tax_unit_partnership_s_corp_income": "partnership_scorp_income_returns", - "taxable_interest_income": "taxable_interest_returns", - "taxable_ira_distributions": "taxable_ira_distributions_returns", - "taxable_pension_income": "taxable_pension_income_returns", - "taxable_social_security": "taxable_social_security_returns", - "unemployment_compensation": 
"unemployment_compensation_returns", -} - -ARCH_BEA_FULL_POP_AMOUNT_VARIABLES = frozenset( - { - "dividend_income", - "employment_income", - "employment_income_before_lsr", - "rental_income", - "unemployment_compensation", - } -) - -ARCH_BEA_FULL_POP_AMOUNT_ARCH_VARIABLES = { - "dividend_income": "personal_dividend_income_amount", - "employment_income": "wages_salaries_amount", - "employment_income_before_lsr": ( - ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE - ), - "rental_income": "rental_income_amount", - "unemployment_compensation": "unemployment_insurance_benefits", -} - -ARCH_IRS_SOI_GAP_VARIABLES = frozenset( - { - *ARCH_MODEL_AMOUNT_VARIABLE_HINTS, - *ARCH_MODEL_COUNT_DOMAIN_VARIABLE_HINTS, - "income_tax_positive", - "interest_deduction", - "roth_ira_contributions", - "tax_unit_count", - "tip_income", - "traditional_ira_contributions", - } -) - -ARCH_DEPRIORITIZED_SURVEY_OR_MODEL_GAP_VARIABLES = frozenset( - { - "child_support_expense", - "child_support_received", - "health_insurance_premiums_without_medicare_part_b", - "other_medical_expenses", - "over_the_counter_health_expenses", - "rent", - "spm_unit_capped_housing_subsidy", - "spm_unit_capped_work_childcare_expenses", - } -) - -ARCH_DEPRIORITIZED_SURVEY_OR_MODEL_GAP_DOMAINS = frozenset( - { - "ssn_card_type", - } -) - -ARCH_GAP_SOURCE_TABLE_HINTS = { - "aca_aptc_amount": "CMS Marketplace Open Enrollment public-use files", - "aca_marketplace_enrollment": "CMS Marketplace Open Enrollment public-use files", - "employment_income": "IRS SOI Publication 1304 Table 1.4", - "aca_ptc_returns": "IRS SOI Historic Table 2", - "eitc_amount": "IRS SOI Historic Table 2", - "eitc_claims": "IRS SOI Historic Table 2", - "income_tax_liability": "IRS SOI Publication 1304 Table 1.1 or Historic Table 2", - "income_tax_before_credits": "IRS SOI Publication 1304 Table 1.1", - "income_tax_before_credits_returns": "IRS SOI Historic Table 2", - "tax_filer_individual_count": "IRS SOI Historic Table 2", - 
"charitable_amount": "IRS SOI Publication 1304 Table 2.1", - "charitable_returns": "IRS SOI Publication 1304 Table 2.1", - "deductible_points_amount": "IRS SOI Publication 1304 Table 2.1", - "deductible_points_returns": "IRS SOI Publication 1304 Table 2.1", - "home_mortgage_personal_seller_amount": "IRS SOI Publication 1304 Table 2.1", - "home_mortgage_personal_seller_returns": "IRS SOI Publication 1304 Table 2.1", - "interest_paid_deduction_amount": "IRS SOI Publication 1304 Table 2.1", - "interest_paid_deduction_returns": "IRS SOI Publication 1304 Table 2.1", - "investment_interest_paid_amount": "IRS SOI Publication 1304 Table 2.1", - "investment_interest_paid_returns": "IRS SOI Publication 1304 Table 2.1", - "limited_state_local_taxes_amount": "IRS SOI Publication 1304 Table 2.1", - "limited_state_local_taxes_returns": "IRS SOI Publication 1304 Table 2.1", - "mortgage_interest_paid_amount": "IRS SOI Publication 1304 Table 2.1", - "mortgage_interest_paid_returns": "IRS SOI Publication 1304 Table 2.1", - "state_local_income_or_sales_tax_amount": "IRS SOI Publication 1304 Table 2.1", - "state_local_income_or_sales_tax_returns": "IRS SOI Publication 1304 Table 2.1", - "liheap_household_count": "HHS ACF LIHEAP National Profile", - "medicaid_benefits": ( - "CMS National Health Expenditures by type of service and source of funds" - ), - "net_capital_gains": "IRS SOI Publication 1304 Table 1.4", - "population": "Census Population Estimates Program Vintage 2024 age-sex files", - "real_estate_taxes": "IRS SOI itemized deduction tables or ACS state files", - "roth_ira_contributions": "IRS SOI IRA contribution tables", - "roth_401k_contributions": "IRS SOI Form W-2 Statistics Table 4.B", - "self_employed_pension_contribution_ald": "IRS SOI Publication 1304 Table 1.4", - "state_individual_income_tax_collections": ( - "Census State Tax Collections item T40" - ), - "social_security_benefits": "SSA Annual Statistical Supplement", - "social_security_dependents_benefits": "SSA 
Annual Statistical Supplement", - "social_security_disability_benefits": "SSA Annual Statistical Supplement", - "social_security_retirement_benefits": "SSA Annual Statistical Supplement", - "social_security_survivors_benefits": "SSA Annual Statistical Supplement", - "snap_benefits": "USDA FNS SNAP annual state participation and benefit workbooks", - "snap_household_count": ( - "USDA FNS SNAP annual state participation and benefit workbooks" - ), - "snap_participant_count": ( - "USDA FNS SNAP annual state participation and benefit workbooks" - ), - "ssi_payments": "SSA Annual Statistical Supplement", - "tanf_cash_assistance": "ACF TANF Financial Data", - "tanf_family_count": "ACF TANF Caseload Data", - "tanf_recipient_count": "ACF TANF Caseload Data", - "tip_income": "IRS SOI Form W-2 Statistics", - "traditional_ira_contributions": "IRS SOI IRA contribution tables", - "traditional_401k_contributions": "IRS SOI Form W-2 Statistics Table 4.B", - "taxable_ira_distributions": "IRS SOI IRA accumulation/distribution tables", - "taxable_pension_income": "IRS SOI Publication 1304 Table 1.4", - "taxable_social_security": "IRS SOI Publication 1304 Table 1.4", - "unemployment_compensation": "IRS SOI Publication 1304 Table 1.4", -} - - -@dataclass(frozen=True) -class SOIAgingFactors: - """Declared factors used to age SOI target records to a model year.""" - - source_year: int - target_year: int - count_factor: float - amount_factor: float - count_method: str - amount_method: str - - -@dataclass(frozen=True) -class ArchTargetRecord: - """A source target record loaded from the Arch SQLite DB.""" - - target_id: int - stratum_id: int - variable: str - period: int - value: float - target_type: str - geographic_level: str | None - geography_id: str | None - source: str - source_table: str | None - source_url: str | None - notes: str | None - stratum_name: str | None - jurisdiction: str - constraints: tuple[tuple[str, str, str], ...] 
- source_period: int | None = None - aging_factors: SOIAgingFactors | None = None - aggregate_fact_key: str | None = None - semantic_fact_key: str | None = None - source_record_id: str | None = None - source_cell_keys: tuple[str, ...] = () - source_row_keys: tuple[str, ...] = () - unit: str | None = None - concept: str | None = None - source_concept: str | None = None - concept_relation: str | None = None - concept_authority: str | None = None - concept_evidence_url: str | None = None - concept_evidence_notes: str | None = None - legal_vintage: str | None = None - source_db_path: str | None = None - source_db_index: int | None = None - source_target_id: int | None = None - source_stratum_id: int | None = None - - -@dataclass(frozen=True) -class ArchTargetCellCoverage: - """Coverage for one PolicyEngine target cell from an Arch target DB.""" - - cell: dict[str, str | None] - target_ids: tuple[int, ...] - target_names: tuple[str, ...] - sources: tuple[str, ...] - - @property - def covered(self) -> bool: - return bool(self.target_ids) - - @property - def target_count(self) -> int: - return len(self.target_ids) - - def to_dict(self) -> dict[str, Any]: - return { - "cell": dict(self.cell), - "covered": self.covered, - "target_count": self.target_count, - "target_ids": list(self.target_ids), - "target_names": list(self.target_names), - "sources": list(self.sources), - } - - -@dataclass(frozen=True) -class ArchTargetProfileCoverageReport: - """JSON-ready summary of Arch coverage for a Microplex target profile.""" - - profile_name: str - period: int - target_cell_count: int - covered_cell_count: int - uncovered_cell_count: int - coverage_rate: float - by_geo_level: dict[str, dict[str, int]] - by_variable: dict[str, dict[str, int]] - cells: tuple[ArchTargetCellCoverage, ...] 
- - def to_dict(self) -> dict[str, Any]: - return { - "profile_name": self.profile_name, - "period": self.period, - "target_cell_count": self.target_cell_count, - "covered_cell_count": self.covered_cell_count, - "uncovered_cell_count": self.uncovered_cell_count, - "coverage_rate": self.coverage_rate, - "by_geo_level": self.by_geo_level, - "by_variable": self.by_variable, - "cells": [cell.to_dict() for cell in self.cells], - } - - -@dataclass(frozen=True) -class ArchTargetGapQueueRow: - """One target-profile cell as an Arch authoring task.""" - - priority: int - profile_name: str - period: int - variable: str - geo_level: str | None - domain_variable: str | None - geographic_id: str | None - covered: bool - target_count: int - target_ids: tuple[int, ...] - sources: tuple[str, ...] - expected_source: str | None - expected_source_table: str | None - expected_arch_variable: str | None - expected_target_type: str | None - expected_entity: str | None - expected_aggregation: str | None - expected_filters: tuple[dict[str, Any], ...] 
- gap_category: str - loader_status: str - agent_task_kind: str - notes: str - - def to_dict(self) -> dict[str, Any]: - return { - "priority": self.priority, - "profile_name": self.profile_name, - "period": self.period, - "cell": { - "variable": self.variable, - "geo_level": self.geo_level, - "domain_variable": self.domain_variable, - "geographic_id": self.geographic_id, - }, - "covered": self.covered, - "target_count": self.target_count, - "target_ids": list(self.target_ids), - "sources": list(self.sources), - "expected_source": self.expected_source, - "expected_source_table": self.expected_source_table, - "expected_arch_variable": self.expected_arch_variable, - "expected_target_type": self.expected_target_type, - "expected_entity": self.expected_entity, - "expected_aggregation": self.expected_aggregation, - "expected_filters": list(self.expected_filters), - "gap_category": self.gap_category, - "loader_status": self.loader_status, - "agent_task_kind": self.agent_task_kind, - "notes": self.notes, - } - - -@dataclass(frozen=True) -class ArchTargetGapQueueReport: - """JSON-ready Arch authoring queue for a Microplex target profile.""" - - profile_name: str - period: int - row_count: int - covered_row_count: int - uncovered_row_count: int - by_loader_status: dict[str, int] - by_gap_category: dict[str, int] - rows: tuple[ArchTargetGapQueueRow, ...] - - def to_dict(self) -> dict[str, Any]: - return { - "profile_name": self.profile_name, - "period": self.period, - "row_count": self.row_count, - "covered_row_count": self.covered_row_count, - "uncovered_row_count": self.uncovered_row_count, - "by_loader_status": self.by_loader_status, - "by_gap_category": self.by_gap_category, - "rows": [row.to_dict() for row in self.rows], - } - - -@dataclass(frozen=True) -class ArchTargetParityRow: - """One canonical target identity compared across two Arch artifacts.""" - - status: str - identity: tuple[Any, ...] - incumbent_targets: tuple[CanonicalTargetSpec, ...] 
- candidate_targets: tuple[CanonicalTargetSpec, ...] - absolute_delta: float | None - relative_delta: float | None - - def to_dict(self) -> dict[str, Any]: - return { - "status": self.status, - "identity": _arch_target_parity_identity_dict(self.identity), - "incumbent_target_count": len(self.incumbent_targets), - "candidate_target_count": len(self.candidate_targets), - "absolute_delta": self.absolute_delta, - "relative_delta": self.relative_delta, - "incumbent_targets": [ - _target_parity_sample(target) for target in self.incumbent_targets - ], - "candidate_targets": [ - _target_parity_sample(target) for target in self.candidate_targets - ], - } - - -@dataclass(frozen=True) -class ArchTargetParityReport: - """JSON-ready parity report between incumbent and candidate Arch artifacts.""" - - period: int - incumbent_artifacts: tuple[str, ...] - candidate_artifacts: tuple[str, ...] - value_abs_tolerance: float - value_rel_tolerance: float - counts: dict[str, int] - rows: tuple[ArchTargetParityRow, ...] - errors: tuple[dict[str, Any], ...] 
- - @property - def valid(self) -> bool: - return not self.errors - - def to_dict(self, *, row_limit: int | None = None) -> dict[str, Any]: - rows = self.rows if row_limit is None else self.rows[: max(0, row_limit)] - return { - "valid": self.valid, - "period": self.period, - "incumbent_artifacts": list(self.incumbent_artifacts), - "candidate_artifacts": list(self.candidate_artifacts), - "value_abs_tolerance": self.value_abs_tolerance, - "value_rel_tolerance": self.value_rel_tolerance, - "counts": self.counts, - "row_count": len(self.rows), - "rows": [row.to_dict() for row in rows], - "errors": list(self.errors), - } - - -class ArchSQLiteTargetProvider: - """Read Arch target records from the Arch SQLite DB.""" - - def __init__( - self, - db_path: str | Path, - *, - jurisdiction: str = "us", - compose_model_year_targets: bool = True, - age_soi_targets: bool = True, - ) -> None: - self.db_path = Path(db_path) - self.jurisdiction = jurisdiction - self.compose_model_year_targets = compose_model_year_targets - self.age_soi_targets = age_soi_targets - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load canonical targets through the core provider protocol.""" - if not self.db_path.exists(): - raise FileNotFoundError(f"Arch targets DB not found: {self.db_path}") - - query = query or TargetQuery() - provider_filters = dict(query.provider_filters) - period = query.period if isinstance(query.period, int) else None - jurisdiction = str(provider_filters.get("jurisdiction") or self.jurisdiction) - variables = _as_string_tuple(provider_filters.get("variables")) - domain_variables = _as_string_tuple(provider_filters.get("domain_variables")) - sources = _as_string_tuple(provider_filters.get("sources")) - geo_levels = _as_string_tuple(provider_filters.get("geo_levels")) - target_cells = _as_target_cell_filters(provider_filters.get("target_cells")) - compose_model_year_targets = bool( - provider_filters.get( - "compose_model_year_targets", - 
self.compose_model_year_targets, - ) - ) - age_soi_targets = bool( - provider_filters.get("age_soi_targets", self.age_soi_targets) - ) - entity_overrides = provider_filters.get("entity_overrides") or {} - - records = ( - self._compose_model_year_records( - target_year=period, - jurisdiction=jurisdiction, - sources=sources, - age_soi_targets=age_soi_targets, - ) - if compose_model_year_targets and period is not None - else self.load_records( - period=period, - jurisdiction=jurisdiction, - sources=sources, - ) - ) - canonical_targets = TargetSet( - [ - target - for record in records - if _matches_arch_provider_filters( - record, - variables=variables, - domain_variables=domain_variables, - geo_levels=geo_levels, - target_cells=target_cells, - entity_overrides=entity_overrides, - ) - for target in [ - arch_target_record_to_canonical_spec( - record, - entity_overrides=entity_overrides, - ) - ] - if target is not None - ] - ) - return apply_target_query( - canonical_targets, - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def load_records( - self, - *, - period: int | None = None, - jurisdiction: str | None = None, - sources: tuple[str, ...] = (), - ) -> list[ArchTargetRecord]: - """Load source target records with attached stratum constraints.""" - jurisdiction = jurisdiction or self.jurisdiction - normalized_sources = tuple(_normalize_arch_source(source) for source in sources) - clauses = [_jurisdiction_clause(jurisdiction)] - params: list[Any] = [] - if period is not None: - clauses.append("t.period = ?") - params.append(int(period)) - if normalized_sources: - placeholders = ", ".join("?" 
for _ in normalized_sources) - clauses.append(f"t.source IN ({placeholders})") - params.extend(normalized_sources) - where_clause = " AND ".join(clauses) - sql = f""" - SELECT - t.id AS target_id, - t.stratum_id, - t.variable, - t.period, - t.value, - t.target_type, - t.geographic_level, - t.source, - t.source_table, - t.source_url, - t.notes, - s.name AS stratum_name, - s.jurisdiction, - sc.variable AS constraint_variable, - sc.operator AS constraint_operator, - sc.value AS constraint_value - FROM targets AS t - JOIN strata AS s - ON s.id = t.stratum_id - LEFT JOIN stratum_constraints AS sc - ON sc.stratum_id = s.id - WHERE {where_clause} - ORDER BY t.id, sc.variable, sc.operator, sc.value - """ - conn = sqlite3.connect(self.db_path) - conn.row_factory = sqlite3.Row - try: - has_parent_id = _sqlite_table_has_column(conn, "strata", "parent_id") - if has_parent_id: - sql = f""" - WITH target_rows AS ( - SELECT - t.id AS target_id, - t.stratum_id, - t.variable, - t.period, - t.value, - t.target_type, - t.geographic_level, - t.source, - t.source_table, - t.source_url, - t.notes, - s.name AS stratum_name, - s.jurisdiction, - s.parent_id - FROM targets AS t - JOIN strata AS s - ON s.id = t.stratum_id - WHERE {where_clause} - ), - ancestor_strata(target_id, stratum_id, depth) AS ( - SELECT - target_id, - stratum_id, - 0 AS depth - FROM target_rows - UNION ALL - SELECT - a.target_id, - parent.id AS stratum_id, - a.depth + 1 AS depth - FROM ancestor_strata AS a - JOIN strata AS child - ON child.id = a.stratum_id - JOIN strata AS parent - ON parent.id = child.parent_id - WHERE child.parent_id IS NOT NULL - ) - SELECT - tr.target_id, - tr.stratum_id, - tr.variable, - tr.period, - tr.value, - tr.target_type, - tr.geographic_level, - tr.source, - tr.source_table, - tr.source_url, - tr.notes, - tr.stratum_name, - tr.jurisdiction, - sc.variable AS constraint_variable, - sc.operator AS constraint_operator, - sc.value AS constraint_value - FROM target_rows AS tr - LEFT JOIN 
ancestor_strata AS a - ON a.target_id = tr.target_id - LEFT JOIN stratum_constraints AS sc - ON sc.stratum_id = a.stratum_id - ORDER BY - tr.target_id, - a.depth DESC, - sc.variable, - sc.operator, - sc.value - """ - rows = conn.execute(sql, params).fetchall() - finally: - conn.close() - return _group_arch_target_rows(rows) - - def _compose_model_year_records( - self, - *, - target_year: int, - jurisdiction: str, - sources: tuple[str, ...], - age_soi_targets: bool, - ) -> list[ArchTargetRecord]: - current_records = self.load_records( - period=target_year, - jurisdiction=jurisdiction, - sources=sources, - ) - current_records = _with_bea_state_employment_income_before_lsr_records( - current_records - ) - if sources and _normalize_arch_source("IRS_SOI") not in { - _normalize_arch_source(source) for source in sources - }: - carry_forward_records = _latest_carry_forward_records_by_target_cell( - self.load_records( - period=None, - jurisdiction=jurisdiction, - sources=sources, - ), - target_year=target_year, - sources=sources, - ) - return _with_state_to_national_rollup_records( - [ - record - for record in current_records - if not _is_latest_carry_forward_candidate(record) - ] - + carry_forward_records - ) - - non_soi_current_records = [ - record - for record in current_records - if record.source != "IRS_SOI" - and not _is_latest_carry_forward_candidate(record) - ] - carry_forward_records = _latest_carry_forward_records_by_target_cell( - self.load_records( - period=None, - jurisdiction=jurisdiction, - sources=sources, - ), - target_year=target_year, - sources=sources, - ) - soi_records = self._latest_soi_records_by_composition( - target_year=target_year, - jurisdiction=jurisdiction, - ) - if age_soi_targets: - soi_records = self._age_soi_records_by_source_year( - soi_records, - target_year=target_year, - jurisdiction=jurisdiction, - ) - return _with_state_to_national_rollup_records( - [*non_soi_current_records, *carry_forward_records, *soi_records] - ) - - def 
_latest_soi_records_by_composition( - self, - *, - target_year: int, - jurisdiction: str, - ) -> list[ArchTargetRecord]: - """Return the latest SOI records for each target composition.""" - records = [ - record - for record in self.load_records( - period=None, - jurisdiction=jurisdiction, - sources=("IRS_SOI",), - ) - if record.period <= target_year - ] - latest_period_by_key: dict[ - tuple[str, str, str, tuple[tuple[str, str, str], ...]], - int, - ] = {} - for record in records: - key = _arch_record_composition_key(record) - latest_period_by_key[key] = max( - latest_period_by_key.get(key, record.period), - record.period, - ) - return [ - record - for record in records - if record.period - == latest_period_by_key[_arch_record_composition_key(record)] - ] - - def _age_soi_records_by_source_year( - self, - records: list[ArchTargetRecord], - *, - target_year: int, - jurisdiction: str, - ) -> list[ArchTargetRecord]: - aged: list[ArchTargetRecord] = [] - source_years = sorted({record.period for record in records}) - for source_year in source_years: - source_records = [ - record for record in records if record.period == source_year - ] - if source_year == target_year: - aged.extend(source_records) - else: - aged.extend( - self.age_soi_records( - source_records, - source_year=source_year, - target_year=target_year, - jurisdiction=jurisdiction, - ) - ) - return aged - - def latest_soi_year(self, target_year: int, *, jurisdiction: str) -> int | None: - """Return the latest SOI year at or before the model year.""" - variables = tuple(sorted(ARCH_CURRENT_TAX_VARIABLES)) - placeholders = ", ".join("?" for _ in variables) - sql = f""" - SELECT DISTINCT t.period - FROM targets AS t - JOIN strata AS s - ON s.id = t.stratum_id - WHERE {_jurisdiction_clause(jurisdiction)} - AND t.source = 'IRS_SOI' - AND t.period <= ? 
- AND t.variable IN ({placeholders}) - ORDER BY t.period DESC - """ - conn = sqlite3.connect(self.db_path) - try: - rows = conn.execute(sql, [int(target_year), *variables]).fetchall() - finally: - conn.close() - return int(rows[0][0]) if rows else None - - def age_soi_records( - self, - records: list[ArchTargetRecord], - *, - source_year: int, - target_year: int, - jurisdiction: str, - ) -> list[ArchTargetRecord]: - """Age SOI records with declared Microplex-side factors.""" - needs_count_factor = any(record.target_type == "COUNT" for record in records) - needs_amount_factor = any(record.target_type == "AMOUNT" for record in records) - factors = self.get_soi_aging_factors( - source_year=source_year, - target_year=target_year, - jurisdiction=jurisdiction, - needs_count_factor=needs_count_factor, - needs_amount_factor=needs_amount_factor, - ) - aged: list[ArchTargetRecord] = [] - for record in records: - if record.source != "IRS_SOI": - aged.append(record) - continue - if record.target_type == "COUNT": - factor = factors.count_factor - elif record.target_type == "AMOUNT": - factor = factors.amount_factor - else: - factor = 1.0 - aged.append( - replace( - record, - value=float(record.value) * factor, - period=target_year, - source_period=record.period, - aging_factors=factors, - ) - ) - return aged - - def get_soi_aging_factors( - self, - *, - source_year: int, - target_year: int, - jurisdiction: str, - needs_count_factor: bool = True, - needs_amount_factor: bool = True, - ) -> SOIAgingFactors: - """Resolve source-backed factors for SOI count and amount targets.""" - if source_year == target_year: - return SOIAgingFactors( - source_year=source_year, - target_year=target_year, - count_factor=1.0, - amount_factor=1.0, - count_method="identity", - amount_method="identity", - ) - if needs_count_factor: - source_labor_force = self._target_value( - year=source_year, - jurisdiction=jurisdiction, - source="BLS", - variable="labor_force_count", - ) - target_labor_force, 
count_method = self._labor_force_for_year( - year=target_year, - jurisdiction=jurisdiction, - ) - count_factor = target_labor_force / source_labor_force - else: - count_factor = 1.0 - count_method = "not_required" - - if needs_amount_factor: - source_agi = self._soi_total_agi( - year=source_year, jurisdiction=jurisdiction - ) - target_agi, amount_method = self._soi_total_agi_for_year( - target_year=target_year, - jurisdiction=jurisdiction, - ) - amount_factor = target_agi / source_agi - else: - amount_factor = 1.0 - amount_method = "not_required" - - return SOIAgingFactors( - source_year=source_year, - target_year=target_year, - count_factor=count_factor, - amount_factor=amount_factor, - count_method=count_method, - amount_method=amount_method, - ) - - def _labor_force_for_year( - self, - *, - year: int, - jurisdiction: str, - ) -> tuple[float, str]: - bls_value = self._optional_target_value( - year=year, - jurisdiction=jurisdiction, - source="BLS", - variable="labor_force_count", - ) - if bls_value is not None: - return bls_value, "bls_labor_force_ratio" - cbo_value = self._optional_target_value( - year=year, - jurisdiction=jurisdiction, - source="CBO", - variable="labor_force", - ) - if cbo_value is not None: - return cbo_value, "cbo_labor_force_ratio" - raise ValueError(f"No BLS/CBO labor-force target found for {year}.") - - def _soi_total_agi_for_year( - self, - *, - target_year: int, - jurisdiction: str, - ) -> tuple[float, str]: - target_agi = self._optional_soi_total_agi( - year=target_year, - jurisdiction=jurisdiction, - ) - if target_agi is not None: - return target_agi, "soi_total_agi_ratio" - - available = { - year: value - for year in range(target_year - 20, target_year + 1) - if ( - value := self._optional_soi_total_agi( - year=year, - jurisdiction=jurisdiction, - ) - ) - is not None - } - if len(available) < 2: - raise ValueError( - "Need at least two SOI total AGI years to extrapolate " - f"aggregate income to {target_year}." 
- ) - latest_year = max(available) - previous_year = max(year for year in available if year < latest_year) - annual_growth = available[latest_year] / available[previous_year] - years_forward = target_year - latest_year - return ( - available[latest_year] * annual_growth**years_forward, - "soi_total_agi_last_growth_extrapolation", - ) - - def _soi_total_agi(self, *, year: int, jurisdiction: str) -> float: - value = self._optional_soi_total_agi(year=year, jurisdiction=jurisdiction) - if value is None: - raise ValueError(f"No SOI total AGI target found for {year}.") - return value - - def _optional_soi_total_agi(self, *, year: int, jurisdiction: str) -> float | None: - records = self.load_records( - period=year, - jurisdiction=jurisdiction, - sources=("IRS_SOI",), - ) - for record in records: - if ( - record.variable == "adjusted_gross_income" - and record.stratum_name == "US All Filers" - ): - return float(record.value) - for record in records: - if record.variable == "adjusted_gross_income" and record.constraints == ( - ("is_tax_filer", "==", "1"), - ): - return float(record.value) - return None - - def _target_value( - self, - *, - year: int, - jurisdiction: str, - source: str, - variable: str, - ) -> float: - value = self._optional_target_value( - year=year, - jurisdiction=jurisdiction, - source=source, - variable=variable, - ) - if value is None: - raise ValueError(f"No {source} {variable} target found for {year}.") - return value - - def _optional_target_value( - self, - *, - year: int, - jurisdiction: str, - source: str, - variable: str, - ) -> float | None: - records = self.load_records( - period=year, - jurisdiction=jurisdiction, - sources=(source,), - ) - matching = [record for record in records if record.variable == variable] - if not matching: - return None - unconstrained = [record for record in matching if not record.constraints] - if len(unconstrained) == 1: - return float(unconstrained[0].value) - return float(matching[0].value) - - -class 
ArchFactSQLiteTargetProvider: - """Read Arch aggregate facts and expose Microplex canonical targets.""" - - def __init__( - self, - db_path: str | Path, - *, - jurisdiction: str = "us", - compose_model_year_targets: bool = True, - age_soi_targets: bool = True, - ) -> None: - self.db_path = Path(db_path) - self.jurisdiction = jurisdiction - self.compose_model_year_targets = compose_model_year_targets - self.age_soi_targets = age_soi_targets - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load canonical targets from Arch aggregate fact tables.""" - if not self.db_path.exists(): - raise FileNotFoundError(f"Arch facts DB not found: {self.db_path}") - - query = query or TargetQuery() - provider_filters = dict(query.provider_filters) - period = query.period if isinstance(query.period, int) else None - variables = _as_string_tuple(provider_filters.get("variables")) - domain_variables = _as_string_tuple(provider_filters.get("domain_variables")) - sources = _as_string_tuple(provider_filters.get("sources")) - geo_levels = _as_string_tuple(provider_filters.get("geo_levels")) - target_cells = _as_target_cell_filters(provider_filters.get("target_cells")) - entity_overrides = provider_filters.get("entity_overrides") or {} - compose_model_year_targets = bool( - provider_filters.get( - "compose_model_year_targets", - self.compose_model_year_targets, - ) - ) - age_soi_targets = bool( - provider_filters.get("age_soi_targets", self.age_soi_targets) - ) - - records = ( - self._compose_model_year_records( - target_year=period, - sources=sources, - age_soi_targets=age_soi_targets, - ) - if compose_model_year_targets and period is not None - else self.load_records(period=period, sources=sources) - ) - canonical_targets = TargetSet( - [ - target - for record in records - if _matches_arch_provider_filters( - record, - variables=variables, - domain_variables=domain_variables, - geo_levels=geo_levels, - target_cells=target_cells, - 
entity_overrides=entity_overrides, - ) - for target in [ - arch_target_record_to_canonical_spec( - record, - entity_overrides=entity_overrides, - ) - ] - if target is not None - ] - ) - return apply_target_query( - canonical_targets, - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def load_records( - self, - *, - period: int | None = None, - sources: tuple[str, ...] = (), - ) -> list[ArchTargetRecord]: - """Load Arch fact rows with attached fact constraints and lineage.""" - conn = sqlite3.connect(self.db_path) - conn.row_factory = sqlite3.Row - try: - clauses = ["1 = 1"] - params: list[Any] = [] - if period is not None: - clauses.append("CAST(af.period_value AS INTEGER) = ?") - params.append(int(period)) - where_clause = " AND ".join(clauses) - rows = conn.execute( - f""" - SELECT - af.fact_key, - af.source_record_id, - af.value_numeric, - af.value_text, - af.value_json, - af.period_value, - af.geography_level, - af.geography_id, - af.geography_name, - af.measure_concept, - af.measure_source_concept, - af.measure_concept_relation, - af.measure_concept_authority, - af.measure_concept_evidence_url, - af.measure_concept_evidence_notes, - af.measure_legal_vintage, - af.measure_unit, - af.aggregation_method, - af.domain, - af.filters_json, - af.label, - af.source_name, - af.source_table, - af.source_url, - af.source_method_notes, - ac.ordinal AS constraint_ordinal, - ac.variable AS constraint_variable, - ac.operator AS constraint_operator, - ac.value_text AS constraint_value_text, - ac.value_numeric AS constraint_value_numeric, - ac.value_json AS constraint_value_json - FROM aggregate_facts AS af - LEFT JOIN aggregate_constraints AS ac - ON ac.fact_key = af.fact_key - WHERE {where_clause} - ORDER BY af.fact_key, ac.ordinal - """, - params, - ).fetchall() - lineage = _load_arch_fact_lineage(conn) - finally: - conn.close() - - records = _group_arch_fact_rows(rows, lineage=lineage) - 
if sources: - normalized_sources = {_normalize_arch_source(source) for source in sources} - records = [ - record - for record in records - if _normalize_arch_source(record.source) in normalized_sources - ] - return records - - def _compose_model_year_records( - self, - *, - target_year: int, - sources: tuple[str, ...], - age_soi_targets: bool, - ) -> list[ArchTargetRecord]: - return _compose_arch_model_year_records( - self.load_records(period=None, sources=()), - target_year=target_year, - sources=sources, - age_soi_targets=age_soi_targets, - ) - - -class ArchConsumerFactJSONLTargetProvider: - """Read Arch consumer-contract JSONL facts as Microplex targets.""" - - schema_version = "arch.consumer_fact.v1" - - def __init__( - self, - path: str | Path, - *, - jurisdiction: str = "us", - compose_model_year_targets: bool = True, - age_soi_targets: bool = True, - ) -> None: - self.path = Path(path) - self.jurisdiction = jurisdiction - self.compose_model_year_targets = compose_model_year_targets - self.age_soi_targets = age_soi_targets - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load canonical targets from Arch consumer-contract JSONL.""" - if not self.path.exists(): - raise FileNotFoundError(f"Arch consumer facts JSONL not found: {self.path}") - - query = query or TargetQuery() - provider_filters = dict(query.provider_filters) - period = query.period if isinstance(query.period, int) else None - variables = _as_string_tuple(provider_filters.get("variables")) - domain_variables = _as_string_tuple(provider_filters.get("domain_variables")) - sources = _as_string_tuple(provider_filters.get("sources")) - geo_levels = _as_string_tuple(provider_filters.get("geo_levels")) - target_cells = _as_target_cell_filters(provider_filters.get("target_cells")) - entity_overrides = provider_filters.get("entity_overrides") or {} - compose_model_year_targets = bool( - provider_filters.get( - "compose_model_year_targets", - self.compose_model_year_targets, 
- ) - ) - age_soi_targets = bool( - provider_filters.get("age_soi_targets", self.age_soi_targets) - ) - - records = ( - self._compose_model_year_records( - target_year=period, - sources=sources, - age_soi_targets=age_soi_targets, - ) - if compose_model_year_targets and period is not None - else self.load_records(period=period, sources=sources) - ) - canonical_targets = TargetSet( - [ - target - for record in records - if _matches_arch_provider_filters( - record, - variables=variables, - domain_variables=domain_variables, - geo_levels=geo_levels, - target_cells=target_cells, - entity_overrides=entity_overrides, - ) - for target in [ - arch_target_record_to_canonical_spec( - record, - entity_overrides=entity_overrides, - ) - ] - if target is not None - ] - ) - return apply_target_query( - canonical_targets, - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def load_records( - self, - *, - period: int | None = None, - sources: tuple[str, ...] 
= (), - ) -> list[ArchTargetRecord]: - """Load Arch consumer-contract fact rows.""" - if not self.path.exists(): - raise FileNotFoundError(f"Arch consumer facts JSONL not found: {self.path}") - - rows = list( - load_arch_consumer_fact_jsonl_rows( - (self.path,), - period=period, - schema_version=self.schema_version, - ) - ) - - records = _consumer_fact_rows_to_records(rows) - if sources: - normalized_sources = {_normalize_arch_source(source) for source in sources} - records = [ - record - for record in records - if _normalize_arch_source(record.source) in normalized_sources - ] - return records - - def _compose_model_year_records( - self, - *, - target_year: int, - sources: tuple[str, ...], - age_soi_targets: bool, - ) -> list[ArchTargetRecord]: - return _compose_arch_model_year_records( - self.load_records(period=None, sources=()), - target_year=target_year, - sources=sources, - age_soi_targets=age_soi_targets, - ) - - -class ArchCompositeSQLiteTargetProvider: - """Compose multiple Arch SQLite artifacts into one target provider.""" - - def __init__( - self, - db_paths: tuple[str | Path, ...], - *, - jurisdiction: str = "us", - compose_model_year_targets: bool = True, - age_soi_targets: bool = True, - ) -> None: - paths = tuple(Path(path) for path in db_paths) - if not paths: - raise ValueError("At least one Arch targets DB path is required") - self.db_paths = paths - self.path = tuple(str(path) for path in paths) - self.jurisdiction = jurisdiction - self.compose_model_year_targets = compose_model_year_targets - self.age_soi_targets = age_soi_targets - self.providers = tuple( - resolve_arch_sqlite_target_provider( - path, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - for path in paths - ) - - def load_target_set(self, query: TargetQuery | None = None) -> TargetSet: - """Load and renumber targets across all configured Arch artifacts.""" - query = query or TargetQuery() - 
provider_filters = dict(query.provider_filters) - period = query.period if isinstance(query.period, int) else None - variables = _as_string_tuple(provider_filters.get("variables")) - domain_variables = _as_string_tuple(provider_filters.get("domain_variables")) - sources = _as_string_tuple(provider_filters.get("sources")) - geo_levels = _as_string_tuple(provider_filters.get("geo_levels")) - target_cells = _as_target_cell_filters(provider_filters.get("target_cells")) - entity_overrides = provider_filters.get("entity_overrides") or {} - compose_model_year_targets = bool( - provider_filters.get( - "compose_model_year_targets", - self.compose_model_year_targets, - ) - ) - age_soi_targets = bool( - provider_filters.get("age_soi_targets", self.age_soi_targets) - ) - - records = self.load_records( - period=period, - sources=sources, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - stratum_ids: dict[tuple[tuple[str, str, Any], ...], int] = {} - targets: list[CanonicalTargetSpec] = [] - for record in records: - if not _matches_arch_provider_filters( - record, - variables=variables, - domain_variables=domain_variables, - geo_levels=geo_levels, - target_cells=target_cells, - entity_overrides=entity_overrides, - ): - continue - target = arch_target_record_to_canonical_spec( - record, - entity_overrides=entity_overrides, - ) - if target is None: - continue - metadata = dict(target.metadata) - metadata["stratum_id"] = stratum_ids.setdefault( - _target_filter_tuple(target), - len(stratum_ids) + 1, - ) - targets.append( - replace( - target, - name=f"arch_target_{metadata['target_id']}", - metadata=metadata, - ) - ) - return apply_target_query( - TargetSet(targets), - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - ), - ) - - def load_records( - self, - *, - period: int | None = None, - sources: tuple[str, ...] 
= (), - compose_model_year_targets: bool | None = None, - age_soi_targets: bool | None = None, - ) -> list[ArchTargetRecord]: - """Load and renumber raw records across configured Arch artifacts.""" - records = self._load_all_child_records() - resolved_compose = ( - self.compose_model_year_targets - if compose_model_year_targets is None - else compose_model_year_targets - ) - resolved_age_soi = ( - self.age_soi_targets if age_soi_targets is None else age_soi_targets - ) - if resolved_compose and period is not None: - records = _compose_arch_model_year_records( - records, - target_year=period, - sources=sources, - age_soi_targets=resolved_age_soi, - ) - else: - records = [ - record - for record in records - if (period is None or record.period == period) - and _record_matches_sources(record, sources) - ] - return _renumber_arch_records(records) - - def _load_all_child_records(self) -> list[ArchTargetRecord]: - records: list[ArchTargetRecord] = [] - seen_fact_keys: set[str] = set() - for source_index, (path, provider) in enumerate( - zip(self.db_paths, self.providers, strict=True), - start=1, - ): - provider_records = _load_arch_provider_raw_records( - provider, - jurisdiction=self.jurisdiction, - ) - for record in provider_records: - if record.aggregate_fact_key is not None: - if record.aggregate_fact_key in seen_fact_keys: - continue - seen_fact_keys.add(record.aggregate_fact_key) - records.append( - replace( - record, - source_db_path=str(path), - source_db_index=source_index, - source_target_id=record.source_target_id or record.target_id, - source_stratum_id=( - record.source_stratum_id or record.stratum_id - ), - ) - ) - return records - - -def _load_arch_provider_raw_records( - provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - *, - jurisdiction: str, -) -> list[ArchTargetRecord]: - if isinstance( - provider, - (ArchFactSQLiteTargetProvider, 
ArchConsumerFactJSONLTargetProvider), - ): - return provider.load_records(period=None, sources=()) - if isinstance(provider, ArchCompositeSQLiteTargetProvider): - return provider._load_all_child_records() - return provider.load_records(period=None, jurisdiction=jurisdiction, sources=()) - - -def _compose_arch_model_year_records( - records: list[ArchTargetRecord], - *, - target_year: int, - sources: tuple[str, ...], - age_soi_targets: bool, -) -> list[ArchTargetRecord]: - current_records = [ - record - for record in records - if record.period == target_year and _record_matches_sources(record, sources) - ] - current_records = _with_bea_state_employment_income_before_lsr_records( - current_records - ) - normalized_sources = {_normalize_arch_source(source) for source in sources} - if sources and _normalize_arch_source("IRS_SOI") not in normalized_sources: - carry_forward_records = _latest_carry_forward_records_by_target_cell( - records, - target_year=target_year, - sources=sources, - ) - return _with_state_to_national_rollup_records( - [ - record - for record in current_records - if not _is_latest_carry_forward_candidate(record) - ] - + carry_forward_records - ) - - non_soi_current_records = [ - record - for record in current_records - if _normalize_arch_source(record.source) != "IRS_SOI" - and not _is_latest_carry_forward_candidate(record) - ] - carry_forward_records = _latest_carry_forward_records_by_target_cell( - records, - target_year=target_year, - sources=sources, - ) - soi_records = _latest_soi_records_by_composition( - records, - target_year=target_year, - ) - if age_soi_targets: - soi_records = _age_arch_soi_records_by_source_year( - soi_records, - target_year=target_year, - reference_records=records, - ) - else: - soi_records = [ - _carry_forward_arch_record_to_model_year(record, target_year=target_year) - for record in soi_records - ] - return _with_state_to_national_rollup_records( - [*non_soi_current_records, *carry_forward_records, *soi_records] - ) - - 
-def _latest_carry_forward_records_by_target_cell( - records: list[ArchTargetRecord], - *, - target_year: int, - sources: tuple[str, ...], -) -> list[ArchTargetRecord]: - latest_by_cell: dict[tuple[Any, ...], tuple[tuple[Any, ...], ArchTargetRecord]] = {} - for record in records: - if record.period > target_year: - continue - if not _record_matches_sources(record, sources): - continue - if not _is_latest_carry_forward_candidate(record): - continue - target = arch_target_record_to_canonical_spec(record) - if target is None: - continue - cell_key = _arch_target_carry_forward_cell_key(target) - rank = _latest_carry_forward_record_rank(record) - current = latest_by_cell.get(cell_key) - if current is None or rank > current[0]: - latest_by_cell[cell_key] = (rank, record) - return [ - ( - record - if record.period == target_year - else _carry_forward_arch_record_to_model_year( - record, - target_year=target_year, - ) - ) - for _, record in sorted( - latest_by_cell.values(), - key=lambda item: ( - _arch_record_geo_level(item[1]), - item[1].variable, - item[1].target_type, - tuple(sorted(item[1].constraints)), - item[1].target_id, - ), - ) - ] - - -def _is_latest_carry_forward_candidate(record: ArchTargetRecord) -> bool: - return ( - _normalize_arch_source(record.source) == "SSA" - and record.variable in ARCH_LATEST_CARRY_FORWARD_VARIABLES - and record.target_type in {"AMOUNT", "COUNT"} - ) - - -def _arch_target_carry_forward_cell_key( - target: CanonicalTargetSpec, -) -> tuple[Any, ...]: - domain_variables = _arch_target_domain_variables(target) - if _target_self_domain_is_redundant(target, domain_variables): - domain_variables = set() - return ( - tuple(sorted(_arch_target_cell_variables(target))), - getattr(target.aggregation, "value", target.aggregation), - _normalize_geo_level(target.metadata.get("geo_level")), - tuple(sorted(domain_variables)), - _arch_target_geographic_id(target), - ) - - -def _latest_carry_forward_record_rank(record: ArchTargetRecord) -> tuple[Any, 
...]: - source_table = str(record.source_table or "").lower() - return ( - int(record.period), - "annual statistical report" in source_table, - bool(record.source_table), - record.variable == "ssi_total_payments", - int(record.target_id), - ) - - -def _with_state_to_national_rollup_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - expanded_records = _with_component_sum_records(records) - rollups = _state_to_national_rollup_records(expanded_records) - if not rollups: - return expanded_records - return [*expanded_records, *rollups] - - -def _with_component_sum_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - component_records = _component_sum_records(records) - if not component_records: - return records - return [*records, *component_records] - - -def _component_sum_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - existing_keys = { - _component_sum_record_key(record, output_variable=record.variable) - for record in records - if record.target_type == "AMOUNT" - } - grouped: dict[ - tuple[Any, ...], - dict[str, ArchTargetRecord], - ] = {} - for record in records: - if record.target_type != "AMOUNT": - continue - for output_variable, component_variables in ARCH_COMPONENT_SUM_TARGETS.items(): - if record.variable not in component_variables: - continue - key = _component_sum_record_key(record, output_variable=output_variable) - if key in existing_keys: - continue - components = grouped.setdefault(key, {}) - if record.variable in components: - components.clear() - break - components[record.variable] = record - - composite_records: list[ArchTargetRecord] = [] - for key, components_by_variable in grouped.items(): - output_variable = str(key[0]) - component_variables = ARCH_COMPONENT_SUM_TARGETS[output_variable] - if set(components_by_variable) != set(component_variables): - continue - composite_records.append( - _component_records_to_sum_record( - key, - [ - 
components_by_variable[component_variable] - for component_variable in component_variables - ], - ) - ) - return composite_records - - -def _component_sum_record_key( - record: ArchTargetRecord, - *, - output_variable: str, -) -> tuple[Any, ...]: - return ( - output_variable, - record.target_type, - record.period, - _arch_record_geo_level(record), - record.geography_id, - tuple(sorted(record.constraints)), - _normalize_arch_source(record.source), - record.source_period, - record.aging_factors, - record.unit, - ) - - -def _component_records_to_sum_record( - key: tuple[Any, ...], - records: list[ArchTargetRecord], -) -> ArchTargetRecord: - first = records[0] - digest = sha1(repr(key).encode("utf-8")).hexdigest() - component_labels = ", ".join(record.variable for record in records) - source_tables = tuple( - dict.fromkeys(record.source_table for record in records if record.source_table) - ) - source_urls = tuple( - dict.fromkeys(record.source_url for record in records if record.source_url) - ) - source_row_keys = tuple( - dict.fromkeys( - source_row_key - for record in records - for source_row_key in ( - record.source_row_keys - or (str(record.source_target_id or record.target_id),) - ) - ) - ) - source_cell_keys = tuple( - dict.fromkeys( - source_cell_key - for record in records - for source_cell_key in record.source_cell_keys - ) - ) - notes = ( - "Microplex component sum matching PolicyEngine salt sources: " - f"{component_labels}." 
- ) - return replace( - first, - target_id=-int(digest[:12], 16), - stratum_id=-int(digest[12:20], 16), - variable=str(key[0]), - value=sum(record.value for record in records), - source_table=( - source_tables[0] - if len(source_tables) == 1 - else "Microplex component sum from Arch source tables" - ), - source_url=source_urls[0] if len(source_urls) == 1 else None, - notes=f"{first.notes} {notes}" if first.notes else notes, - source_record_id=f"microplex_component_sum:{digest[:16]}", - source_cell_keys=source_cell_keys, - source_row_keys=source_row_keys, - aggregate_fact_key=None, - semantic_fact_key=None, - source_target_id=None, - source_stratum_id=None, - concept=None, - source_concept=None, - concept_relation="sum_of_components", - concept_authority="policyengine_us", - concept_evidence_notes=notes, - ) - - -def _state_to_national_rollup_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - existing_national_keys = { - key - for record in records - if _arch_record_geo_level(record) == "national" - for key in [_state_rollup_group_key(record)] - if key is not None - } - grouped: dict[tuple[Any, ...], list[tuple[str, ArchTargetRecord]]] = {} - for record in records: - if _arch_record_geo_level(record) != "state": - continue - key = _state_rollup_group_key(record) - if key is None or key in existing_national_keys: - continue - state_fips = _arch_record_state_fips(record) - if state_fips is None or state_fips not in ARCH_NATIONAL_ROLLUP_STATE_FIPS: - continue - grouped.setdefault(key, []).append((state_fips, record)) - - rollups: list[ArchTargetRecord] = [] - for key, state_records in grouped.items(): - records_by_state: dict[str, ArchTargetRecord] = {} - for state_fips, record in state_records: - if state_fips in records_by_state: - records_by_state = {} - break - records_by_state[state_fips] = record - if set(records_by_state) != ARCH_NATIONAL_ROLLUP_STATE_FIPS: - continue - ordered_records = [ - records_by_state[state_fips] - for state_fips 
in sorted(ARCH_NATIONAL_ROLLUP_STATE_FIPS) - ] - rollups.append( - _state_records_to_national_rollup_record( - key, - ordered_records, - ) - ) - return rollups - - -def _state_rollup_group_key(record: ArchTargetRecord) -> tuple[Any, ...] | None: - if record.variable not in ARCH_STATE_TO_NATIONAL_ROLLUP_VARIABLES: - return None - return ( - _normalize_arch_source(record.source), - record.source_table, - record.source_url, - record.variable, - record.target_type, - record.period, - record.source_period, - record.aging_factors, - record.unit, - record.concept, - record.source_concept, - record.concept_relation, - record.concept_authority, - record.legal_vintage, - _non_state_constraints(record.constraints), - ) - - -def _state_records_to_national_rollup_record( - key: tuple[Any, ...], - records: list[ArchTargetRecord], -) -> ArchTargetRecord: - first = records[0] - digest = sha1(repr(key).encode("utf-8")).hexdigest() - source_row_keys = tuple( - dict.fromkeys( - source_row_key - for record in records - for source_row_key in ( - record.source_row_keys - or (str(record.source_target_id or record.target_id),) - ) - ) - ) - source_cell_keys = tuple( - dict.fromkeys( - source_cell_key - for record in records - for source_cell_key in record.source_cell_keys - ) - ) - notes = "Microplex national rollup from 51 state targets." 
- if first.notes: - notes = f"{first.notes} {notes}" - return replace( - first, - target_id=-int(digest[:12], 16), - stratum_id=-int(digest[12:20], 16), - value=sum(record.value for record in records), - geographic_level=None, - geography_id=None, - stratum_name="US National Rollup", - constraints=_non_state_constraints(first.constraints), - notes=notes, - source_record_id=f"microplex_state_rollup:{digest[:16]}", - source_cell_keys=source_cell_keys, - source_row_keys=source_row_keys, - source_target_id=None, - source_stratum_id=None, - ) - - -def _with_bea_state_employment_income_before_lsr_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - derived = _bea_state_employment_income_before_lsr_records(records) - if not derived: - return records - return [*records, *derived] - - -def _bea_state_employment_income_before_lsr_records( - records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - national_wages = _bea_nipa_wages_record(records) - if national_wages is None: - return [] - - required_states = set(ARCH_NATIONAL_ROLLUP_STATE_FIPS) - components_by_state: dict[str, dict[str, ArchTargetRecord]] = {} - for record in records: - if _normalize_arch_source(record.source) != "BEA": - continue - component = ARCH_BEA_REGIONAL_WAGE_COMPONENTS.get(record.variable) - if component is None: - continue - if _arch_record_geo_level(record) != "state": - continue - state_fips = _arch_record_state_fips(record) - if state_fips is None or state_fips not in required_states: - continue - components_by_state.setdefault(state_fips, {}).setdefault(component, record) - - required_components = set(ARCH_BEA_REGIONAL_WAGE_COMPONENTS.values()) - if not required_states or set(components_by_state) != required_states: - return [] - if any( - set(state_components) != required_components - for state_components in components_by_state.values() - ): - return [] - - adjusted_by_state: dict[str, float] = {} - for state_fips, components in components_by_state.items(): - wages 
= components["wages"].value - supplements = components["supplements"].value - contributions = components["contributions"].value - residence_adjustment = components["residence_adjustment"].value - denominator = wages + supplements + contributions - if denominator <= 0: - return [] - adjusted_by_state[state_fips] = ( - wages + residence_adjustment * wages / denominator - ) - - adjusted_total = sum(adjusted_by_state.values()) - if adjusted_total <= 0: - return [] - scale_factor = national_wages.value / adjusted_total - - return [ - _bea_state_employment_income_before_lsr_record( - state_fips=state_fips, - state_components=components_by_state[state_fips], - national_wages=national_wages, - value=adjusted_by_state[state_fips] * scale_factor, - scale_factor=scale_factor, - ) - for state_fips in sorted(components_by_state) - ] - - -def _bea_nipa_wages_record( - records: list[ArchTargetRecord], -) -> ArchTargetRecord | None: - candidates = [ - record - for record in records - if _normalize_arch_source(record.source) == "BEA" - and record.variable == ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE - and record.target_type == "AMOUNT" - and _arch_record_geo_level(record) in {"national", "country"} - and ( - record.concept == "bea_nipa.wages_and_salaries" - or record.source_concept == "bea_nipa.a034rc_wages_and_salaries" - or "nipa" in str(record.source_record_id or "").lower() - ) - ] - if not candidates: - return None - return sorted( - candidates, - key=lambda record: ( - record.concept == "bea_nipa.wages_and_salaries", - bool(record.source_record_id), - int(record.target_id), - ), - reverse=True, - )[0] - - -def _bea_state_employment_income_before_lsr_record( - *, - state_fips: str, - state_components: dict[str, ArchTargetRecord], - national_wages: ArchTargetRecord, - value: float, - scale_factor: float, -) -> ArchTargetRecord: - component_records = tuple( - state_components[component] - for component in ( - "wages", - "supplements", - "contributions", - 
"residence_adjustment", - ) - ) - first = component_records[0] - digest = sha1( - repr( - ( - "bea_state_employment_income_before_lsr", - first.period, - state_fips, - tuple(record.source_record_id or record.target_id for record in component_records), - national_wages.source_record_id or national_wages.target_id, - ) - ).encode("utf-8") - ).hexdigest() - source_cell_keys = tuple( - dict.fromkeys( - source_cell_key - for record in (*component_records, national_wages) - for source_cell_key in record.source_cell_keys - ) - ) - source_row_keys = tuple( - dict.fromkeys( - source_row_key - for record in (*component_records, national_wages) - for source_row_key in ( - record.source_row_keys - or (str(record.source_record_id or record.target_id),) - ) - ) - ) - state_abbr = US_STATE_ABBR_BY_FIPS.get(state_fips, state_fips) - notes = ( - "Microplex derived BEA state employment_income_before_lsr from " - "SAINC5N line 50 wages, line 60 supplements, line 36 contributions, " - "and line 42 residence adjustment. Residence adjustment is allocated " - "to wages by wages / (wages + supplements + contributions), then " - f"scaled to national BEA NIPA wages with factor {scale_factor:.12g}." 
- ) - return replace( - first, - target_id=-int(digest[:12], 16), - stratum_id=-int(digest[12:20], 16), - variable=ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE, - value=float(value), - target_type="AMOUNT", - source_table="BEA Regional SAINC5N residence-adjusted state wages", - source_url=first.source_url or national_wages.source_url, - notes=notes, - stratum_name=f"{state_abbr} residence-adjusted wages", - constraints=(), - aggregate_fact_key=f"microplex.derived.bea_state_wages.{first.period}.{state_fips}", - semantic_fact_key=f"microplex.semantic.bea_state_wages.{first.period}.{state_fips}", - source_record_id=f"microplex.derived.bea_state_wages.{first.period}.{state_fips}", - source_cell_keys=source_cell_keys, - source_row_keys=source_row_keys, - concept="policyengine_us.employment_income_before_lsr", - source_concept="bea_regional.sainc5n_residence_adjusted_wages_scaled_to_nipa", - concept_relation="derived", - concept_authority="microplex-us", - concept_evidence_url=national_wages.concept_evidence_url - or first.concept_evidence_url, - concept_evidence_notes=notes, - legal_vintage=national_wages.legal_vintage or first.legal_vintage, - source_target_id=None, - source_stratum_id=None, - ) - - -def _non_state_constraints( - constraints: tuple[tuple[str, str, str], ...], -) -> tuple[tuple[str, str, str], ...]: - return tuple( - constraint for constraint in constraints if constraint[0] != "state_fips" - ) - - -def _arch_record_state_fips(record: ArchTargetRecord) -> str | None: - for variable, operator, value in record.constraints: - if variable != "state_fips": - continue - if _canonical_arch_constraint_operator(operator) != "==": - continue - try: - return str(int(float(value))).zfill(2) - except (TypeError, ValueError): - return str(value).zfill(2) - if _normalize_geo_level(record.geographic_level) == "state": - geography_id = record.geography_id - if geography_id is not None: - return _state_fips_from_arch_geography_id(geography_id) - return None - - 
-def _latest_soi_records_by_composition( - records: list[ArchTargetRecord], - *, - target_year: int, -) -> list[ArchTargetRecord]: - candidates = [ - record - for record in records - if _normalize_arch_source(record.source) == "IRS_SOI" - and record.period <= target_year - ] - latest_period_by_key: dict[ - tuple[str, str, str, tuple[tuple[str, str, str], ...]], - int, - ] = {} - for record in candidates: - key = _arch_record_composition_key(record) - latest_period_by_key[key] = max( - latest_period_by_key.get(key, record.period), - record.period, - ) - return [ - record - for record in candidates - if record.period == latest_period_by_key[_arch_record_composition_key(record)] - ] - - -def _age_arch_soi_records_by_source_year( - records: list[ArchTargetRecord], - *, - target_year: int, - reference_records: list[ArchTargetRecord], -) -> list[ArchTargetRecord]: - aged: list[ArchTargetRecord] = [] - for source_year in sorted({record.period for record in records}): - source_records = [record for record in records if record.period == source_year] - if source_year == target_year: - aged.extend(source_records) - continue - needs_count_factor = any( - record.target_type == "COUNT" for record in source_records - ) - needs_amount_factor = any( - record.target_type == "AMOUNT" for record in source_records - ) - factors = _arch_record_soi_aging_factors( - reference_records, - source_year=source_year, - target_year=target_year, - needs_count_factor=needs_count_factor, - needs_amount_factor=needs_amount_factor, - ) - for record in source_records: - factor = 1.0 - if record.target_type == "COUNT": - factor = factors.count_factor - elif record.target_type == "AMOUNT": - factor = factors.amount_factor - aged.append( - replace( - record, - value=float(record.value) * factor, - period=target_year, - source_period=record.period, - aging_factors=factors, - ) - ) - return aged - - -def _carry_forward_arch_record_to_model_year( - record: ArchTargetRecord, - *, - target_year: int, -) -> 
ArchTargetRecord: - if record.period == target_year: - return record - return replace(record, period=target_year, source_period=record.period) - - -def _arch_record_soi_aging_factors( - records: list[ArchTargetRecord], - *, - source_year: int, - target_year: int, - needs_count_factor: bool, - needs_amount_factor: bool, -) -> SOIAgingFactors: - if source_year == target_year: - return SOIAgingFactors( - source_year=source_year, - target_year=target_year, - count_factor=1.0, - amount_factor=1.0, - count_method="identity", - amount_method="identity", - ) - - if needs_count_factor: - count_factor, count_method = _arch_record_soi_count_aging_factor( - records, - source_year=source_year, - target_year=target_year, - ) - else: - count_factor = 1.0 - count_method = "not_required" - - if needs_amount_factor: - amount_factor, amount_method = _arch_record_soi_amount_aging_factor( - records, - source_year=source_year, - target_year=target_year, - ) - else: - amount_factor = 1.0 - amount_method = "not_required" - - return SOIAgingFactors( - source_year=source_year, - target_year=target_year, - count_factor=count_factor, - amount_factor=amount_factor, - count_method=count_method, - amount_method=amount_method, - ) - - -def _arch_record_soi_count_aging_factor( - records: list[ArchTargetRecord], - *, - source_year: int, - target_year: int, -) -> tuple[float, str]: - source_labor_force = _optional_arch_total_value( - records, - year=source_year, - source="BLS", - variable="labor_force_count", - ) - target_labor_force, labor_force_method = _optional_arch_labor_force_for_year( - records, - year=target_year, - ) - if source_labor_force is not None and target_labor_force is not None: - return target_labor_force / source_labor_force, labor_force_method - - source_count = _optional_arch_soi_total_value( - records, - year=source_year, - variable="tax_unit_count", - ) - target_count, count_method = _arch_soi_total_for_year( - records, - target_year=target_year, - variable="tax_unit_count", 
- exact_method="soi_total_return_count_ratio", - extrapolation_method="soi_total_return_count_last_growth_extrapolation", - ) - if source_count is not None and target_count is not None: - return target_count / source_count, count_method - return 1.0, "source_fact_carry_forward_no_count_reference" - - -def _arch_record_soi_amount_aging_factor( - records: list[ArchTargetRecord], - *, - source_year: int, - target_year: int, -) -> tuple[float, str]: - source_agi = _optional_arch_soi_total_value( - records, - year=source_year, - variable="adjusted_gross_income", - ) - target_agi, amount_method = _arch_soi_total_for_year( - records, - target_year=target_year, - variable="adjusted_gross_income", - exact_method="soi_total_agi_ratio", - extrapolation_method="soi_total_agi_last_growth_extrapolation", - ) - if source_agi is not None and target_agi is not None: - return target_agi / source_agi, amount_method - return 1.0, "source_fact_carry_forward_no_amount_reference" - - -def _optional_arch_labor_force_for_year( - records: list[ArchTargetRecord], - *, - year: int, -) -> tuple[float | None, str]: - bls_value = _optional_arch_total_value( - records, - year=year, - source="BLS", - variable="labor_force_count", - ) - if bls_value is not None: - return bls_value, "bls_labor_force_ratio" - cbo_value = _optional_arch_total_value( - records, - year=year, - source="CBO", - variable="labor_force", - ) - if cbo_value is not None: - return cbo_value, "cbo_labor_force_ratio" - return None, "source_fact_carry_forward_no_labor_force_reference" - - -def _arch_soi_total_for_year( - records: list[ArchTargetRecord], - *, - target_year: int, - variable: str, - exact_method: str, - extrapolation_method: str, -) -> tuple[float | None, str]: - exact = _optional_arch_soi_total_value( - records, - year=target_year, - variable=variable, - ) - if exact is not None: - return exact, exact_method - - available = { - year: value - for year in sorted({record.period for record in records}) - if year <= 
target_year - for value in [ - _optional_arch_soi_total_value( - records, - year=year, - variable=variable, - ) - ] - if value is not None - } - if len(available) < 2: - return None, f"source_fact_carry_forward_no_{variable}_reference" - latest_year = max(available) - previous_year = max(year for year in available if year < latest_year) - annual_growth = available[latest_year] / available[previous_year] - years_forward = target_year - latest_year - return available[latest_year] * annual_growth**years_forward, extrapolation_method - - -def _optional_arch_soi_total_value( - records: list[ArchTargetRecord], - *, - year: int, - variable: str, -) -> float | None: - return _optional_arch_total_value( - records, - year=year, - source="IRS_SOI", - variable=variable, - require_total_scope=True, - ) - - -def _optional_arch_total_value( - records: list[ArchTargetRecord], - *, - year: int, - source: str, - variable: str, - require_total_scope: bool = False, -) -> float | None: - matches = [ - record - for record in records - if record.period == year - and _normalize_arch_source(record.source) == _normalize_arch_source(source) - and record.variable == variable - ] - if require_total_scope: - matches = [record for record in matches if _arch_record_is_total_scope(record)] - if not matches: - return None - return float(matches[0].value) - - -def _arch_record_is_total_scope(record: ArchTargetRecord) -> bool: - if not record.constraints: - return True - if tuple(record.constraints) == (("is_tax_filer", "==", "1"),): - return True - if tuple(record.constraints) == (("tax_unit_is_filer", "==", "1"),): - return True - return str(record.stratum_name or "").lower().endswith("all filers") - - -def _record_matches_sources( - record: ArchTargetRecord, - sources: tuple[str, ...], -) -> bool: - if not sources: - return True - normalized_sources = {_normalize_arch_source(source) for source in sources} - return _normalize_arch_source(record.source) in normalized_sources - - -def 
_renumber_arch_records(records: list[ArchTargetRecord]) -> list[ArchTargetRecord]: - renumbered: list[ArchTargetRecord] = [] - stratum_ids: dict[tuple[tuple[str, str, str], ...], int] = {} - for record in records: - renumbered.append( - replace( - record, - target_id=len(renumbered) + 1, - stratum_id=stratum_ids.setdefault( - record.constraints, - len(stratum_ids) + 1, - ), - ) - ) - return renumbered - - -def resolve_arch_sqlite_target_provider( - db_path: str | Path | tuple[str | Path, ...], - *, - jurisdiction: str = "us", - compose_model_year_targets: bool = True, - age_soi_targets: bool = True, -) -> ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider -): - """Return the Arch provider matching a source artifact's schema.""" - paths = _as_arch_db_path_tuple(db_path) - if len(paths) > 1: - return ArchCompositeSQLiteTargetProvider( - paths, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - path = paths[0] - if not path.exists(): - raise FileNotFoundError(f"Arch targets DB not found: {path}") - if _looks_like_arch_consumer_fact_jsonl(path): - return ArchConsumerFactJSONLTargetProvider( - path, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - conn = sqlite3.connect(path) - try: - if _sqlite_table_exists(conn, "aggregate_facts"): - return ArchFactSQLiteTargetProvider( - path, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - finally: - conn.close() - return ArchSQLiteTargetProvider( - path, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - - -def summarize_arch_target_profile_coverage( - provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | 
ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - *, - period: int, - profile_name: str = "pe_native_broad", - target_cells: tuple[PolicyEngineUSTargetCell | dict[str, Any], ...] | None = None, - sources: tuple[str, ...] = (), - jurisdiction: str | None = None, - compose_model_year_targets: bool | None = None, - age_soi_targets: bool | None = None, - entity_overrides: dict[str, Any] | None = None, - provider_filters: dict[str, Any] | None = None, -) -> ArchTargetProfileCoverageReport: - """Summarize how much of a Microplex target profile Arch can satisfy.""" - - resolved_cells = ( - tuple(target_cells) - if target_cells is not None - else resolve_policyengine_us_target_profile(profile_name) - ) - cell_filters = tuple( - _target_cell_to_provider_filter(cell) for cell in resolved_cells - ) - query_filters: dict[str, Any] = dict(provider_filters or {}) - query_filters["target_profile"] = profile_name - query_filters["target_cells"] = [dict(cell) for cell in cell_filters] - if sources: - query_filters["sources"] = list(sources) - if jurisdiction is not None: - query_filters["jurisdiction"] = jurisdiction - if compose_model_year_targets is not None: - query_filters["compose_model_year_targets"] = compose_model_year_targets - if age_soi_targets is not None: - query_filters["age_soi_targets"] = age_soi_targets - if entity_overrides is not None: - query_filters["entity_overrides"] = entity_overrides - - target_set = provider.load_target_set( - TargetQuery(period=period, provider_filters=query_filters) - ) - coverage_cells = tuple( - _coverage_for_arch_target_cell(cell_filter, target_set) - for cell_filter in cell_filters - ) - target_cell_count = len(coverage_cells) - covered_cell_count = sum(1 for cell in coverage_cells if cell.covered) - uncovered_cell_count = target_cell_count - covered_cell_count - coverage_rate = covered_cell_count / target_cell_count if target_cell_count else 0.0 - return ArchTargetProfileCoverageReport( - 
profile_name=profile_name, - period=int(period), - target_cell_count=target_cell_count, - covered_cell_count=covered_cell_count, - uncovered_cell_count=uncovered_cell_count, - coverage_rate=coverage_rate, - by_geo_level=_summarize_arch_cell_coverage(coverage_cells, field="geo_level"), - by_variable=_summarize_arch_cell_coverage(coverage_cells, field="variable"), - cells=coverage_cells, - ) - - -def summarize_arch_target_gap_queue( - provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - *, - period: int, - profile_name: str = "pe_native_broad", - include_covered: bool = False, - target_cells: tuple[PolicyEngineUSTargetCell | dict[str, Any], ...] | None = None, - sources: tuple[str, ...] = (), - jurisdiction: str | None = None, - compose_model_year_targets: bool | None = None, - age_soi_targets: bool | None = None, - entity_overrides: dict[str, Any] | None = None, - provider_filters: dict[str, Any] | None = None, -) -> ArchTargetGapQueueReport: - """Build an agent-facing queue of Arch target records to add or review.""" - - coverage = summarize_arch_target_profile_coverage( - provider, - period=period, - profile_name=profile_name, - target_cells=target_cells, - sources=sources, - jurisdiction=jurisdiction, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - entity_overrides=entity_overrides, - provider_filters=provider_filters, - ) - catalog = _arch_gap_loaded_variable_catalog( - provider, - period=period, - jurisdiction=jurisdiction, - sources=sources, - compose_model_year_targets=compose_model_year_targets, - age_soi_targets=age_soi_targets, - ) - variable_uncovered_counts = { - variable: counts["uncovered_cell_count"] - for variable, counts in coverage.by_variable.items() - } - rows = [ - _arch_gap_queue_row_for_coverage_cell( - coverage_cell, - profile_name=profile_name, - period=period, - 
loaded_variable_catalog=catalog, - variable_uncovered_count=variable_uncovered_counts.get( - str(coverage_cell.cell.get("variable") or ""), - 0, - ), - ) - for coverage_cell in coverage.cells - if include_covered or not coverage_cell.covered - ] - rows = [ - replace(row, priority=priority) - for priority, row in enumerate( - sorted(rows, key=_arch_gap_queue_sort_key), - start=1, - ) - ] - by_loader_status: dict[str, int] = {} - by_gap_category: dict[str, int] = {} - for row in rows: - by_loader_status[row.loader_status] = ( - by_loader_status.get(row.loader_status, 0) + 1 - ) - by_gap_category[row.gap_category] = by_gap_category.get(row.gap_category, 0) + 1 - covered_row_count = sum(1 for row in rows if row.covered) - return ArchTargetGapQueueReport( - profile_name=profile_name, - period=int(period), - row_count=len(rows), - covered_row_count=covered_row_count, - uncovered_row_count=len(rows) - covered_row_count, - by_loader_status=dict(sorted(by_loader_status.items())), - by_gap_category=dict(sorted(by_gap_category.items())), - rows=tuple(rows), - ) - - -def summarize_arch_target_parity( - incumbent_provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - candidate_provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - *, - period: int, - sources: tuple[str, ...] = (), - variables: tuple[str, ...] 
= (), - value_abs_tolerance: float = 1e-6, - value_rel_tolerance: float = 1e-12, -) -> ArchTargetParityReport: - """Compare canonical Microplex targets loaded from two Arch artifacts.""" - provider_filters: dict[str, Any] = {} - if sources: - provider_filters["sources"] = tuple(sources) - if variables: - provider_filters["variables"] = tuple(variables) - - query = TargetQuery(period=period, provider_filters=provider_filters) - incumbent_targets = list(incumbent_provider.load_target_set(query).targets) - candidate_targets = list(candidate_provider.load_target_set(query).targets) - rows = _arch_target_parity_rows( - incumbent_targets, - candidate_targets, - value_abs_tolerance=value_abs_tolerance, - value_rel_tolerance=value_rel_tolerance, - ) - errors = tuple( - _arch_target_parity_error(row) for row in rows if row.status != "matched" - ) - counts = { - "incumbent_target_count": len(incumbent_targets), - "candidate_target_count": len(candidate_targets), - "matched_count": sum(1 for row in rows if row.status == "matched"), - "value_mismatch_count": sum( - 1 for row in rows if row.status == "value_mismatch" - ), - "incumbent_only_count": sum( - 1 for row in rows if row.status == "incumbent_only" - ), - "candidate_only_count": sum( - 1 for row in rows if row.status == "candidate_only" - ), - "duplicate_identity_count": sum( - 1 for row in rows if row.status == "duplicate_identity" - ), - } - return ArchTargetParityReport( - period=int(period), - incumbent_artifacts=_arch_provider_artifacts(incumbent_provider), - candidate_artifacts=_arch_provider_artifacts(candidate_provider), - value_abs_tolerance=value_abs_tolerance, - value_rel_tolerance=value_rel_tolerance, - counts=counts, - rows=rows, - errors=errors, - ) - - -def main_coverage(argv: list[str] | None = None) -> int: - """CLI entrypoint for Arch target-profile coverage JSON.""" - import argparse - import json - import sys - - parser = argparse.ArgumentParser( - description="Summarize Arch target DB coverage for a 
Microplex target profile." - ) - parser.add_argument( - "--arch-targets-db", - required=True, - action="append", - help=( - "Arch targets SQLite DB path or consumer-fact JSONL path. May be " - "supplied multiple times to combine source-package artifacts." - ), - ) - parser.add_argument("--period", type=int, required=True) - parser.add_argument("--profile", default="pe_native_broad") - parser.add_argument("--jurisdiction", default="us") - parser.add_argument("--source", action="append", dest="sources", default=[]) - parser.add_argument( - "--no-compose-model-year-targets", - action="store_false", - dest="compose_model_year_targets", - default=True, - ) - parser.add_argument( - "--no-age-soi-targets", - action="store_false", - dest="age_soi_targets", - default=True, - ) - parser.add_argument("--indent", type=int, default=2) - args = parser.parse_args(argv) - - provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths(args.arch_targets_db), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - report = summarize_arch_target_profile_coverage( - provider, - period=args.period, - profile_name=args.profile, - sources=tuple(args.sources), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - json.dump(report.to_dict(), sys.stdout, indent=args.indent, sort_keys=True) - sys.stdout.write("\n") - return 0 - - -def main_gaps(argv: list[str] | None = None) -> int: - """CLI entrypoint for Arch target-profile gap queue rows.""" - import argparse - import json - import sys - - parser = argparse.ArgumentParser( - description="Emit an agent-facing Arch target gap queue for a profile." - ) - parser.add_argument( - "--arch-targets-db", - required=True, - action="append", - help=( - "Arch targets SQLite DB path or consumer-fact JSONL path. 
May be " - "supplied multiple times to combine source-package artifacts." - ), - ) - parser.add_argument("--period", type=int, required=True) - parser.add_argument("--profile", default="pe_native_broad") - parser.add_argument("--jurisdiction", default="us") - parser.add_argument("--source", action="append", dest="sources", default=[]) - parser.add_argument("--include-covered", action="store_true") - parser.add_argument("--format", choices=["json", "csv"], default="json") - parser.add_argument("--output") - parser.add_argument( - "--no-compose-model-year-targets", - action="store_false", - dest="compose_model_year_targets", - default=True, - ) - parser.add_argument( - "--no-age-soi-targets", - action="store_false", - dest="age_soi_targets", - default=True, - ) - parser.add_argument("--indent", type=int, default=2) - args = parser.parse_args(argv) - - provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths(args.arch_targets_db), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - report = summarize_arch_target_gap_queue( - provider, - period=args.period, - profile_name=args.profile, - include_covered=args.include_covered, - sources=tuple(args.sources), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - if args.format == "csv": - output = _arch_target_gap_queue_csv(report) - else: - output = json.dumps(report.to_dict(), indent=args.indent, sort_keys=True) - output += "\n" - - if args.output: - Path(args.output).write_text(output) - else: - sys.stdout.write(output) - return 0 - - -def main_refresh(argv: list[str] | None = None) -> int: - """Refresh Arch target coverage and gap snapshots for a target profile.""" - import argparse - import json - import sys - - parser = argparse.ArgumentParser( - description=( - "Write Arch target-profile coverage, gap queue, and summary 
artifacts." - ) - ) - parser.add_argument( - "--arch-targets-db", - action="append", - default=[], - help=( - "Arch targets SQLite DB path or consumer-fact JSONL path. May be " - "supplied multiple times. If omitted, --artifact-root is searched." - ), - ) - parser.add_argument( - "--artifact-root", - action="append", - default=[], - help=( - "Directory or file to search for Arch target artifacts when " - "--arch-targets-db is omitted. May be supplied multiple times." - ), - ) - parser.add_argument("--period", type=int, required=True) - parser.add_argument("--profile", default="pe_native_broad") - parser.add_argument("--jurisdiction", default="us") - parser.add_argument("--source", action="append", dest="sources", default=[]) - parser.add_argument( - "--output-dir", - default="artifacts/arch-target-coverage", - help="Directory for coverage JSON, gap JSON/CSV, and markdown summary.", - ) - parser.add_argument( - "--no-compose-model-year-targets", - action="store_false", - dest="compose_model_year_targets", - default=True, - ) - parser.add_argument( - "--no-age-soi-targets", - action="store_false", - dest="age_soi_targets", - default=True, - ) - parser.add_argument("--indent", type=int, default=2) - args = parser.parse_args(argv) - - artifact_paths = tuple(Path(path) for path in args.arch_targets_db) - if not artifact_paths: - discovery_roots = ( - tuple(Path(path) for path in args.artifact_root) - or _default_arch_target_artifact_roots() - ) - artifact_paths = discover_arch_target_artifacts(discovery_roots) - if not artifact_paths: - roots = args.artifact_root or [ - str(path) for path in _default_arch_target_artifact_roots() - ] - raise FileNotFoundError( - "No Arch target artifacts found. 
Pass --arch-targets-db or place " - f"consumer_facts.jsonl / Arch targets DB files under: {', '.join(roots)}" - ) - - provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths([str(path) for path in artifact_paths]), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - coverage = summarize_arch_target_profile_coverage( - provider, - period=args.period, - profile_name=args.profile, - sources=tuple(args.sources), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - gaps = summarize_arch_target_gap_queue( - provider, - period=args.period, - profile_name=args.profile, - sources=tuple(args.sources), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - stem = f"{_filename_slug(args.profile)}_{int(args.period)}" - coverage_path = output_dir / f"{stem}_coverage.json" - gaps_json_path = output_dir / f"{stem}_gaps.json" - gaps_csv_path = output_dir / f"{stem}_gaps.csv" - summary_path = output_dir / f"{stem}_summary.md" - - coverage_path.write_text( - json.dumps(coverage.to_dict(), indent=args.indent, sort_keys=True) + "\n" - ) - gaps_json_path.write_text( - json.dumps(gaps.to_dict(), indent=args.indent, sort_keys=True) + "\n" - ) - gaps_csv_path.write_text(_arch_target_gap_queue_csv(gaps)) - summary_path.write_text( - _arch_target_refresh_summary_markdown( - coverage, - gaps, - artifact_paths=artifact_paths, - output_paths=( - coverage_path, - gaps_json_path, - gaps_csv_path, - summary_path, - ), - ) - ) - - json.dump( - { - "profile_name": coverage.profile_name, - "period": coverage.period, - "target_cell_count": coverage.target_cell_count, - "covered_cell_count": coverage.covered_cell_count, - 
"uncovered_cell_count": coverage.uncovered_cell_count, - "coverage_rate": coverage.coverage_rate, - "artifact_paths": [str(path) for path in artifact_paths], - "output_paths": { - "coverage": str(coverage_path), - "gaps_json": str(gaps_json_path), - "gaps_csv": str(gaps_csv_path), - "summary": str(summary_path), - }, - }, - sys.stdout, - indent=args.indent, - sort_keys=True, - ) - sys.stdout.write("\n") - return 0 - - -def main_parity(argv: list[str] | None = None) -> int: - """CLI entrypoint comparing incumbent and candidate Arch target artifacts.""" - import argparse - import json - import sys - - parser = argparse.ArgumentParser( - description=( - "Compare two Arch target artifacts after loading both through the " - "Microplex Arch provider." - ) - ) - parser.add_argument( - "--incumbent-arch-targets-db", - required=True, - action="append", - help=( - "Incumbent Arch targets SQLite DB path. May be supplied multiple " - "times to combine artifacts." - ), - ) - parser.add_argument( - "--candidate-arch-targets-db", - required=True, - action="append", - help=( - "Candidate Arch targets SQLite DB or consumer-fact JSONL path. May " - "be supplied multiple times to combine artifacts." 
- ), - ) - parser.add_argument("--period", type=int, required=True) - parser.add_argument("--jurisdiction", default="us") - parser.add_argument("--source", action="append", dest="sources", default=[]) - parser.add_argument("--variable", action="append", dest="variables", default=[]) - parser.add_argument("--value-abs-tolerance", type=float, default=1e-6) - parser.add_argument("--value-rel-tolerance", type=float, default=1e-12) - parser.add_argument("--row-limit", type=int, default=50) - parser.add_argument( - "--no-compose-model-year-targets", - action="store_false", - dest="compose_model_year_targets", - default=True, - ) - parser.add_argument( - "--no-age-soi-targets", - action="store_false", - dest="age_soi_targets", - default=True, - ) - parser.add_argument("--indent", type=int, default=2) - args = parser.parse_args(argv) - - try: - incumbent_provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths(args.incumbent_arch_targets_db), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - candidate_provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths(args.candidate_arch_targets_db), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - payload = summarize_arch_target_parity( - incumbent_provider, - candidate_provider, - period=args.period, - sources=tuple(args.sources), - variables=tuple(args.variables), - value_abs_tolerance=args.value_abs_tolerance, - value_rel_tolerance=args.value_rel_tolerance, - ).to_dict(row_limit=args.row_limit) - except Exception as exc: # noqa: BLE001 - CLI must return JSON on failures. 
- payload = { - "valid": False, - "period": args.period, - "incumbent_artifacts": list(args.incumbent_arch_targets_db), - "candidate_artifacts": list(args.candidate_arch_targets_db), - "counts": { - "incumbent_target_count": 0, - "candidate_target_count": 0, - "matched_count": 0, - "value_mismatch_count": 0, - "incumbent_only_count": 0, - "candidate_only_count": 0, - "duplicate_identity_count": 0, - }, - "row_count": 0, - "rows": [], - "errors": [{"code": "load_failed", "message": str(exc)}], - } - - json.dump(payload, sys.stdout, indent=args.indent, sort_keys=True) - sys.stdout.write("\n") - return 0 if payload["valid"] else 1 - - -def main_smoke(argv: list[str] | None = None) -> int: - """CLI entrypoint proving Arch artifacts load as Microplex targets.""" - import argparse - import json - import sys - - parser = argparse.ArgumentParser( - description=( - "Load an Arch target artifact, including consumer_facts.jsonl, " - "through the Microplex Arch provider and emit a JSON smoke report." - ) - ) - parser.add_argument( - "--arch-targets-db", - required=True, - action="append", - help=( - "Arch targets SQLite DB path or consumer-fact JSONL path. May be " - "supplied multiple times to combine source-package artifacts." 
- ), - ) - parser.add_argument("--period", type=int, required=True) - parser.add_argument("--jurisdiction", default="us") - parser.add_argument("--source", action="append", dest="sources", default=[]) - parser.add_argument("--variable", action="append", dest="variables", default=[]) - parser.add_argument("--expected-target-count", type=int) - parser.add_argument("--sample-limit", type=int, default=5) - parser.add_argument( - "--no-compose-model-year-targets", - action="store_false", - dest="compose_model_year_targets", - default=True, - ) - parser.add_argument( - "--no-age-soi-targets", - action="store_false", - dest="age_soi_targets", - default=True, - ) - parser.add_argument("--indent", type=int, default=2) - args = parser.parse_args(argv) - - errors: list[dict[str, str]] = [] - targets: list[CanonicalTargetSpec] = [] - try: - provider = resolve_arch_sqlite_target_provider( - _single_or_many_paths(args.arch_targets_db), - jurisdiction=args.jurisdiction, - compose_model_year_targets=args.compose_model_year_targets, - age_soi_targets=args.age_soi_targets, - ) - provider_filters: dict[str, Any] = {} - if args.sources: - provider_filters["sources"] = tuple(args.sources) - if args.variables: - provider_filters["variables"] = tuple(args.variables) - targets = list( - provider.load_target_set( - TargetQuery( - period=args.period, - provider_filters=provider_filters, - ) - ).targets - ) - except Exception as exc: # noqa: BLE001 - CLI must return JSON on failures. - errors.append({"code": "load_failed", "message": str(exc)}) - - if ( - args.expected_target_count is not None - and len(targets) != args.expected_target_count - ): - errors.append( - { - "code": "unexpected_target_count", - "message": ( - f"Expected {args.expected_target_count} targets, " - f"loaded {len(targets)}." 
- ), - } - ) - - payload = { - "valid": not errors, - "period": args.period, - "target_count": len(targets), - "by_variable": dict( - sorted(Counter(_target_variable(target) for target in targets).items()) - ), - "by_source": dict( - sorted(Counter(str(target.source) for target in targets).items()) - ), - "by_aggregation": dict( - sorted( - Counter( - str(getattr(target.aggregation, "value", target.aggregation)) - for target in targets - ).items() - ) - ), - "by_filter_count": { - str(key): value - for key, value in sorted( - Counter(len(target.filters) for target in targets).items() - ) - }, - "sample_targets": [ - _target_smoke_sample(target) - for target in targets[: max(0, args.sample_limit)] - ], - "errors": errors, - } - json.dump(payload, sys.stdout, indent=args.indent, sort_keys=True) - sys.stdout.write("\n") - return 0 if payload["valid"] else 1 - - -def _target_variable(target: CanonicalTargetSpec) -> str: - """Return the Microplex variable represented by a canonical target.""" - variable = target.metadata.get("variable") if target.metadata else None - return str(variable or target.measure or target.name) - - -def _target_smoke_sample(target: CanonicalTargetSpec) -> dict[str, Any]: - """Return a compact JSON sample for an Arch target smoke report.""" - return { - "name": target.name, - "variable": _target_variable(target), - "aggregation": str(getattr(target.aggregation, "value", target.aggregation)), - "measure": target.measure, - "value": target.value, - "period": target.period, - "source": str(target.source), - "filters": [ - { - "feature": target_filter.feature, - "operator": str( - getattr(target_filter.operator, "value", target_filter.operator) - ), - "value": target_filter.value, - } - for target_filter in target.filters - ], - "metadata": { - key: target.metadata[key] - for key in ( - "arch_aggregate_fact_key", - "arch_semantic_fact_key", - "arch_source_record_id", - "geo_level", - ) - if key in target.metadata - }, - } - - -def 
_arch_target_parity_rows( - incumbent_targets: list[CanonicalTargetSpec], - candidate_targets: list[CanonicalTargetSpec], - *, - value_abs_tolerance: float, - value_rel_tolerance: float, -) -> tuple[ArchTargetParityRow, ...]: - incumbent_by_identity = _index_arch_targets_by_parity_identity(incumbent_targets) - candidate_by_identity = _index_arch_targets_by_parity_identity(candidate_targets) - rows: list[ArchTargetParityRow] = [] - for identity in sorted( - set(incumbent_by_identity) | set(candidate_by_identity), - key=str, - ): - incumbent_group = tuple(incumbent_by_identity.get(identity, ())) - candidate_group = tuple(candidate_by_identity.get(identity, ())) - absolute_delta: float | None = None - relative_delta: float | None = None - if len(incumbent_group) != 1 or len(candidate_group) != 1: - status = _arch_target_parity_nonunique_status( - incumbent_group, - candidate_group, - ) - else: - incumbent_value = float(incumbent_group[0].value) - candidate_value = float(candidate_group[0].value) - absolute_delta = candidate_value - incumbent_value - relative_delta = ( - absolute_delta / incumbent_value if incumbent_value != 0 else None - ) - status = ( - "matched" - if _arch_target_values_match( - incumbent_value, - candidate_value, - abs_tolerance=value_abs_tolerance, - rel_tolerance=value_rel_tolerance, - ) - else "value_mismatch" - ) - rows.append( - ArchTargetParityRow( - status=status, - identity=identity, - incumbent_targets=incumbent_group, - candidate_targets=candidate_group, - absolute_delta=absolute_delta, - relative_delta=relative_delta, - ) - ) - return tuple(sorted(rows, key=_arch_target_parity_row_sort_key)) - - -def _index_arch_targets_by_parity_identity( - targets: list[CanonicalTargetSpec], -) -> dict[tuple[Any, ...], list[CanonicalTargetSpec]]: - indexed: dict[tuple[Any, ...], list[CanonicalTargetSpec]] = {} - for target in targets: - indexed.setdefault(_arch_target_parity_identity(target), []).append(target) - return indexed - - -def 
_arch_target_parity_identity(target: CanonicalTargetSpec) -> tuple[Any, ...]: - metadata = target.metadata or {} - return ( - str(getattr(target.entity, "value", target.entity)), - str(getattr(target.aggregation, "value", target.aggregation)), - str(target.measure or ""), - _arch_target_period_value(target.period), - str(target.source or ""), - _target_variable(target), - str(metadata.get("geo_level") or ""), - str(_arch_target_geographic_id(target) or ""), - _target_parity_filter_tuple(target), - ) - - -def _arch_target_period_value(value: int | str) -> int | str: - try: - return int(value) - except (TypeError, ValueError): - return str(value) - - -def _target_parity_filter_tuple( - target: CanonicalTargetSpec, -) -> tuple[tuple[str, str, str], ...]: - return tuple( - sorted( - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - _json_scalar_text(target_filter.value), - ) - for target_filter in target.filters - ) - ) - - -def _arch_target_parity_nonunique_status( - incumbent_targets: tuple[CanonicalTargetSpec, ...], - candidate_targets: tuple[CanonicalTargetSpec, ...], -) -> str: - if len(incumbent_targets) > 1 or len(candidate_targets) > 1: - return "duplicate_identity" - if not incumbent_targets: - return "candidate_only" - if not candidate_targets: - return "incumbent_only" - return "duplicate_identity" - - -def _arch_target_values_match( - incumbent_value: float, - candidate_value: float, - *, - abs_tolerance: float, - rel_tolerance: float, -) -> bool: - delta = abs(candidate_value - incumbent_value) - if delta <= abs_tolerance: - return True - scale = max(abs(incumbent_value), abs(candidate_value), 1.0) - return delta <= rel_tolerance * scale - - -def _arch_target_parity_row_sort_key(row: ArchTargetParityRow) -> tuple[int, str]: - status_rank = { - "value_mismatch": 0, - "incumbent_only": 1, - "candidate_only": 2, - "duplicate_identity": 3, - "matched": 4, - } - return ( - status_rank.get(row.status, 99), 
- json.dumps(_arch_target_parity_identity_dict(row.identity), sort_keys=True), - ) - - -def _arch_target_parity_error(row: ArchTargetParityRow) -> dict[str, Any]: - identity = _arch_target_parity_identity_dict(row.identity) - if row.status == "value_mismatch": - return { - "code": "value_mismatch", - "message": "Candidate target value differs from incumbent target value.", - "identity": identity, - "incumbent_value": row.incumbent_targets[0].value, - "candidate_value": row.candidate_targets[0].value, - "absolute_delta": row.absolute_delta, - "relative_delta": row.relative_delta, - } - if row.status == "incumbent_only": - return { - "code": "missing_candidate_target", - "message": "Incumbent target identity is absent from the candidate artifact.", - "identity": identity, - "incumbent_target_count": len(row.incumbent_targets), - "candidate_target_count": len(row.candidate_targets), - } - if row.status == "candidate_only": - return { - "code": "unexpected_candidate_target", - "message": "Candidate target identity is absent from the incumbent artifact.", - "identity": identity, - "incumbent_target_count": len(row.incumbent_targets), - "candidate_target_count": len(row.candidate_targets), - } - return { - "code": "duplicate_identity", - "message": "A target identity is not unique in one or both artifacts.", - "identity": identity, - "incumbent_target_count": len(row.incumbent_targets), - "candidate_target_count": len(row.candidate_targets), - } - - -def _arch_target_parity_identity_dict(identity: tuple[Any, ...]) -> dict[str, Any]: - ( - entity, - aggregation, - measure, - period, - source, - variable, - geo_level, - geographic_id, - filters, - ) = identity - return { - "entity": entity, - "aggregation": aggregation, - "measure": measure or None, - "period": period, - "source": source or None, - "variable": variable, - "geo_level": geo_level or None, - "geographic_id": geographic_id or None, - "filters": [ - {"feature": feature, "operator": operator, "value": value} - 
for feature, operator, value in filters - ], - } - - -def _target_parity_sample(target: CanonicalTargetSpec) -> dict[str, Any]: - sample = _target_smoke_sample(target) - metadata = dict(sample["metadata"]) - for key in ( - "target_id", - "source_table", - "display_label", - "arch_source_period", - "arch_model_period", - ): - if key in target.metadata: - metadata[key] = target.metadata[key] - sample["metadata"] = metadata - return sample - - -def _arch_provider_artifacts( - provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), -) -> tuple[str, ...]: - if isinstance(provider, ArchCompositeSQLiteTargetProvider): - return tuple(str(path) for path in provider.db_paths) - path = getattr(provider, "db_path", None) or getattr(provider, "path", None) - if path is None: - return () - return (str(path),) - - -def arch_target_record_to_canonical_spec( - record: ArchTargetRecord, - *, - entity_overrides: dict[str, Any] | None = None, -) -> CanonicalTargetSpec | None: - """Translate one Arch target record into a canonical core target spec.""" - if record.target_type == "RATE": - return None - if _should_skip_arch_target_record(record): - return None - - filters = list(_canonical_filters_for_arch_constraints(record.constraints)) - geography_filter = _target_filter_for_arch_geography(record) - if geography_filter is not None: - filters.append(geography_filter) - entity_overrides = entity_overrides or {} - source_variable = record.variable - model_variable: str - aggregation: TargetAggregation - measure: str | None - entity: EntityType - - if record.target_type == "COUNT": - count_mapping = ARCH_COUNT_VARIABLE_ALIASES.get(source_variable) - positive_measure = _positive_measure_for_count_record(source_variable) - if count_mapping is not None: - model_variable, entity, count_filter_measure = count_mapping - if count_filter_measure is not None: - filters.append( - TargetFilter( - 
feature=count_filter_measure, - operator=">", - value=0, - ) - ) - elif positive_measure is not None: - model_variable = "tax_unit_count" - entity = EntityType.TAX_UNIT - filters.append( - TargetFilter(feature=positive_measure, operator=">", value=0) - ) - else: - return None - aggregation = TargetAggregation.COUNT - measure = None - elif record.target_type == "AMOUNT": - model_variable = ARCH_AMOUNT_VARIABLE_ALIASES.get( - source_variable, source_variable - ) - aggregation = TargetAggregation.SUM - measure = model_variable - if _is_blocked_self_employment_binding(record, model_variable): - raise ValueError( - "Broad Arch business/proprietors income cannot be exposed as " - "plain self_employment_income; use a dedicated proprietors-income " - "target or an explicit composite mapping." - ) - entity = _entity_for_measure(model_variable, entity_overrides) - if model_variable in ARCH_POSITIVE_AMOUNT_FILTER_VARIABLES: - filters.append( - TargetFilter( - feature=model_variable, - operator=">", - value=0, - ) - ) - else: - return None - - if source_variable in ARCH_SSA_PAYMENT_TYPE_AMOUNT_VARIABLES: - filters = [ - target_filter - for target_filter in filters - if str(target_filter.feature) != "program_payment_type" - ] - filters = list(_dedupe_target_filters(filters)) - display_label = _arch_target_display_label(record) - metadata = { - "target_id": record.target_id, - "stratum_id": record.stratum_id, - "display_label": display_label, - "variable": model_variable, - "model_variable_role": policyengine_us_variable_role(model_variable).value, - "arch_variable": record.variable, - "arch_target_type": record.target_type, - "target_semantic": record.target_type.lower(), - "source": record.source, - "source_table": record.source_table, - "source_url": record.source_url, - "notes": record.notes, - "stratum_name": record.stratum_name, - "jurisdiction": record.jurisdiction, - "geo_level": _arch_record_geo_level(record), - "geographic_level": record.geographic_level, - 
"geography_id": record.geography_id, - "constraint_count": len(filters), - "arch_source_period": record.source_period or record.period, - "arch_model_period": record.period, - } - if record.aggregate_fact_key is not None: - metadata["arch_aggregate_fact_key"] = record.aggregate_fact_key - if record.semantic_fact_key is not None: - metadata["arch_semantic_fact_key"] = record.semantic_fact_key - if record.source_record_id is not None: - metadata["arch_source_record_id"] = record.source_record_id - if record.source_cell_keys: - metadata["arch_source_cell_keys"] = list(record.source_cell_keys) - if record.source_row_keys: - metadata["arch_source_row_keys"] = list(record.source_row_keys) - if record.unit is not None: - metadata["unit"] = record.unit - if record.concept is not None: - metadata["arch_concept"] = record.concept - if record.source_concept is not None: - metadata["arch_source_concept"] = record.source_concept - if record.concept_relation is not None: - metadata["arch_concept_relation"] = record.concept_relation - if record.concept_authority is not None: - metadata["arch_concept_authority"] = record.concept_authority - if record.concept_evidence_url is not None: - metadata["arch_concept_evidence_url"] = record.concept_evidence_url - if record.concept_evidence_notes is not None: - metadata["arch_concept_evidence_notes"] = record.concept_evidence_notes - if record.legal_vintage is not None: - metadata["arch_legal_vintage"] = record.legal_vintage - if record.source_db_path is not None: - metadata["arch_source_db_path"] = record.source_db_path - if record.source_db_index is not None: - metadata["arch_source_db_index"] = record.source_db_index - if record.source_target_id is not None: - metadata["arch_source_target_id"] = record.source_target_id - if record.source_stratum_id is not None: - metadata["arch_source_stratum_id"] = record.source_stratum_id - if record.aging_factors is not None: - factors = record.aging_factors - metadata.update( - { - "arch_aged": True, 
- "arch_aging_source_year": factors.source_year, - "arch_aging_target_year": factors.target_year, - "arch_aging_count_factor": factors.count_factor, - "arch_aging_amount_factor": factors.amount_factor, - "arch_aging_count_method": factors.count_method, - "arch_aging_amount_method": factors.amount_method, - } - ) - - return CanonicalTargetSpec( - name=f"arch_target_{record.target_id}", - entity=entity, - value=record.value, - period=record.period, - measure=measure, - aggregation=aggregation, - filters=tuple(filters), - source=record.source, - description=display_label, - metadata=metadata, - ) - - -def _should_skip_arch_target_record(record: ArchTargetRecord) -> bool: - return ( - record.variable in ARCH_UNSUPPORTED_RATIO_OR_COMPONENT_VARIABLES - or _is_bea_regional_country_record(record) - ) - - -def _is_blocked_self_employment_binding( - record: ArchTargetRecord, - model_variable: str, -) -> bool: - if model_variable != "self_employment_income": - return False - markers = { - str(value) - for value in ( - record.variable, - record.concept, - record.source_concept, - record.source_record_id, - ) - if value is not None - } - markers.update( - f"{variable}:{value}" - for variable, _, value in record.constraints - if value is not None - ) - return bool(markers & ARCH_BROAD_BUSINESS_INCOME_SELF_EMPLOYMENT_BLOCKLIST) - - -def _is_bea_regional_country_record(record: ArchTargetRecord) -> bool: - if not _has_bea_regional_lineage(record): - return False - if str(record.geography_id) == "0100000US": - return True - return _arch_record_geo_level(record) in {"national", "country"} - - -def _has_bea_regional_lineage(record: ArchTargetRecord) -> bool: - lineage_values = ( - record.concept, - record.source_concept, - record.source_record_id, - ) - return any( - str(value).startswith("bea_regional.") - or str(value).startswith("bea-regional.") - or ".bea-regional-" in str(value) - for value in lineage_values - if value is not None - ) - - -def _group_arch_target_rows(rows: 
list[sqlite3.Row]) -> list[ArchTargetRecord]: - grouped: dict[int, dict[str, Any]] = {} - for row in rows: - target_id = int(row["target_id"]) - item = grouped.setdefault( - target_id, - { - "target_id": target_id, - "stratum_id": int(row["stratum_id"]), - "variable": row["variable"], - "period": int(row["period"]), - "value": float(row["value"]), - "target_type": str(row["target_type"]), - "geographic_level": row["geographic_level"], - "geography_id": None, - "source": row["source"], - "source_table": row["source_table"], - "source_url": row["source_url"], - "notes": row["notes"], - "stratum_name": row["stratum_name"], - "jurisdiction": row["jurisdiction"], - "constraints": [], - }, - ) - if row["constraint_variable"] is not None: - constraint = ( - str(row["constraint_variable"]), - str(row["constraint_operator"]), - str(row["constraint_value"]), - ) - if constraint not in item["constraints"]: - item["constraints"].append(constraint) - return [ - ArchTargetRecord( - **{ - **item, - "constraints": tuple(item["constraints"]), - } - ) - for item in grouped.values() - ] - - -def _load_arch_fact_lineage( - conn: sqlite3.Connection, -) -> dict[str, dict[str, tuple[str, ...]]]: - lineage: dict[str, dict[str, tuple[str, ...]]] = {} - if _sqlite_table_exists(conn, "fact_source_cells"): - for row in conn.execute( - """ - SELECT fact_key, source_cell_key - FROM fact_source_cells - ORDER BY fact_key, ordinal - """ - ): - fact_key = str(row["fact_key"]) - item = lineage.setdefault(fact_key, {}) - item["source_cell_keys"] = ( - *item.get("source_cell_keys", ()), - str(row["source_cell_key"]), - ) - if _sqlite_table_exists(conn, "fact_source_rows"): - for row in conn.execute( - """ - SELECT fact_key, source_row_key - FROM fact_source_rows - ORDER BY fact_key, ordinal - """ - ): - fact_key = str(row["fact_key"]) - item = lineage.setdefault(fact_key, {}) - item["source_row_keys"] = ( - *item.get("source_row_keys", ()), - str(row["source_row_key"]), - ) - return lineage - - -def 
_consumer_fact_rows_to_records( - rows: list[dict[str, Any]], -) -> list[ArchTargetRecord]: - records: list[ArchTargetRecord] = [] - stratum_ids: dict[tuple[tuple[str, str, str], ...], int] = {} - for target_id, row in enumerate(rows, start=1): - concept = arch_consumer_fact_concept(row) - if concept is None: - continue - if _should_skip_arch_fact_concept(concept): - continue - target_identity = _arch_consumer_fact_target_identity(row) - if target_identity is None: - continue - domain_constraints = _arch_consumer_fact_domain_constraints(row) - if domain_constraints is None: - continue - constraints = tuple( - dict.fromkeys( - constraint - for constraint in ( - *domain_constraints, - *( - _arch_consumer_fact_constraint(constraint) - for constraint in _consumer_fact_universe_constraints(row).get( - "constraints", [] - ) - ), - ) - if constraint is not None - ) - ) - stratum_id = stratum_ids.setdefault(constraints, len(stratum_ids) + 1) - variable, target_type = target_identity - source = row.get("source") or {} - observed_measure = row.get("observed_measure") or {} - geography = row.get("geography") or {} - lineage = row.get("lineage") or {} - concept_alignment = row.get("concept_alignment") or {} - source_name = ( - source.get("source_name") or observed_measure.get("source_name") or "arch" - ) - records.append( - ArchTargetRecord( - target_id=target_id, - stratum_id=stratum_id, - variable=variable, - period=_consumer_fact_period(row), - value=_json_numeric_value(row.get("value")), - target_type=target_type, - geographic_level=_arch_consumer_fact_geographic_level(row), - geography_id=geography.get("id"), - source=_normalize_arch_source(str(source_name)), - source_table=source.get("source_table") - or observed_measure.get("source_table"), - source_url=source.get("url"), - notes=source.get("method_notes"), - stratum_name=_arch_consumer_fact_stratum_name(row), - jurisdiction="US", - constraints=constraints, - aggregate_fact_key=row.get("aggregate_fact_key"), - 
semantic_fact_key=row.get("semantic_fact_key"), - source_record_id=arch_consumer_fact_source_record_id(row), - source_cell_keys=tuple(lineage.get("source_cell_keys") or ()), - source_row_keys=tuple(lineage.get("source_row_keys") or ()), - unit=observed_measure.get("unit"), - concept=_arch_consumer_fact_concept(row), - source_concept=concept_alignment.get("source_concept") - or observed_measure.get("source_concept"), - concept_relation=concept_alignment.get("relation"), - concept_authority=concept_alignment.get("authority"), - concept_evidence_url=concept_alignment.get("evidence_url"), - concept_evidence_notes=concept_alignment.get("evidence_notes"), - legal_vintage=concept_alignment.get("legal_vintage"), - ) - ) - return records - - -def _consumer_fact_period(row: dict[str, Any]) -> int: - return arch_consumer_fact_period(row) - - -def _arch_consumer_fact_target_identity( - row: dict[str, Any], -) -> tuple[str, str] | None: - concept = _arch_consumer_fact_concept(row) - if concept in ARCH_FACT_CONCEPTS_TO_SKIP: - return None - if concept == "ssa.annual_oasdi_or_ssi_payment_amount": - return (_ssa_payment_variable_from_consumer_fact(row), "AMOUNT") - return ARCH_FACT_CONCEPT_TO_TARGET.get(concept) - - -def _ssa_payment_variable_from_consumer_fact(row: dict[str, Any]) -> str: - for constraint in _consumer_fact_universe_constraints(row).get("constraints", []): - if ( - constraint.get("variable") - == "us_social_security_and_ssi.program_payment_type" - ): - return str(constraint.get("value")) - raise ValueError("SSA payment fact row has no program payment type constraint.") - - -def _arch_consumer_fact_concept(row: dict[str, Any]) -> str: - concept = arch_consumer_fact_concept(row) - if concept is None: - raise ValueError("Arch consumer fact row has no mappable concept.") - return concept - - -def _arch_consumer_fact_domain_constraints( - row: dict[str, Any], -) -> tuple[tuple[str, str, str], ...] 
| None: - domain = str(_consumer_fact_universe_constraints(row).get("domain")) - return _arch_fact_domain_constraints_for_domain(domain) - - -def _arch_consumer_fact_constraint( - constraint: dict[str, Any], -) -> tuple[str, str, str] | None: - variable = str(constraint["variable"]) - if variable in ARCH_IGNORED_FACT_CONSTRAINT_VARIABLES: - return None - value = _json_scalar_text(constraint.get("value")) - if variable == "sex": - return _arch_sex_constraint(str(constraint["operator"]), value) - try: - mapped_variable = ARCH_FACT_CONSTRAINT_VARIABLE_ALIASES[variable] - except KeyError as exc: - raise ValueError( - f"No Microplex Arch consumer fact constraint mapping for variable {variable!r}" - ) from exc - return ( - mapped_variable, - str(constraint["operator"]), - value, - ) - - -def _consumer_fact_universe_constraints(row: dict[str, Any]) -> dict[str, Any]: - universe_constraints = row.get("universe_constraints") or {} - if not isinstance(universe_constraints, dict): - raise ValueError("Arch consumer fact universe_constraints must be an object.") - return universe_constraints - - -def _arch_consumer_fact_geographic_level(row: dict[str, Any]) -> str | None: - geography = row.get("geography") or {} - return _arch_geographic_level_from_arch_level(geography.get("level")) - - -def _arch_consumer_fact_stratum_name(row: dict[str, Any]) -> str: - dimensions = row.get("dimensions") or {} - income_range = dimensions.get("income_range") - geography_name = _arch_consumer_fact_geography_name(row) - if income_range == "all": - return f"{geography_name} All Filers" - if income_range: - return f"{geography_name} Filers AGI {income_range}" - return str(row.get("label") or geography_name) - - -def _arch_consumer_fact_geography_name(row: dict[str, Any]) -> str: - geography = row.get("geography") or {} - level = str(geography.get("level") or "").lower() - if level == "country": - return "US" - return str(geography.get("name") or geography.get("id") or "US") - - -def 
_group_arch_fact_rows( - rows: list[sqlite3.Row], - *, - lineage: dict[str, dict[str, tuple[str, ...]]], -) -> list[ArchTargetRecord]: - grouped: dict[str, dict[str, Any]] = {} - stratum_ids: dict[tuple[tuple[str, str, str], ...], int] = {} - for row in rows: - if _should_skip_arch_fact_concept(str(row["measure_concept"])): - continue - target_identity = _arch_fact_target_identity(row) - if target_identity is None: - continue - domain_constraints = _arch_fact_domain_constraints(row) - if domain_constraints is None: - continue - fact_key = str(row["fact_key"]) - item = grouped.setdefault( - fact_key, - { - "row": row, - "target_identity": target_identity, - "constraints": list(domain_constraints), - }, - ) - if row["constraint_variable"] is not None: - constraint = _arch_fact_constraint(row) - if constraint is not None: - item["constraints"].append(constraint) - - records: list[ArchTargetRecord] = [] - for target_id, (fact_key, item) in enumerate(sorted(grouped.items()), start=1): - row = item["row"] - constraints = tuple(dict.fromkeys(item["constraints"])) - stratum_id = stratum_ids.setdefault(constraints, len(stratum_ids) + 1) - variable, target_type = item["target_identity"] - period = int(row["period_value"]) - source_name = row["source_name"] or "arch" - fact_lineage = lineage.get(fact_key, {}) - records.append( - ArchTargetRecord( - target_id=target_id, - stratum_id=stratum_id, - variable=variable, - period=period, - value=_arch_fact_numeric_value(row), - target_type=target_type, - geographic_level=_arch_fact_geographic_level(row), - geography_id=row["geography_id"], - source=_normalize_arch_source(source_name), - source_table=row["source_table"], - source_url=row["source_url"], - notes=row["source_method_notes"], - stratum_name=_arch_fact_stratum_name(row), - jurisdiction="US", - constraints=constraints, - aggregate_fact_key=fact_key, - semantic_fact_key=_arch_fact_semantic_key(row, constraints), - source_record_id=row["source_record_id"], - 
source_cell_keys=fact_lineage.get("source_cell_keys", ()), - source_row_keys=fact_lineage.get("source_row_keys", ()), - unit=row["measure_unit"], - concept=row["measure_concept"], - source_concept=row["measure_source_concept"], - concept_relation=row["measure_concept_relation"], - concept_authority=row["measure_concept_authority"], - concept_evidence_url=row["measure_concept_evidence_url"], - concept_evidence_notes=row["measure_concept_evidence_notes"], - legal_vintage=row["measure_legal_vintage"], - ) - ) - return records - - -def _should_skip_arch_fact_concept(concept: str) -> bool: - return concept in ARCH_SKIPPED_FACT_CONCEPTS - - -def _arch_fact_target_identity(row: sqlite3.Row) -> tuple[str, str] | None: - concept = str(row["measure_concept"]) - if concept in ARCH_FACT_CONCEPTS_TO_SKIP: - return None - return ARCH_FACT_CONCEPT_TO_TARGET.get(concept) - - -def _arch_fact_domain_constraints( - row: sqlite3.Row, -) -> tuple[tuple[str, str, str], ...] | None: - domain = str(row["domain"]) - return _arch_fact_domain_constraints_for_domain(domain) - - -def _arch_fact_domain_constraints_for_domain( - domain: str, -) -> tuple[tuple[str, str, str], ...] 
| None: - return ARCH_FACT_DOMAIN_CONSTRAINTS.get(domain) - - -def _arch_fact_constraint(row: sqlite3.Row) -> tuple[str, str, str] | None: - variable = str(row["constraint_variable"]) - if variable in ARCH_IGNORED_FACT_CONSTRAINT_VARIABLES: - return None - value = _sqlite_json_scalar_text( - row["constraint_value_text"], - row["constraint_value_numeric"], - row["constraint_value_json"], - ) - if variable == "sex": - return _arch_sex_constraint(str(row["constraint_operator"]), value) - try: - mapped_variable = ARCH_FACT_CONSTRAINT_VARIABLE_ALIASES[variable] - except KeyError as exc: - raise ValueError( - f"No Microplex Arch fact constraint mapping for variable {variable!r}" - ) from exc - return ( - mapped_variable, - str(row["constraint_operator"]), - value, - ) - - -def _arch_sex_constraint(operator: str, value: str) -> tuple[str, str, str]: - canonical_operator = _canonical_arch_constraint_operator(operator) - value_text = str(value).strip().lower() - if value_text in {"female", "f", "2", "2.0"}: - is_female_value = "1" - elif value_text in {"male", "m", "1", "1.0"}: - is_female_value = "0" - else: - raise ValueError(f"No Microplex Arch sex constraint mapping for value {value!r}") - if canonical_operator == "==": - return ("is_female", "==", is_female_value) - if canonical_operator == "!=": - return ("is_female", "==", "0" if is_female_value == "1" else "1") - raise ValueError( - f"No Microplex Arch sex constraint mapping for operator {operator!r}" - ) - - -def _arch_fact_numeric_value(row: sqlite3.Row) -> float: - numeric = row["value_numeric"] - if numeric is not None: - return float(numeric) - return float(_sqlite_json_scalar_text(row["value_text"], None, row["value_json"])) - - -def _sqlite_json_scalar_text( - text_value: Any, - numeric_value: Any, - json_value: Any, -) -> str: - if text_value is not None: - return str(text_value) - if numeric_value is not None: - numeric = float(numeric_value) - return str(int(numeric)) if numeric.is_integer() else 
str(numeric) - return str(json_value) - - -def _arch_fact_geographic_level(row: sqlite3.Row) -> str | None: - return _arch_geographic_level_from_arch_level(row["geography_level"]) - - -def _arch_geographic_level_from_arch_level(level_value: Any) -> str | None: - level = str(level_value or "").lower() - if level == "country": - return "NATIONAL" - if level == "state": - return "STATE" - if level == "county": - return "COUNTY" - if level in {"congressional_district", "congressional-district"}: - return "CONGRESSIONAL_DISTRICT" - if level in { - "state_legislative_district_upper", - "state-legislative-district-upper", - }: - return "STATE_LEGISLATIVE_DISTRICT_UPPER" - if level in { - "state_legislative_district_lower", - "state-legislative-district-lower", - }: - return "STATE_LEGISLATIVE_DISTRICT_LOWER" - return level.upper() if level else None - - -def _json_numeric_value(value: Any) -> float: - return arch_consumer_fact_numeric_value(value) - - -def _json_scalar_text(value: Any) -> str: - if isinstance(value, float) and value.is_integer(): - return str(int(value)) - if isinstance(value, (int, float, str)): - return str(value) - return json.dumps(value, sort_keys=True) - - -def _arch_fact_stratum_name(row: sqlite3.Row) -> str: - income_range = _json_object_value(row["filters_json"], "income_range") - geography_name = row["geography_name"] or "US" - if income_range is None: - return str(geography_name) - if income_range == "all": - return f"{geography_name} All Filers" - return f"{geography_name} Filers AGI {income_range}" - - -def _arch_fact_semantic_key( - row: sqlite3.Row, - constraints: tuple[tuple[str, str, str], ...], -) -> str: - constraint_key = ",".join( - f"{variable}{operator}{value}" for variable, operator, value in constraints - ) - return "|".join( - [ - "arch.semantic_fact.v1", - str(row["measure_concept"]), - str(row["domain"]), - f"{row['period_value']}", - f"{row['geography_level']}:{row['geography_id']}", - constraint_key, - ] - ) - - -def 
_json_object_value(raw: Any, key: str) -> Any: - if raw is None: - return None - import json - - try: - payload = json.loads(str(raw)) - except json.JSONDecodeError: - return None - if not isinstance(payload, dict): - return None - return payload.get(key) - - -def _arch_target_display_label(record: ArchTargetRecord) -> str: - measure_label = _arch_target_measure_label(record) - scope_label = _arch_target_scope_label(record) - source_label = _humanize_arch_source(record.source) - suffix = ( - f" ({source_label}, {record.period})" if source_label else f" ({record.period})" - ) - if scope_label: - return f"{measure_label} for {scope_label}{suffix}" - return f"{measure_label}{suffix}" - - -def _arch_target_measure_label(record: ArchTargetRecord) -> str: - source_variable = str(record.variable) - override = ARCH_VARIABLE_LABEL_OVERRIDES.get(source_variable) - if override is not None: - return override - if record.target_type == "COUNT": - for suffix in ("_returns", "_claims", "_count"): - if source_variable.endswith(suffix): - base = source_variable.removesuffix(suffix) - return f"{_humanize_snake_label(base)} {suffix.removeprefix('_')}" - return f"{_humanize_snake_label(source_variable)} count" - if record.target_type == "AMOUNT": - if source_variable.endswith("_amount"): - return f"{_humanize_snake_label(source_variable.removesuffix('_amount'))} amount" - return f"{_humanize_snake_label(source_variable)} amount" - return _humanize_snake_label(source_variable) - - -def _arch_target_scope_label(record: ArchTargetRecord) -> str: - if record.stratum_name: - return str(record.stratum_name) - constraint_labels = [ - label - for constraint in record.constraints - for label in [_arch_constraint_display_label(constraint)] - if label - ] - if constraint_labels: - return ", ".join(constraint_labels) - jurisdiction = str(record.jurisdiction or "").strip() - return jurisdiction.upper().replace("_", " ") if jurisdiction else "" - - -def _arch_constraint_display_label( - constraint: 
tuple[str, str, str], -) -> str: - variable, operator, value = constraint - canonical_operator = _canonical_arch_constraint_operator(operator) - value_text = str(value) - if variable == "agi_bracket": - return f"AGI {ARCH_AGI_BRACKET_LABELS.get(value_text, value_text)}" - if variable == "is_tax_filer" and canonical_operator == "==": - if _truthy_constraint_value(value_text): - return "tax filers" - if _falsey_constraint_value(value_text): - return "non-filers" - if variable == "state_fips" and canonical_operator == "==": - return f"state FIPS {str(value_text).zfill(2)}" - if variable == "congressional_district" and canonical_operator == "==": - return f"congressional district {str(value_text).zfill(2)}" - if variable == "sldu_id" and canonical_operator == "==": - return f"state senate district {value_text}" - if variable == "sldl_id" and canonical_operator == "==": - return f"state house district {value_text}" - if variable == "snap_receipt_status" and canonical_operator == "==": - if value_text == "receiving_food_stamps_snap": - return "SNAP > 0" - if value_text == "not_receiving_food_stamps_snap": - return "SNAP = 0" - positive_feature = ARCH_POSITIVE_CONSTRAINT_ALIASES.get(variable) - if positive_feature is not None and canonical_operator == "==": - feature_label = _humanize_snake_label(positive_feature) - if _truthy_constraint_value(value_text): - return f"{feature_label} > 0" - if _falsey_constraint_value(value_text): - return f"{feature_label} = 0" - return f"{_humanize_snake_label(variable)} {canonical_operator} {value_text}" - - -def _truthy_constraint_value(value: str) -> bool: - try: - return float(str(value)) == 1.0 - except ValueError: - return str(value).strip().lower() in { - "true", - "yes", - "receiving_food_stamps_snap", - } - - -def _falsey_constraint_value(value: str) -> bool: - try: - return float(str(value)) == 0.0 - except ValueError: - return str(value).strip().lower() in { - "false", - "no", - "not_receiving_food_stamps_snap", - } - - -def 
_humanize_arch_source(source: str | None) -> str: - if not source: - return "" - return _humanize_snake_label(str(source)) - - -def _humanize_snake_label(value: str) -> str: - words = [ - ARCH_LABEL_WORD_OVERRIDES.get(word.lower(), word.lower()) - for word in str(value).replace("-", "_").split("_") - if word - ] - if not words: - return "" - label = " ".join(words) - label = label[0].upper() + label[1:] - return label.replace("Tax exempt", "Tax-exempt") - - -def _canonical_filters_for_arch_constraints( - constraints: tuple[tuple[str, str, str], ...], -) -> tuple[TargetFilter, ...]: - filters: list[TargetFilter] = [] - equalities = _constraint_equalities(constraints) - for variable, operator, value in constraints: - canonical_operator = _canonical_arch_constraint_operator(operator) - if variable == "ssi_category": - filters.extend( - _ssi_category_filters_for_arch_constraint( - operator=canonical_operator, - value=value, - ) - ) - continue - if variable == "agi_bracket": - filters.extend(_agi_bracket_filters(value)) - continue - if variable == "congressional_district": - geoid = _congressional_district_geoid( - state_fips=equalities.get("state_fips"), - district=value, - ) - filters.append( - TargetFilter( - feature="congressional_district_geoid", - operator=canonical_operator, - value=geoid or value, - ) - ) - continue - if variable == "snap_receipt_status": - if value == "receiving_food_stamps_snap" and canonical_operator == "==": - filters.append(TargetFilter(feature="snap", operator=">", value=0)) - continue - if value == "not_receiving_food_stamps_snap" and canonical_operator == "==": - filters.append(TargetFilter(feature="snap", operator="==", value=0)) - continue - positive_feature = ARCH_POSITIVE_CONSTRAINT_ALIASES.get(variable) - if positive_feature is not None: - filters.append( - _positive_support_filter_for_arch_constraint( - positive_feature, - operator=canonical_operator, - value=value, - ) - ) - continue - feature = 
ARCH_CONSTRAINT_VARIABLE_ALIASES.get(variable, variable) - filters.append( - TargetFilter(feature=feature, operator=canonical_operator, value=value) - ) - return _dedupe_target_filters(filters) - - -def _ssi_category_filters_for_arch_constraint( - *, - operator: str, - value: str, -) -> tuple[TargetFilter, ...]: - category = str(value).strip().lower() - category_feature = { - "aged": "is_ssi_aged", - "blind": "is_blind", - "disabled": "is_ssi_disabled", - }.get(category) - if operator == "==" and category_feature is not None: - return ( - TargetFilter( - feature=category_feature, - operator=">", - value=0, - ), - ) - return (TargetFilter(feature="ssi_category", operator=operator, value=value),) - - -def _target_filter_for_arch_geography(record: ArchTargetRecord) -> TargetFilter | None: - geography_id = record.geography_id - if geography_id is None: - return None - geo_level = _arch_record_geo_level(record) - if geo_level == "state": - return TargetFilter( - feature="state_fips", - operator="==", - value=_state_fips_from_arch_geography_id(geography_id), - ) - if geo_level == "county": - return TargetFilter( - feature="county_fips", - operator="==", - value=_county_fips_from_arch_geography_id(geography_id), - ) - if geo_level == "district": - return TargetFilter( - feature="congressional_district_geoid", - operator="==", - value=_congressional_district_from_arch_geography_id(geography_id), - ) - if geo_level == "sldu": - return TargetFilter( - feature="sldu_id", - operator="==", - value=_state_legislative_district_from_arch_geography_id( - geography_id, - chamber="upper", - ), - ) - if geo_level == "sldl": - return TargetFilter( - feature="sldl_id", - operator="==", - value=_state_legislative_district_from_arch_geography_id( - geography_id, - chamber="lower", - ), - ) - return None - - -def _state_fips_from_arch_geography_id(geography_id: Any) -> str: - raw = str(geography_id) - if raw.startswith("0400000US"): - return raw[-2:] - if raw.isdigit(): - return 
raw.zfill(2) - return raw - - -def _county_fips_from_arch_geography_id(geography_id: Any) -> str: - raw = str(geography_id) - if raw.startswith("0500000US"): - return raw[-5:] - if raw.isdigit(): - return raw.zfill(5) - return raw - - -def _congressional_district_from_arch_geography_id(geography_id: Any) -> str: - raw = str(geography_id) - if raw.startswith("500") and "US" in raw: - return raw.rsplit("US", 1)[-1] - return raw - - -ARCH_STATE_ABBR_BY_FIPS = { - "01": "AL", - "02": "AK", - "04": "AZ", - "05": "AR", - "06": "CA", - "08": "CO", - "09": "CT", - "10": "DE", - "11": "DC", - "12": "FL", - "13": "GA", - "15": "HI", - "16": "ID", - "17": "IL", - "18": "IN", - "19": "IA", - "20": "KS", - "21": "KY", - "22": "LA", - "23": "ME", - "24": "MD", - "25": "MA", - "26": "MI", - "27": "MN", - "28": "MS", - "29": "MO", - "30": "MT", - "31": "NE", - "32": "NV", - "33": "NH", - "34": "NJ", - "35": "NM", - "36": "NY", - "37": "NC", - "38": "ND", - "39": "OH", - "40": "OK", - "41": "OR", - "42": "PA", - "44": "RI", - "45": "SC", - "46": "SD", - "47": "TN", - "48": "TX", - "49": "UT", - "50": "VT", - "51": "VA", - "53": "WA", - "54": "WV", - "55": "WI", - "56": "WY", - "72": "PR", -} - - -def _state_legislative_district_from_arch_geography_id( - geography_id: Any, - *, - chamber: str, -) -> str: - return normalize_state_legislative_district_id( - geography_id, chamber=chamber - ) or str(geography_id) - - -def _canonical_arch_constraint_operator(operator: str) -> str: - value = str(operator).strip() - return ARCH_CONSTRAINT_OPERATOR_ALIASES.get(value.lower(), value) - - -def _constraint_equalities( - constraints: tuple[tuple[str, str, str], ...], -) -> dict[str, str]: - return { - variable: value - for variable, operator, value in constraints - if _canonical_arch_constraint_operator(operator) == "==" - } - - -def _congressional_district_geoid( - *, - state_fips: str | None, - district: str, -) -> str | None: - try: - district_id = str(int(str(district))).zfill(2) - except 
ValueError: - district_id = str(district) - if len(district_id) >= 4: - return district_id - if state_fips is None: - return None - try: - state_id = str(int(str(state_fips))).zfill(2) - except ValueError: - state_id = str(state_fips).zfill(2) - return f"{state_id}{district_id}" - - -def _positive_support_filter_for_arch_constraint( - feature: str, - *, - operator: str, - value: str, -) -> TargetFilter: - canonical_operator = _canonical_arch_constraint_operator(operator) - if canonical_operator == "==": - try: - numeric_value = float(str(value)) - except ValueError: - numeric_value = None - if numeric_value == 1 or _truthy_constraint_value(str(value)): - return TargetFilter(feature=feature, operator=">", value=0) - if numeric_value == 0 or _falsey_constraint_value(str(value)): - return TargetFilter(feature=feature, operator="==", value=0) - return TargetFilter(feature=feature, operator=canonical_operator, value=value) - - -def _dedupe_target_filters(filters: list[TargetFilter]) -> tuple[TargetFilter, ...]: - seen: set[tuple[str, str, Any]] = set() - deduped: list[TargetFilter] = [] - for target_filter in filters: - operator = getattr(target_filter.operator, "value", target_filter.operator) - key = (str(target_filter.feature), str(operator), str(target_filter.value)) - if key in seen: - continue - seen.add(key) - deduped.append(target_filter) - return tuple(deduped) - - -def _agi_bracket_filters(value: str) -> tuple[TargetFilter, ...]: - bounds = ARCH_AGI_BRACKET_FILTERS.get(value) - if bounds is None: - return (TargetFilter(feature="agi_bracket", operator="==", value=value),) - lower, upper = bounds - filters: list[TargetFilter] = [] - if lower is not None: - filters.append( - TargetFilter(feature="adjusted_gross_income", operator=">=", value=lower) - ) - if upper is not None: - filters.append( - TargetFilter(feature="adjusted_gross_income", operator="<", value=upper) - ) - return tuple(filters) - - -def _positive_measure_for_count_record(source_variable: str) -> 
str | None: - if source_variable.endswith("_returns"): - amount_variable = f"{source_variable.removesuffix('_returns')}_amount" - elif source_variable.endswith("_claims"): - amount_variable = f"{source_variable.removesuffix('_claims')}_amount" - else: - return None - return ARCH_AMOUNT_VARIABLE_ALIASES.get(amount_variable) - - -def _entity_for_measure( - measure: str, - entity_overrides: dict[str, Any], -) -> EntityType: - override = entity_overrides.get(measure) - if isinstance(override, EntityType): - return override - if override is not None: - return EntityType(override) - return ARCH_ENTITY_HINTS.get(measure, EntityType.TAX_UNIT) - - -def _matches_arch_provider_filters( - record: ArchTargetRecord, - *, - variables: tuple[str, ...], - domain_variables: tuple[str, ...], - geo_levels: tuple[str, ...], - target_cells: tuple[dict[str, Any], ...], - entity_overrides: dict[str, Any] | None = None, -) -> bool: - target: CanonicalTargetSpec | None = None - if variables or domain_variables or target_cells: - target = arch_target_record_to_canonical_spec( - record, - entity_overrides=entity_overrides or {}, - ) - if target is None: - return False - if variables and target is not None: - candidate_variables = _arch_target_query_variables(record, target) - if variables and candidate_variables.isdisjoint(variables): - return False - if domain_variables and target is not None: - candidate_domain_variables = _arch_target_domain_variables(target) - if candidate_domain_variables.isdisjoint(domain_variables): - return False - if geo_levels: - geo_level = _arch_record_geo_level(record) - if geo_level not in {_normalize_geo_level(str(level)) for level in geo_levels}: - return False - if target_cells and target is not None: - if not any(_matches_arch_target_cell(target, cell) for cell in target_cells): - return False - return True - - -def _arch_target_query_variables( - record: ArchTargetRecord, - target: CanonicalTargetSpec, -) -> set[str]: - metadata_variable = 
str(target.metadata.get("variable") or "") - domain_variables = _arch_target_domain_variables(target) - variables = { - record.variable, - } - if not ( - target.aggregation is TargetAggregation.COUNT - and metadata_variable in { - "household_count", - "person_count", - "spm_unit_count", - "tax_unit_count", - } - and domain_variables - ): - variables.add(metadata_variable) - if target.measure is not None: - variables.add(str(target.measure)) - variables.update(domain_variables) - if target.aggregation is TargetAggregation.SUM: - variables.update(_arch_target_cell_variables(target)) - return {variable for variable in variables if variable} - - -def _arch_target_cell_variables(target: CanonicalTargetSpec) -> set[str]: - if target.aggregation is TargetAggregation.COUNT: - if target.entity is EntityType.HOUSEHOLD: - return {"household_count"} - if target.entity is EntityType.PERSON: - return {"person_count"} - if target.entity is EntityType.SPM_UNIT: - return {"spm_unit_count"} - return {"tax_unit_count"} - if target.measure is not None: - variable = str(target.measure) - return {variable, *ARCH_TARGET_CELL_VARIABLE_ALIASES.get(variable, ())} - variable = target.metadata.get("variable") - if variable is None: - return set() - variable = str(variable) - return {variable, *ARCH_TARGET_CELL_VARIABLE_ALIASES.get(variable, ())} - - -def _arch_target_domain_variables(target: CanonicalTargetSpec) -> set[str]: - domain_variables: set[str] = set() - for target_filter in target.filters: - feature = str(target_filter.feature) - if feature in { - "state_fips", - "county_fips", - "tract_geoid", - "congressional_district_geoid", - "sldu_id", - "sldl_id", - "program_payment_type", - "tax_unit_is_filer", - }: - continue - domain_variables.add(feature) - variable = str(target.metadata.get("variable") or "") - if ( - target.aggregation is TargetAggregation.COUNT - and variable - and variable not in _arch_target_cell_variables(target) - ): - domain_variables.add(variable) - if ( - 
target.aggregation is TargetAggregation.SUM - and variable in ARCH_SELF_DOMAIN_AMOUNT_VARIABLES - and not domain_variables - ): - domain_variables.add(variable) - return domain_variables - - -def _matches_arch_target_cell( - target: CanonicalTargetSpec, - raw_cell: dict[str, Any], -) -> bool: - variable = raw_cell.get("variable") - if variable is None or str(variable) not in _arch_target_cell_variables(target): - return False - - target_geo_level = _normalize_geo_level( - str(target.metadata.get("geo_level") or "national") - ) - geo_level = raw_cell.get("geo_level") - cell_geo_level = target_geo_level - if geo_level is not None: - cell_geo_level = _normalize_geo_level(str(geo_level)) - if target_geo_level != cell_geo_level: - return False - - geographic_id = raw_cell.get("geographic_id") - if geographic_id is not None: - target_geographic_id = _arch_target_geographic_id(target) - if target_geographic_id is None: - return False - if _normalize_target_cell_geographic_id( - target_geographic_id, - geo_level=target_geo_level, - ) != _normalize_target_cell_geographic_id( - geographic_id, - geo_level=cell_geo_level, - ): - return False - - domain_variable = raw_cell.get("domain_variable") - if "domain_variable" in raw_cell: - target_domain_variables = _arch_target_domain_variables(target) - cell_domain_variables = set( - _split_target_cell_domain_variables(domain_variable) - ) - if domain_variable is None or not cell_domain_variables: - if _target_domain_variables_are_redundant_for_unfiltered_cell( - target, - target_domain_variables, - ): - return True - return not target_domain_variables - if not _target_domain_variables_match( - target, - target_domain_variables=target_domain_variables, - cell_domain_variables=cell_domain_variables, - ): - return False - - return True - - -def _target_domain_variables_match( - target: CanonicalTargetSpec, - *, - target_domain_variables: set[str], - cell_domain_variables: set[str], -) -> bool: - target_domain_variables = 
_normalize_arch_target_domain_variables( - target, - target_domain_variables, - ) - cell_domain_variables = _normalize_arch_target_domain_variables( - target, - cell_domain_variables, - ) - if cell_domain_variables == target_domain_variables: - return True - - implied_domain_variables = _normalize_arch_target_domain_variables( - target, - _arch_target_implied_domain_variables(target), - ) - effective_target_domain_variables = ( - target_domain_variables | implied_domain_variables - ) - if cell_domain_variables == effective_target_domain_variables: - return True - - target_variables = _arch_target_cell_variables(target) - if ( - target.aggregation is TargetAggregation.SUM - and target_variables.issubset(ARCH_SELF_DOMAIN_AMOUNT_VARIABLES) - and cell_domain_variables - == effective_target_domain_variables | target_variables - ): - return True - - model_variable = str(target.metadata.get("variable") or "") - if ( - target.aggregation is TargetAggregation.SUM - and model_variable - and model_variable in effective_target_domain_variables - and cell_domain_variables - == effective_target_domain_variables - {model_variable} - ): - return True - - if ( - target.aggregation is TargetAggregation.COUNT - and model_variable - and cell_domain_variables - == effective_target_domain_variables - {model_variable} - ): - return True - - count_positive_measure = _positive_measure_for_count_record( - str(target.metadata.get("arch_variable") or "") - ) - if ( - target.aggregation is TargetAggregation.COUNT - and count_positive_measure - and count_positive_measure in effective_target_domain_variables - and cell_domain_variables - == effective_target_domain_variables - {count_positive_measure} - ): - return True - - return False - - -def _normalize_arch_target_domain_variables( - target: CanonicalTargetSpec, - domain_variables: set[str], -) -> set[str]: - """Normalize source-native domains to equivalent PE target cell domains.""" - if not 
_arch_target_has_soi_itemized_deduction_domain(target): - return set(domain_variables) - return { - "tax_unit_itemizes" if variable == "itemized_deductions" else variable - for variable in domain_variables - } - - -def _target_domain_variables_are_redundant_for_unfiltered_cell( - target: CanonicalTargetSpec, - target_domain_variables: set[str], -) -> bool: - normalized_domain_variables = _normalize_arch_target_domain_variables( - target, - target_domain_variables, - ) - if _target_self_domain_is_redundant(target, normalized_domain_variables): - return True - return ( - target.aggregation is TargetAggregation.SUM - and _arch_target_has_soi_itemized_deduction_domain(target) - and normalized_domain_variables.issubset({"tax_unit_itemizes"}) - ) - - -def _arch_target_has_soi_itemized_deduction_domain( - target: CanonicalTargetSpec, -) -> bool: - return _arch_target_implied_domain_variables(target) == {"tax_unit_itemizes"} - - -def _arch_target_implied_domain_variables( - target: CanonicalTargetSpec, -) -> set[str]: - if str(target.source) != "IRS_SOI": - return set() - arch_variable = str(target.metadata.get("arch_variable") or "") - if arch_variable in ARCH_IRS_SOI_CREDIT_AGI_DOMAIN_VARIABLES: - return {"adjusted_gross_income"} - if arch_variable in ( - ARCH_IRS_SOI_ITEMIZED_DEDUCTION_AMOUNT_VARIABLES - | ARCH_IRS_SOI_ITEMIZED_DEDUCTION_COUNT_VARIABLES - ): - source_table = str(target.metadata.get("source_table") or "").lower() - if any( - marker in source_table - for marker in ARCH_IRS_SOI_ITEMIZED_DEDUCTION_TABLE_MARKERS - ): - return {"tax_unit_itemizes"} - return set() - - -def _target_self_domain_is_redundant( - target: CanonicalTargetSpec, - target_domain_variables: set[str], -) -> bool: - if target.aggregation is not TargetAggregation.SUM: - return False - target_variables = _arch_target_cell_variables(target) - return ( - len(target_domain_variables) == 1 - and target_domain_variables.issubset(target_variables) - and 
target_domain_variables.issubset(ARCH_SELF_DOMAIN_AMOUNT_VARIABLES) - ) - - -def _coverage_for_arch_target_cell( - cell_filter: dict[str, str | None], - target_set: TargetSet, -) -> ArchTargetCellCoverage: - matches = [ - target - for target in target_set.targets - if _matches_arch_target_cell(target, cell_filter) - ] - return ArchTargetCellCoverage( - cell=dict(cell_filter), - target_ids=tuple( - int(target.metadata["target_id"]) - for target in matches - if target.metadata.get("target_id") is not None - ), - target_names=tuple(str(target.name) for target in matches), - sources=tuple( - sorted({str(target.source) for target in matches if target.source}) - ), - ) - - -def _arch_gap_loaded_variable_catalog( - provider: ( - ArchSQLiteTargetProvider - | ArchFactSQLiteTargetProvider - | ArchConsumerFactJSONLTargetProvider - | ArchCompositeSQLiteTargetProvider - ), - *, - period: int, - jurisdiction: str | None, - sources: tuple[str, ...], - compose_model_year_targets: bool | None, - age_soi_targets: bool | None, -) -> dict[tuple[str, str], set[str]]: - resolved_jurisdiction = jurisdiction or provider.jurisdiction - if isinstance( - provider, - ( - ArchFactSQLiteTargetProvider, - ArchConsumerFactJSONLTargetProvider, - ArchCompositeSQLiteTargetProvider, - ), - ): - records = provider.load_records(period=period, sources=sources) - else: - resolved_compose = ( - provider.compose_model_year_targets - if compose_model_year_targets is None - else compose_model_year_targets - ) - resolved_age_soi = ( - provider.age_soi_targets if age_soi_targets is None else age_soi_targets - ) - records = ( - provider._compose_model_year_records( - target_year=period, - jurisdiction=resolved_jurisdiction, - sources=sources, - age_soi_targets=resolved_age_soi, - ) - if resolved_compose - else provider.load_records( - period=period, - jurisdiction=resolved_jurisdiction, - sources=sources, - ) - ) - catalog: dict[tuple[str, str], set[str]] = {} - for record in records: - key = (record.source, 
record.variable) - catalog.setdefault(key, set()).add(_arch_record_geo_level(record)) - return catalog - - -def _arch_gap_queue_row_for_coverage_cell( - coverage: ArchTargetCellCoverage, - *, - profile_name: str, - period: int, - loaded_variable_catalog: dict[tuple[str, str], set[str]], - variable_uncovered_count: int, -) -> ArchTargetGapQueueRow: - cell = coverage.cell - expected_source = _arch_gap_expected_source(cell) - expected_arch_variable = _arch_gap_expected_arch_variable(cell) - expected_target_type = _arch_gap_expected_target_type(cell) - expected_entity = _arch_gap_expected_entity(cell) - expected_aggregation = _arch_gap_expected_aggregation(expected_target_type) - loader_status = _arch_gap_loader_status( - coverage, - expected_source=expected_source, - expected_arch_variable=expected_arch_variable, - loaded_variable_catalog=loaded_variable_catalog, - cell=cell, - ) - gap_category = _arch_gap_category( - cell, - loader_status=loader_status, - expected_source=expected_source, - expected_arch_variable=expected_arch_variable, - ) - return ArchTargetGapQueueRow( - priority=0, - profile_name=profile_name, - period=int(period), - variable=str(cell.get("variable") or ""), - geo_level=cell.get("geo_level"), - domain_variable=cell.get("domain_variable"), - geographic_id=cell.get("geographic_id"), - covered=coverage.covered, - target_count=coverage.target_count, - target_ids=coverage.target_ids, - sources=coverage.sources, - expected_source=expected_source, - expected_source_table=_arch_gap_expected_source_table( - expected_source, - expected_arch_variable, - cell, - ), - expected_arch_variable=expected_arch_variable, - expected_target_type=expected_target_type, - expected_entity=expected_entity, - expected_aggregation=expected_aggregation, - expected_filters=_arch_gap_expected_filters(cell), - gap_category=gap_category, - loader_status=loader_status, - agent_task_kind=_arch_gap_agent_task_kind(gap_category), - notes=_arch_gap_notes( - cell, - 
expected_source=expected_source, - expected_arch_variable=expected_arch_variable, - gap_category=gap_category, - variable_uncovered_count=variable_uncovered_count, - ), - ) - - -def _arch_gap_queue_sort_key(row: ArchTargetGapQueueRow) -> tuple[Any, ...]: - source_rank = { - "IRS_SOI": 0, - "BEA": 1, - "CENSUS_ACS": 2, - "CMS_ACA": 3, - "CMS_MEDICAID": 4, - "CMS_MEDICARE": 5, - "USDA_SNAP": 6, - "SSA": 7, - "HHS_ACF_TANF": 8, - "HHS_ACF_LIHEAP": 9, - "FEDERAL_RESERVE": 10, - }.get(str(row.expected_source), 99) - return ( - row.covered, - row.loader_status == "needs_source_mapping_review", - -_arch_gap_notes_uncovered_count(row.notes), - source_rank, - str(row.variable), - str(row.geo_level or ""), - str(row.domain_variable or ""), - ) - - -def _arch_gap_notes_uncovered_count(notes: str) -> int: - if not notes.startswith("profile_variable_uncovered_count="): - return 0 - raw_count = notes.split(";", 1)[0].split("=", 1)[1] - try: - return int(raw_count) - except ValueError: - return 0 - - -def _arch_gap_expected_source(cell: dict[str, Any]) -> str | None: - variable = str(cell.get("variable") or "") - domain_variables = set( - _split_target_cell_domain_variables(cell.get("domain_variable")) - ) - if not domain_variables and variable in ARCH_BEA_FULL_POP_AMOUNT_VARIABLES: - return "BEA" - if variable == "tax_unit_count" and "aca_ptc" in domain_variables: - return "IRS_SOI" - if ( - variable in {"household_count", "person_count"} - and "snap" in domain_variables - and _normalize_geo_level(cell.get("geo_level")) == "district" - ): - return "CENSUS_ACS" - if variable == "snap" or "snap" in domain_variables: - return "USDA_SNAP" - if variable == "tanf" or "tanf" in domain_variables: - return "HHS_ACF_TANF" - if "spm_unit_energy_subsidy_reported" in domain_variables: - return "HHS_ACF_LIHEAP" - if variable == "aca_ptc" or "aca_ptc" in domain_variables: - return "CMS_ACA" - if variable == "medicaid" or "medicaid_enrolled" in domain_variables: - return "CMS_MEDICAID" - if 
variable == "ssi" or variable.startswith("social_security"): - return "SSA" - if variable == "state_income_tax": - return "CENSUS_STC" - if variable == "medicare_part_b_premiums": - return "CMS_MEDICARE" - if variable == "net_worth": - return "FEDERAL_RESERVE" - if variable == "person_count": - if _normalize_geo_level(cell.get("geo_level")) in {"sldu", "sldl"}: - return "CENSUS_DECENNIAL" - if ( - _normalize_geo_level(cell.get("geo_level")) == "district" - and "age" in domain_variables - ): - return "CENSUS_ACS" - if "ssi" in domain_variables: - return "SSA" - if "adjusted_gross_income" in domain_variables: - return "IRS_SOI" - if "age" in domain_variables or not domain_variables: - return "CENSUS_PEP" - return None - if variable == "household_count": - if _normalize_geo_level(cell.get("geo_level")) in {"sldu", "sldl"}: - return "CENSUS_DECENNIAL" - if not domain_variables: - return "CENSUS_ACS" - return None - if variable in ARCH_IRS_SOI_GAP_VARIABLES: - return "IRS_SOI" - if domain_variables & ARCH_IRS_SOI_GAP_VARIABLES: - return "IRS_SOI" - return None - - -def _arch_gap_expected_arch_variable(cell: dict[str, Any]) -> str | None: - variable = str(cell.get("variable") or "") - domain_variables = tuple( - _split_target_cell_domain_variables(cell.get("domain_variable")) - ) - domain_variable = domain_variables[0] if len(domain_variables) == 1 else None - if not domain_variables and variable in ARCH_BEA_FULL_POP_AMOUNT_ARCH_VARIABLES: - return ARCH_BEA_FULL_POP_AMOUNT_ARCH_VARIABLES[variable] - if variable == "tax_unit_count": - if set(domain_variables) in ( - {"eitc_child_count"}, - {"adjusted_gross_income", "eitc", "eitc_child_count"}, - ): - return "eitc_claims" - if { - "adjusted_gross_income", - "income_tax_before_credits", - }.issubset(domain_variables): - return "income_tax_before_credits_returns" - if set(domain_variables) == {"aca_ptc"}: - return "aca_ptc_returns" - itemized_domain_variables = set(domain_variables) - {"tax_unit_itemizes"} - if ( - 
"tax_unit_itemizes" in domain_variables - and len(itemized_domain_variables) == 1 - ): - return ARCH_MODEL_COUNT_DOMAIN_VARIABLE_HINTS.get( - next(iter(itemized_domain_variables)) - ) - if domain_variable is None: - return "tax_unit_count" if not domain_variables else None - return ARCH_MODEL_COUNT_DOMAIN_VARIABLE_HINTS.get(domain_variable) - if variable == "household_count": - if domain_variable == "snap": - return "snap_household_count" - if domain_variable == "spm_unit_energy_subsidy_reported": - return "liheap_household_count" - return "household_count" if domain_variable is None else None - if variable == "spm_unit_count": - if domain_variable == "tanf": - return "tanf_family_count" - return None - if variable == "person_count": - if domain_variable == "snap": - return "snap_participant_count" - if domain_variable == "aca_ptc": - return "aca_marketplace_enrollment" - if domain_variable == "medicaid_enrolled": - return "medicaid_total_enrollment" - if "ssi" in domain_variables: - return "ssi_recipients" - if domain_variable == "adjusted_gross_income": - return "tax_filer_individual_count" - if domain_variable == "age" or not domain_variables: - return "population" - return None - if variable == "snap": - return "snap_benefits" - if variable == "aca_ptc": - return "aca_aptc_amount" - if variable == "medicaid": - return "medicaid_benefits" - if variable == "tanf": - return "tanf_cash_assistance" - if variable == "state_income_tax": - return "state_individual_income_tax_collections" - return ARCH_MODEL_AMOUNT_VARIABLE_HINTS.get(variable) - - -def _arch_gap_expected_target_type(cell: dict[str, Any]) -> str | None: - variable = str(cell.get("variable") or "") - if variable in { - "household_count", - "person_count", - "spm_unit_count", - "tax_unit_count", - }: - return "COUNT" - if _arch_gap_expected_arch_variable(cell) is not None: - return "AMOUNT" - return None - - -def _arch_gap_expected_entity(cell: dict[str, Any]) -> str | None: - variable = 
str(cell.get("variable") or "") - if variable == "tax_unit_count": - return EntityType.TAX_UNIT.value - if variable == "person_count": - return EntityType.PERSON.value - if variable == "spm_unit_count": - return EntityType.SPM_UNIT.value - if variable in {"household_count", "snap"}: - return EntityType.HOUSEHOLD.value - entity = ARCH_ENTITY_HINTS.get(variable) - return entity.value if entity is not None else None - - -def _arch_gap_expected_aggregation(target_type: str | None) -> str | None: - if target_type == "COUNT": - return "count" - if target_type == "AMOUNT": - return "sum" - return None - - -def _arch_gap_expected_filters(cell: dict[str, Any]) -> tuple[dict[str, Any], ...]: - filters: list[dict[str, Any]] = [] - geo_level = _normalize_geo_level(cell.get("geo_level")) - geographic_id = cell.get("geographic_id") - if geo_level == "state": - filters.append( - { - "kind": "geography", - "feature": "state_fips", - "operator": "==", - "value": ( - _state_fips_from_arch_geography_id(geographic_id) - if geographic_id is not None - else "" - ), - } - ) - if geo_level == "sldu": - filters.append( - { - "kind": "geography", - "feature": "sldu_id", - "operator": "==", - "value": ( - _normalize_target_cell_geographic_id( - geographic_id, - geo_level=geo_level, - ) - if geographic_id is not None - else "" - ), - } - ) - if geo_level == "sldl": - filters.append( - { - "kind": "geography", - "feature": "sldl_id", - "operator": "==", - "value": ( - _normalize_target_cell_geographic_id( - geographic_id, - geo_level=geo_level, - ) - if geographic_id is not None - else "" - ), - } - ) - for domain_variable in _split_target_cell_domain_variables( - cell.get("domain_variable") - ): - filters.append( - { - "kind": "domain", - "feature": domain_variable, - "operator": ">", - "value": 0, - } - ) - return tuple(filters) - - -def _arch_gap_expected_source_table( - expected_source: str | None, - expected_arch_variable: str | None, - cell: dict[str, Any], -) -> str | None: - variable = 
str(cell.get("variable") or "") - if expected_source == "BEA": - geo_level = _normalize_geo_level(cell.get("geo_level")) - if ( - geo_level == "state" - and variable == "employment_income_before_lsr" - and expected_arch_variable - == ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE - ): - return "BEA Regional SAINC5N residence-adjusted state wages" - if geo_level == "state" and expected_arch_variable in { - "proprietors_income_amount", - "wages_salaries_amount", - }: - return "BEA Regional SAINC5N annual state personal income" - if expected_arch_variable in { - "wages_salaries_amount", - ARCH_BEA_STATE_EMPLOYMENT_INCOME_BEFORE_LSR_VARIABLE, - }: - return "BEA NIPA annual total wages and salaries" - if expected_arch_variable in { - "medicaid_benefits", - "personal_dividend_income_amount", - "proprietors_income_amount", - "rental_income_amount", - "social_security_benefits", - "unemployment_insurance_benefits", - }: - return "BEA NIPA annual personal income components" - return "BEA NIPA or Regional personal income tables" - if expected_arch_variable in ARCH_GAP_SOURCE_TABLE_HINTS: - return ARCH_GAP_SOURCE_TABLE_HINTS[expected_arch_variable] - if variable in ARCH_GAP_SOURCE_TABLE_HINTS: - return ARCH_GAP_SOURCE_TABLE_HINTS[variable] - if expected_source == "IRS_SOI": - if expected_arch_variable and ( - expected_arch_variable.startswith("wages_salaries_") - or expected_arch_variable.startswith("net_capital_gains_") - or expected_arch_variable.startswith("taxable_ira_distributions_") - or expected_arch_variable.startswith("taxable_pension_income_") - or expected_arch_variable.startswith("taxable_social_security_") - or expected_arch_variable.startswith("unemployment_compensation_") - ): - return "IRS SOI Publication 1304 Table 1.4" - if expected_arch_variable and ( - expected_arch_variable.endswith("_claims") - or expected_arch_variable - in {"real_estate_taxes_amount", "real_estate_taxes_claims"} - ): - return "IRS SOI itemized deduction or credit tables" - return 
"IRS SOI Publication 1304" - if expected_source == "CENSUS_ACS": - return "Census ACS summary tables" - if expected_source == "CENSUS_DECENNIAL": - return "Census 2020 CD119 state legislative district summary file" - if expected_source == "CENSUS_PEP": - return "Census Population Estimates Program age-sex files" - if expected_source == "CENSUS_STC": - return "Census State Tax Collections item T40" - if expected_source == "CMS_ACA": - return "CMS Marketplace Open Enrollment public-use files" - if expected_source == "CMS_MEDICAID": - return "CMS Medicaid enrollment and expenditure reports" - if expected_source == "CMS_MEDICARE": - return "CMS Medicare Trustees Report Part B premium income" - if expected_source == "FEDERAL_RESERVE": - return "Federal Reserve Financial Accounts Z.1 household net worth" - if expected_source == "SSA": - return "SSA Annual Statistical Supplement" - if expected_source == "HHS_ACF_TANF": - return "ACF TANF Financial Data" - if expected_source == "HHS_ACF_LIHEAP": - return "HHS ACF LIHEAP National Profile" - return None - - -def _arch_gap_loader_status( - coverage: ArchTargetCellCoverage, - *, - expected_source: str | None, - expected_arch_variable: str | None, - loaded_variable_catalog: dict[tuple[str, str], set[str]], - cell: dict[str, Any], -) -> str: - if coverage.covered: - return "covered" - if expected_source is None or expected_arch_variable is None: - return "needs_source_mapping_review" - loaded_geo_levels = loaded_variable_catalog.get( - (expected_source, expected_arch_variable) - ) - if loaded_geo_levels: - expected_geo_level = _normalize_geo_level(cell.get("geo_level")) - if expected_geo_level not in loaded_geo_levels: - return "loaded_arch_variable_missing_geography" - return "loaded_arch_variable_missing_filter_or_adapter" - return "missing_arch_target_record" - - -def _arch_gap_category( - cell: dict[str, Any], - *, - loader_status: str, - expected_source: str | None, - expected_arch_variable: str | None, -) -> str: - if 
loader_status == "covered": - return "covered" - if _arch_gap_is_deprioritized_survey_or_model_input(cell): - return "survey_or_model_input_deprioritized" - if loader_status == "missing_arch_target_record": - return "ready_primary_loader" - if loader_status == "loaded_arch_variable_missing_geography": - return "ready_rollup_or_geography" - if loader_status == "loaded_arch_variable_missing_filter_or_adapter": - return "adapter_or_constraint_review" - if expected_source is None or expected_arch_variable is None: - return "source_mapping_review" - return "source_mapping_review" - - -def _arch_gap_is_deprioritized_survey_or_model_input(cell: dict[str, Any]) -> bool: - variable = str(cell.get("variable") or "") - if variable in ARCH_DEPRIORITIZED_SURVEY_OR_MODEL_GAP_VARIABLES: - return True - domain_variables = set( - _split_target_cell_domain_variables(cell.get("domain_variable")) - ) - return bool(domain_variables & ARCH_DEPRIORITIZED_SURVEY_OR_MODEL_GAP_DOMAINS) - - -def _arch_gap_agent_task_kind(gap_category: str) -> str: - if gap_category == "covered": - return "none" - if gap_category == "survey_or_model_input_deprioritized": - return "defer_or_review_non_primary_source" - if gap_category == "ready_rollup_or_geography": - return "add_arch_rollup_or_geography_records" - if gap_category == "adapter_or_constraint_review": - return "review_adapter_or_constraints" - if gap_category == "ready_primary_loader": - return "add_arch_source_loader_or_target_record" - return "review_source_mapping" - - -def _arch_gap_notes( - cell: dict[str, Any], - *, - expected_source: str | None, - expected_arch_variable: str | None, - gap_category: str, - variable_uncovered_count: int, -) -> str: - parts = [f"profile_variable_uncovered_count={variable_uncovered_count}"] - if gap_category == "survey_or_model_input_deprioritized": - parts.append( - "survey/model-input proxy deprioritized until primary source review" - ) - if expected_source is None: - parts.append("expected_source requires 
review") - if expected_arch_variable is None: - parts.append("expected Arch variable requires review") - if "," in str(cell.get("domain_variable") or ""): - parts.append("multi-domain cells may need a grouped source-record spec") - return "; ".join(parts) - - -def _arch_target_gap_queue_csv(report: ArchTargetGapQueueReport) -> str: - import csv - import io - import json - - fieldnames = [ - "priority", - "profile_name", - "period", - "variable", - "geo_level", - "domain_variable", - "geographic_id", - "covered", - "target_count", - "target_ids", - "sources", - "expected_source", - "expected_source_table", - "expected_arch_variable", - "expected_target_type", - "expected_entity", - "expected_aggregation", - "expected_filters", - "gap_category", - "loader_status", - "agent_task_kind", - "notes", - ] - buffer = io.StringIO() - writer = csv.DictWriter(buffer, fieldnames=fieldnames) - writer.writeheader() - for row in report.rows: - writer.writerow( - { - "priority": row.priority, - "profile_name": row.profile_name, - "period": row.period, - "variable": row.variable, - "geo_level": row.geo_level, - "domain_variable": row.domain_variable, - "geographic_id": row.geographic_id, - "covered": row.covered, - "target_count": row.target_count, - "target_ids": json.dumps(list(row.target_ids)), - "sources": json.dumps(list(row.sources)), - "expected_source": row.expected_source, - "expected_source_table": row.expected_source_table, - "expected_arch_variable": row.expected_arch_variable, - "expected_target_type": row.expected_target_type, - "expected_entity": row.expected_entity, - "expected_aggregation": row.expected_aggregation, - "expected_filters": json.dumps(list(row.expected_filters)), - "gap_category": row.gap_category, - "loader_status": row.loader_status, - "agent_task_kind": row.agent_task_kind, - "notes": row.notes, - } - ) - return buffer.getvalue() - - -def _summarize_arch_cell_coverage( - coverage_cells: tuple[ArchTargetCellCoverage, ...], - *, - field: str, -) -> 
dict[str, dict[str, int]]: - summary: dict[str, dict[str, int]] = {} - for coverage in coverage_cells: - raw_value = coverage.cell.get(field) - value = ( - _normalize_geo_level(raw_value) - if field == "geo_level" - else str(raw_value or "") - ) - if not value: - value = "none" - item = summary.setdefault( - value, - { - "target_cell_count": 0, - "covered_cell_count": 0, - "uncovered_cell_count": 0, - }, - ) - item["target_cell_count"] += 1 - if coverage.covered: - item["covered_cell_count"] += 1 - else: - item["uncovered_cell_count"] += 1 - return dict(sorted(summary.items())) - - -def _target_cell_to_provider_filter( - cell: PolicyEngineUSTargetCell | dict[str, Any], -) -> dict[str, str | None]: - if isinstance(cell, PolicyEngineUSTargetCell): - return cell.to_provider_filter() - return { - "variable": cell.get("variable"), - "geo_level": cell.get("geo_level"), - "domain_variable": cell.get("domain_variable"), - "geographic_id": cell.get("geographic_id"), - } - - -def _arch_target_geographic_id(target: CanonicalTargetSpec) -> str | None: - geo_level = str(target.metadata.get("geo_level") or "national").lower() - feature_by_level = { - "state": "state_fips", - "county": "county_fips", - "tract": "tract_geoid", - "district": "congressional_district_geoid", - "congressional_district": "congressional_district_geoid", - "sldu": "sldu_id", - "sldl": "sldl_id", - } - feature = feature_by_level.get(geo_level) - if feature is None: - return None - for target_filter in target.filters: - if str(target_filter.feature) != feature: - continue - operator = getattr(target_filter.operator, "value", target_filter.operator) - if _canonical_arch_constraint_operator(str(operator)) == "==": - return str(target_filter.value) - return None - - -def _split_target_cell_domain_variables(value: Any) -> tuple[str, ...]: - if value is None: - return () - return tuple( - _normalize_target_cell_domain_variable(part) - for part in str(value).split(",") - if part.strip() - ) - - -def 
_normalize_target_cell_domain_variable(value: Any) -> str: - raw = str(value).strip() - return ARCH_POSITIVE_CONSTRAINT_ALIASES.get(raw, raw) - - -def _normalize_target_cell_geographic_id( - value: Any, - *, - geo_level: str | None = None, -) -> str: - raw = str(value) - normalized_geo_level = _normalize_geo_level(geo_level) - chamber = None - if normalized_geo_level == "sldu": - chamber = "upper" - elif normalized_geo_level == "sldl": - chamber = "lower" - normalized_sld = normalize_state_legislative_district_id(raw, chamber=chamber) - if normalized_sld != raw: - return str(normalized_sld) - try: - return str(int(raw)) - except (TypeError, ValueError): - return raw - - -def _arch_record_composition_key( - record: ArchTargetRecord, -) -> tuple[str, str, str, tuple[tuple[str, str, str], ...]]: - return ( - record.variable, - record.target_type, - _arch_record_geo_level(record), - tuple(sorted(record.constraints)), - ) - - -def _arch_record_geo_level(record: ArchTargetRecord) -> str: - return _geo_level_for_constraints(record.constraints) or _normalize_geo_level( - record.geographic_level - ) - - -def _geo_level_for_constraints( - constraints: tuple[tuple[str, str, str], ...], -) -> str | None: - constraint_variables = {variable for variable, _, _ in constraints} - for variable, geo_level in ( - ("tract_geoid", "tract"), - ("county_fips", "county"), - ("congressional_district", "district"), - ("congressional_district_geoid", "district"), - ("sldu_id", "sldu"), - ("sldl_id", "sldl"), - ("state_fips", "state"), - ): - if variable in constraint_variables: - return geo_level - return None - - -def _normalize_arch_source(source: str) -> str: - value = str(source) - return ARCH_SOURCE_ALIASES.get(value.lower(), value.upper().replace("-", "_")) - - -def _normalize_geo_level(geo_level: str | None) -> str: - if not geo_level: - return "national" - normalized = geo_level.lower() - if normalized in {"congressional_district", "congressional-district"}: - return "district" - if 
normalized in { - "sldu", - "state_legislative_district_upper", - "state-legislative-district-upper", - "state_senate_district", - "state-senate-district", - }: - return "sldu" - if normalized in { - "sldl", - "state_legislative_district_lower", - "state-legislative-district-lower", - "state_house_district", - "state-house-district", - }: - return "sldl" - return normalized - - -def _sqlite_table_has_column( - conn: sqlite3.Connection, - table: str, - column: str, -) -> bool: - return column in _sqlite_table_columns(conn, table) - - -def _sqlite_table_columns(conn: sqlite3.Connection, table: str) -> set[str]: - names: set[str] = set() - for row in conn.execute(f"PRAGMA table_info({table})"): - names.add(str(row["name"] if isinstance(row, sqlite3.Row) else row[1])) - return names - - -def _sqlite_table_exists(conn: sqlite3.Connection, table: str) -> bool: - row = conn.execute( - "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?", - (table,), - ).fetchone() - return row is not None - - -def _looks_like_arch_consumer_fact_jsonl(path: Path) -> bool: - return path.suffix.lower() in {".jsonl", ".ndjson"} - - -def _as_arch_db_path_tuple( - value: str | Path | tuple[str | Path, ...], -) -> tuple[Path, ...]: - if isinstance(value, (str, Path)): - return (Path(value),) - paths = tuple(Path(path) for path in value) - if not paths: - raise ValueError("At least one Arch targets DB path is required") - return paths - - -def _single_or_many_paths(paths: list[str]) -> str | tuple[str, ...]: - return paths[0] if len(paths) == 1 else tuple(paths) - - -def _default_arch_target_artifact_roots() -> tuple[Path, ...]: - candidates = ( - Path.cwd() / "artifacts", - Path.cwd().parent / "arch", - Path("/tmp"), - ) - return tuple(path for path in candidates if path.exists()) - - -def discover_arch_target_artifacts( - roots: tuple[str | Path, ...], - *, - max_depth: int = 6, -) -> tuple[Path, ...]: - """Find local Arch target artifacts under bounded discovery roots.""" - - 
discovered: list[Path] = [] - seen: set[Path] = set() - for raw_root in roots: - root = Path(raw_root).expanduser() - if root.is_file(): - candidates = (root,) - elif root.is_dir(): - candidates = tuple(_walk_arch_target_artifact_candidates(root, max_depth)) - else: - continue - for candidate in candidates: - resolved = candidate.resolve() - if resolved in seen or not _is_arch_target_artifact(resolved): - continue - discovered.append(resolved) - seen.add(resolved) - return tuple(sorted(discovered, key=lambda path: str(path))) - - -def _walk_arch_target_artifact_candidates( - root: Path, max_depth: int -) -> tuple[Path, ...]: - import os - - skip_dir_names = { - ".git", - ".mypy_cache", - ".pytest_cache", - ".ruff_cache", - ".tox", - ".venv", - "__pycache__", - "node_modules", - "site-packages", - } - candidates: list[Path] = [] - root = root.resolve() - for directory, dirnames, filenames in os.walk(root): - current = Path(directory) - try: - depth = len(current.relative_to(root).parts) - except ValueError: - depth = 0 - if depth >= max_depth: - dirnames[:] = [] - else: - dirnames[:] = [ - dirname for dirname in dirnames if dirname not in skip_dir_names - ] - for filename in filenames: - candidate = current / filename - if _is_arch_target_artifact_candidate_name(candidate): - candidates.append(candidate) - return tuple(candidates) - - -def _is_arch_target_artifact_candidate_name(path: Path) -> bool: - name = path.name.lower() - suffix = path.suffix.lower() - if name in {"consumer_facts.jsonl", "consumer_facts.ndjson"}: - return True - if suffix not in {".db", ".sqlite", ".sqlite3"}: - return False - return name == "targets.db" or "arch_targets" in name - - -def _is_arch_target_artifact(path: Path) -> bool: - if not path.is_file(): - return False - if path.suffix.lower() in {".jsonl", ".ndjson"}: - return _is_arch_consumer_fact_jsonl(path) - if path.suffix.lower() in {".db", ".sqlite", ".sqlite3"}: - return _is_arch_sqlite_artifact(path) - return False - - -def 
_is_arch_consumer_fact_jsonl(path: Path) -> bool: - try: - with path.open() as file: - for line in file: - if not line.strip(): - continue - row = json.loads(line) - schema_version = str(row.get("schema_version") or "") - return schema_version.startswith("arch.consumer_fact") or ( - "aggregate_fact_key" in row and "observed_measure" in row - ) - except (OSError, json.JSONDecodeError): - return False - return False - - -def _is_arch_sqlite_artifact(path: Path) -> bool: - try: - conn = sqlite3.connect(path) - except sqlite3.Error: - return False - try: - tables = { - row[0] - for row in conn.execute( - "SELECT name FROM sqlite_master WHERE type = 'table'" - ).fetchall() - } - if "aggregate_facts" in tables: - return True - if not {"targets", "strata", "stratum_constraints"}.issubset(tables): - return False - target_columns = _sqlite_table_columns(conn, "targets") - required_target_columns = { - "id", - "stratum_id", - "variable", - "period", - "value", - "target_type", - "geographic_level", - "source", - } - return required_target_columns.issubset(target_columns) - except sqlite3.Error: - return False - finally: - conn.close() - - -def _filename_slug(value: str) -> str: - slug = "".join(character if character.isalnum() else "_" for character in value) - slug = "_".join(part for part in slug.split("_") if part) - return slug.lower() or "profile" - - -def _arch_target_refresh_summary_markdown( - coverage: ArchTargetProfileCoverageReport, - gaps: ArchTargetGapQueueReport, - *, - artifact_paths: tuple[Path, ...], - output_paths: tuple[Path, ...], -) -> str: - lines = [ - "# Arch Target Coverage Snapshot", - "", - f"- Profile: `{coverage.profile_name}`", - f"- Period: `{coverage.period}`", - f"- Target cells: `{coverage.target_cell_count}`", - f"- Covered cells: `{coverage.covered_cell_count}`", - f"- Uncovered cells: `{coverage.uncovered_cell_count}`", - f"- Coverage rate: `{coverage.coverage_rate:.1%}`", - "", - "## Coverage By Geography", - "", - "| Geography | Target 
cells | Covered | Uncovered |", - "| --- | ---: | ---: | ---: |", - ] - for geo_level, counts in sorted(coverage.by_geo_level.items()): - lines.append( - "| {geo_level} | {target_cell_count} | {covered_cell_count} | " - "{uncovered_cell_count} |".format(geo_level=geo_level, **counts) - ) - lines.extend( - [ - "", - "## Gap Categories", - "", - "| Category | Rows |", - "| --- | ---: |", - ] - ) - for category, count in sorted(gaps.by_gap_category.items()): - lines.append(f"| `{category}` | {count} |") - lines.extend( - [ - "", - "## Inputs", - "", - *(f"- `{path}`" for path in artifact_paths), - "", - "## Outputs", - "", - *(f"- `{path}`" for path in output_paths), - "", - ] - ) - return "\n".join(lines) - - -def _target_filter_tuple( - target: CanonicalTargetSpec, -) -> tuple[tuple[str, str, str], ...]: - return tuple( - sorted( - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - _json_scalar_text(target_filter.value), - ) - for target_filter in target.filters - ) - ) - - -def _jurisdiction_clause(jurisdiction: str) -> str: - normalized = jurisdiction.upper().replace("-", "_") - if normalized == "US": - return "upper(s.jurisdiction) LIKE 'US%'" - return f"upper(s.jurisdiction) = '{normalized}'" - - -def _as_string_tuple(value: Any) -> tuple[str, ...]: - if value is None: - return () - if isinstance(value, str): - return (value,) - return tuple(str(item) for item in value) - - -def _as_target_cell_filters(value: Any) -> tuple[dict[str, Any], ...]: - if value is None: - return () - if isinstance(value, dict): - return (dict(value),) - return tuple(dict(item) for item in value if item is not None) - - -__all__ = [ - "ArchCompositeSQLiteTargetProvider", - "ArchConsumerFactJSONLTargetProvider", - "ArchFactSQLiteTargetProvider", - "ArchTargetCellCoverage", - "ArchTargetGapQueueReport", - "ArchTargetGapQueueRow", - "ArchTargetParityReport", - "ArchTargetParityRow", - "ArchTargetProfileCoverageReport", - 
"ArchSQLiteTargetProvider", - "ArchTargetRecord", - "SOIAgingFactors", - "arch_target_record_to_canonical_spec", - "resolve_arch_sqlite_target_provider", - "summarize_arch_target_gap_queue", - "summarize_arch_target_parity", - "summarize_arch_target_profile_coverage", -] diff --git a/src/microplex_us/targets/census_blocks.py b/src/microplex_us/targets/census_blocks.py deleted file mode 100644 index f1b53cab..00000000 --- a/src/microplex_us/targets/census_blocks.py +++ /dev/null @@ -1,363 +0,0 @@ -"""Census block-derived target providers.""" - -from __future__ import annotations - -from collections.abc import Iterable -from pathlib import Path -from typing import Any - -import pandas as pd -from microplex.core import EntityType -from microplex.targets import ( - TabularRollupSpec, - TabularRollupTargetProvider, - TargetAggregation, - TargetQuery, - TargetSpec, - as_string_tuple, - build_tabular_rollup_targets, -) - -from microplex_us.geography import ( - load_block_probabilities, - normalize_state_legislative_district_id, -) - -CENSUS_BLOCK_POPULATION_VARIABLE = "person_count" -CENSUS_BLOCK_POPULATION_SOURCE = "Census 2020 PL 94-171" -CENSUS_BLOCK_POPULATION_UNITS = "persons" -CENSUS_BLOCK_TARGET_PERIOD = 2024 -CENSUS_BLOCK_SOURCE_YEAR = 2020 -CENSUS_BLOCK_GEOGRAPHY_YEAR = 2020 - -DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS: tuple[str, ...] 
= ( - "national", - "state", - "county", - "cd", - "sldu", - "sldl", - "cbsa", - "spm_metro_area", -) - - -CensusBlockPopulationRollup = TabularRollupSpec - - -CENSUS_BLOCK_POPULATION_ROLLUPS: dict[str, CensusBlockPopulationRollup] = { - "national": CensusBlockPopulationRollup( - geo_level="national", - source_column=None, - filter_feature=None, - group_name="census_block_population_national", - name_prefix="census_block_population_national", - ), - "state": CensusBlockPopulationRollup( - geo_level="state", - source_column="state_fips", - filter_feature="state_fips", - group_name="census_block_population_state", - name_prefix="census_block_population_state", - ), - "county": CensusBlockPopulationRollup( - geo_level="county", - source_column="county_fips", - filter_feature="county_fips", - group_name="census_block_population_county", - name_prefix="census_block_population_county", - ), - "tract": CensusBlockPopulationRollup( - geo_level="tract", - source_column="tract_geoid", - filter_feature="tract_geoid", - group_name="census_block_population_tract", - name_prefix="census_block_population_tract", - ), - "block": CensusBlockPopulationRollup( - geo_level="block", - source_column="geoid", - filter_feature="block_geoid", - group_name="census_block_population_block", - name_prefix="census_block_population_block", - ), - "cd": CensusBlockPopulationRollup( - geo_level="cd", - source_column="cd_id", - filter_feature="cd_id", - group_name="census_block_population_cd", - name_prefix="census_block_population_cd", - ), - "sldu": CensusBlockPopulationRollup( - geo_level="sldu", - source_column="sldu_id", - filter_feature="sldu_id", - group_name="census_block_population_sldu", - name_prefix="census_block_population_sldu", - ), - "sldl": CensusBlockPopulationRollup( - geo_level="sldl", - source_column="sldl_id", - filter_feature="sldl_id", - group_name="census_block_population_sldl", - name_prefix="census_block_population_sldl", - ), - "cbsa": CensusBlockPopulationRollup( - 
geo_level="cbsa", - source_column="cbsa_code", - filter_feature="cbsa_code", - group_name="census_block_population_cbsa", - name_prefix="census_block_population_cbsa", - ), - "spm_metro_area": CensusBlockPopulationRollup( - geo_level="spm_metro_area", - source_column="spm_metro_area", - filter_feature="spm_metro_area", - group_name="census_block_population_spm_metro_area", - name_prefix="census_block_population_spm_metro_area", - ), -} -CENSUS_BLOCK_POPULATION_GEO_LEVELS: tuple[str, ...] = tuple( - CENSUS_BLOCK_POPULATION_ROLLUPS -) - - -class CensusBlockPopulationTargetProvider(TabularRollupTargetProvider): - """Build population count targets by rolling Census blocks to parent geos.""" - - def __init__( - self, - block_probabilities: pd.DataFrame | None = None, - *, - block_probabilities_path: str | Path | None = None, - default_geo_levels: Iterable[str] = DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS, - period: int = CENSUS_BLOCK_TARGET_PERIOD, - ) -> None: - super().__init__( - block_probabilities, - data_path=block_probabilities_path, - data_loader=load_block_probabilities, - prepare_data=_prepare_block_probabilities, - rollups=CENSUS_BLOCK_POPULATION_ROLLUPS, - value_column="population", - variable=CENSUS_BLOCK_POPULATION_VARIABLE, - variable_aliases=("population",), - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - period=period, - source=CENSUS_BLOCK_POPULATION_SOURCE, - units=CENSUS_BLOCK_POPULATION_UNITS, - default_geo_levels=default_geo_levels, - min_value=0.0, - normalize_geographic_id=_normalize_census_block_geographic_id, - base_metadata={ - "source_year": CENSUS_BLOCK_SOURCE_YEAR, - "geography_year": CENSUS_BLOCK_GEOGRAPHY_YEAR, - "source_artifact": "census_2020_pl_94_171_state_files", - "support_artifact": "block_probabilities.parquet", - "block_rollup": True, - }, - ) - - def load_target_set(self, query: TargetQuery | None = None): - """Load Census block rollup targets with US SLD ID alias support.""" - if query is None or 
"geographic_ids" not in query.provider_filters: - return super().load_target_set(query) - provider_filters = dict(query.provider_filters) - geo_levels = _requested_census_block_geo_levels( - provider_filters, - default_geo_levels=self.default_geo_levels, - ) - provider_filters["geographic_ids"] = _expand_census_block_geographic_ids( - provider_filters["geographic_ids"], - geo_levels=geo_levels, - ) - return super().load_target_set( - TargetQuery( - period=query.period, - entity=query.entity, - names=query.names, - metadata_filters=query.metadata_filters, - provider_filters=provider_filters, - ) - ) - - -def build_census_block_population_targets( - block_probabilities: pd.DataFrame, - *, - geo_levels: Iterable[str] = DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS, - geographic_ids: Iterable[str] | None = None, - period: int = CENSUS_BLOCK_TARGET_PERIOD, -) -> list[TargetSpec]: - """Roll block-level Census population counts to canonical target specs.""" - requested_geo_levels = as_string_tuple(geo_levels) - resolved_geo_levels = ( - CENSUS_BLOCK_POPULATION_GEO_LEVELS - if requested_geo_levels == ("all",) - else requested_geo_levels - ) - return build_tabular_rollup_targets( - _prepare_block_probabilities(block_probabilities), - rollups=CENSUS_BLOCK_POPULATION_ROLLUPS, - value_column="population", - variable=CENSUS_BLOCK_POPULATION_VARIABLE, - entity=EntityType.PERSON, - aggregation=TargetAggregation.COUNT, - period=period, - source=CENSUS_BLOCK_POPULATION_SOURCE, - units=CENSUS_BLOCK_POPULATION_UNITS, - geo_levels=resolved_geo_levels, - geographic_ids=_expand_census_block_geographic_ids( - geographic_ids, - geo_levels=resolved_geo_levels, - ), - min_value=0.0, - normalize_geographic_id=_normalize_census_block_geographic_id, - base_metadata={ - "source_year": CENSUS_BLOCK_SOURCE_YEAR, - "geography_year": CENSUS_BLOCK_GEOGRAPHY_YEAR, - "source_artifact": "census_2020_pl_94_171_state_files", - "support_artifact": "block_probabilities.parquet", - "block_rollup": True, - }, - 
) - - -def _prepare_block_probabilities(block_probabilities: pd.DataFrame) -> pd.DataFrame: - if "population" not in block_probabilities.columns: - raise ValueError("Block probabilities must include a population column") - blocks = block_probabilities.copy() - blocks["population"] = pd.to_numeric(blocks["population"], errors="coerce") - if "state_fips" in blocks.columns: - blocks["state_fips"] = _zero_pad_series(blocks["state_fips"], 2) - if "county_fips" in blocks.columns: - blocks["county_fips"] = _zero_pad_series(blocks["county_fips"], 5) - elif {"state_fips", "county"}.issubset(blocks.columns): - blocks["county_fips"] = blocks["state_fips"] + _zero_pad_series( - blocks["county"], 3 - ) - if "tract_geoid" not in blocks.columns and { - "state_fips", - "county", - "tract", - }.issubset(blocks.columns): - blocks["tract_geoid"] = ( - blocks["state_fips"] - + _zero_pad_series(blocks["county"], 3) - + _zero_pad_series(blocks["tract"], 6) - ) - if "sldu_id" in blocks.columns: - blocks["sldu_id"] = blocks["sldu_id"].map( - lambda value: ( - normalize_state_legislative_district_id( - value, - chamber="upper", - ) - or "" - ) - ) - if "sldl_id" in blocks.columns: - blocks["sldl_id"] = blocks["sldl_id"].map( - lambda value: ( - normalize_state_legislative_district_id( - value, - chamber="lower", - ) - or "" - ) - ) - for column in ( - "geoid", - "tract_geoid", - "cd_id", - "cbsa_code", - "spm_metro_area", - ): - if column in blocks.columns: - blocks[column] = blocks[column].map(_normalize_geographic_id) - return blocks - - -def _zero_pad_series(values: pd.Series, width: int) -> pd.Series: - text = values.astype("string").str.strip() - numeric = pd.to_numeric(text, errors="coerce") - numeric_text = numeric.round().astype("Int64").astype("string").str.zfill(width) - return text.where(numeric.isna(), numeric_text).str.zfill(width) - - -def _normalize_geographic_id(value: Any) -> str: - if pd.isna(value): - return "" - text = str(value).strip() - if not text: - return "" - if 
text.endswith(".0") and text[:-2].isdigit(): - return text[:-2] - return text - - -def _normalize_census_block_geographic_id(value: Any) -> str: - raw = "" if pd.isna(value) else str(value).strip() - normalized_sld = normalize_state_legislative_district_id(value) - if normalized_sld is not None and normalized_sld != raw: - return normalized_sld - return _normalize_geographic_id(value) - - -def _requested_census_block_geo_levels( - provider_filters: dict[str, Any], - *, - default_geo_levels: Iterable[str], -) -> tuple[str, ...]: - if "geo_levels" in provider_filters: - requested = as_string_tuple(provider_filters["geo_levels"]) - elif "geographic_levels" in provider_filters: - requested = as_string_tuple(provider_filters["geographic_levels"]) - else: - requested = tuple(default_geo_levels) - return ( - tuple(CENSUS_BLOCK_POPULATION_ROLLUPS) if requested == ("all",) else requested - ) - - -def _expand_census_block_geographic_ids( - geographic_ids: Iterable[str] | Any | None, - *, - geo_levels: Iterable[str], -) -> tuple[str, ...] 
| None: - if geographic_ids is None: - return None - levels = set(as_string_tuple(geo_levels)) - include_upper = "sldu" in levels - include_lower = "sldl" in levels - expanded: list[str] = [] - for value in as_string_tuple(geographic_ids): - normalized = _normalize_census_block_geographic_id(value) - if normalized: - expanded.append(normalized) - if include_upper: - upper = normalize_state_legislative_district_id(value, chamber="upper") - if upper: - expanded.append(upper) - if include_lower: - lower = normalize_state_legislative_district_id(value, chamber="lower") - if lower: - expanded.append(lower) - return tuple(dict.fromkeys(expanded)) - - -__all__ = [ - "CENSUS_BLOCK_GEOGRAPHY_YEAR", - "CENSUS_BLOCK_POPULATION_GEO_LEVELS", - "CENSUS_BLOCK_POPULATION_ROLLUPS", - "CENSUS_BLOCK_POPULATION_SOURCE", - "CENSUS_BLOCK_POPULATION_UNITS", - "CENSUS_BLOCK_POPULATION_VARIABLE", - "CENSUS_BLOCK_SOURCE_YEAR", - "CENSUS_BLOCK_TARGET_PERIOD", - "DEFAULT_CENSUS_BLOCK_POPULATION_GEO_LEVELS", - "CensusBlockPopulationRollup", - "CensusBlockPopulationTargetProvider", - "build_census_block_population_targets", -] diff --git a/src/microplex_us/targets/rac_mapping.py b/src/microplex_us/targets/rac_mapping.py deleted file mode 100644 index 3e795bef..00000000 --- a/src/microplex_us/targets/rac_mapping.py +++ /dev/null @@ -1,444 +0,0 @@ -""" -RAC Variable Mapping - -Maps calibration target variables to PolicyEngine RAC (statute) definitions. -Enables validation of microdata against encoded tax law. 
-""" - -from dataclasses import dataclass - - -@dataclass -class RACVariable: - """A variable defined in PolicyEngine RAC.""" - name: str - statute: str # e.g., "26/62" for IRC Section 62 - description: str - entity: str # Person, TaxUnit, Household - dtype: str # Money, Rate, Boolean, Integer - period: str # Year, Month - - -# Map from target variable names to RAC definitions -# Based on policyengine-us/statute structure -RAC_VARIABLE_MAP: dict[str, RACVariable] = { - # Income (IRC Section 61 - Gross Income) - "adjusted_gross_income": RACVariable( - name="adjusted_gross_income", - statute="26/62", - description="Adjusted Gross Income (AGI) per IRC Section 62", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "gross_income": RACVariable( - name="gross_income", - statute="26/61", - description="Gross income per IRC Section 61", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "employment_income": RACVariable( - name="employment_income", - statute="26/61/a/1", - description="Compensation for services (wages, salaries, tips)", - entity="Person", - dtype="Money", - period="Year", - ), - "self_employment_income": RACVariable( - name="self_employment_income", - statute="26/1402", - description="Net earnings from self-employment", - entity="Person", - dtype="Money", - period="Year", - ), - "interest_income": RACVariable( - name="interest_income", - statute="26/61/a/4", - description="Interest income", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "dividend_income": RACVariable( - name="dividend_income", - statute="26/61/a/7", - description="Dividends (ordinary and qualified)", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "qualified_dividend_income": RACVariable( - name="qualified_dividend_income", - statute="26/1/h/11", - description="Qualified dividends taxed at capital gains rates", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "rental_income": RACVariable( - name="rental_income", - 
statute="26/61/a/5", - description="Rents and royalties", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "capital_gains": RACVariable( - name="capital_gains", - statute="26/1222", - description="Net capital gain", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "social_security_income": RACVariable( - name="social_security_income", - statute="26/86", - description="Social security benefits (portion taxable)", - entity="Person", - dtype="Money", - period="Year", - ), - "pension_income": RACVariable( - name="pension_income", - statute="26/72", - description="Annuities and pension distributions", - entity="Person", - dtype="Money", - period="Year", - ), - "partnership_s_corp_income": RACVariable( - name="partnership_s_corp_income", - statute="26/702", - description="Partnership and S-corporation income", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "unemployment_compensation": RACVariable( - name="unemployment_compensation", - statute="26/85", - description="Unemployment compensation", - entity="Person", - dtype="Money", - period="Year", - ), - - # Deductions (IRC Section 63) - "standard_deduction": RACVariable( - name="standard_deduction", - statute="26/63/c", - description="Standard deduction amount", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "itemized_deductions": RACVariable( - name="itemized_deductions", - statute="26/63/d", - description="Total itemized deductions", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "charitable_deduction": RACVariable( - name="charitable_deduction", - statute="26/170", - description="Charitable contribution deduction", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "salt_deduction": RACVariable( - name="salt_deduction", - statute="26/164", - description="State and local tax deduction", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "mortgage_interest_deduction": RACVariable( - name="mortgage_interest_deduction", - 
statute="26/163/h", - description="Home mortgage interest deduction", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "medical_expense_deduction": RACVariable( - name="medical_expense_deduction", - statute="26/213", - description="Medical and dental expense deduction", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "qbi_deduction": RACVariable( - name="qbi_deduction", - statute="26/199A", - description="Qualified business income deduction", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - - # Tax computation (IRC Sections 1, 55) - "taxable_income": RACVariable( - name="taxable_income", - statute="26/63", - description="Taxable income (AGI minus deductions)", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "income_tax_before_credits": RACVariable( - name="income_tax_before_credits", - statute="26/1", - description="Regular tax on taxable income", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "alternative_minimum_tax": RACVariable( - name="alternative_minimum_tax", - statute="26/55", - description="Alternative minimum tax", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - - # Credits (IRC Sections 21-54) - "earned_income_credit": RACVariable( - name="earned_income_credit", - statute="26/32", - description="Earned Income Tax Credit (EITC)", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "child_tax_credit": RACVariable( - name="child_tax_credit", - statute="26/24", - description="Child Tax Credit (CTC)", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "additional_child_tax_credit": RACVariable( - name="additional_child_tax_credit", - statute="26/24/h", - description="Additional (refundable) Child Tax Credit", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "child_care_credit": RACVariable( - name="child_care_credit", - statute="26/21", - description="Child and Dependent Care Credit", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - 
"education_credit": RACVariable( - name="education_credit", - statute="26/25A", - description="American Opportunity and Lifetime Learning Credits", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - "premium_tax_credit": RACVariable( - name="premium_tax_credit", - statute="26/36B", - description="Premium Tax Credit (ACA)", - entity="TaxUnit", - dtype="Money", - period="Year", - ), - - # Benefits (Title 7 - SNAP, Title 42 - SSI/Medicaid) - "snap_benefit": RACVariable( - name="snap_benefit", - statute="7/2017", - description="SNAP (food stamps) benefit amount", - entity="Household", - dtype="Money", - period="Month", - ), - "medicaid_eligible": RACVariable( - name="medicaid_eligible", - statute="42/1396a", - description="Medicaid eligibility", - entity="Person", - dtype="Boolean", - period="Year", - ), - "ssi_benefit": RACVariable( - name="ssi_benefit", - statute="42/1382", - description="Supplemental Security Income", - entity="Person", - dtype="Money", - period="Month", - ), - "tanf_benefit": RACVariable( - name="tanf_benefit", - statute="42/601", - description="TANF cash assistance", - entity="Household", - dtype="Money", - period="Month", - ), - "housing_subsidy": RACVariable( - name="housing_subsidy", - statute="42/1437f", - description="Section 8 housing assistance", - entity="Household", - dtype="Money", - period="Month", - ), - "wic": RACVariable( - name="wic", - statute="42/1786", - description="Women, Infants, and Children program", - entity="Person", - dtype="Money", - period="Month", - ), - "school_lunch": RACVariable( - name="school_lunch", - statute="42/1758", - description="National School Lunch Program", - entity="Person", - dtype="Money", - period="Year", - ), - "liheap": RACVariable( - name="liheap", - statute="42/8621", - description="Low Income Home Energy Assistance Program", - entity="Household", - dtype="Money", - period="Year", - ), - "ccdf": RACVariable( - name="ccdf", - statute="42/9858", - description="Child Care and Development 
Fund", - entity="Person", - dtype="Money", - period="Month", - ), - - # Demographics (not statutory but needed for calibration) - "age": RACVariable( - name="age", - statute=None, - description="Age in years", - entity="Person", - dtype="Integer", - period="Year", - ), - "is_tax_filer": RACVariable( - name="is_tax_filer", - statute="26/6012", - description="Required to file tax return", - entity="TaxUnit", - dtype="Boolean", - period="Year", - ), - "filing_status": RACVariable( - name="filing_status", - statute="26/1", - description="Tax filing status", - entity="TaxUnit", - dtype="Categorical", - period="Year", - ), -} - - -# Map from PolicyEngine variable names to our RAC variables -POLICYENGINE_TO_RAC: dict[str, str] = { - "adjusted_gross_income": "adjusted_gross_income", - "irs_employment_income": "employment_income", - "self_employment_income": "self_employment_income", - "taxable_interest_income": "interest_income", - "non_qualified_dividend_income": "dividend_income", - "qualified_dividend_income": "qualified_dividend_income", - "rental_income": "rental_income", - "loss_limited_net_capital_gains": "capital_gains", - "social_security": "social_security_income", - "pension_income": "pension_income", - "partnership_s_corp_income": "partnership_s_corp_income", - "unemployment_compensation": "unemployment_compensation", - "charitable_deduction": "charitable_deduction", - "salt_deduction": "salt_deduction", - "interest_deduction": "mortgage_interest_deduction", - "medical_expense_deduction": "medical_expense_deduction", - "qualified_business_income_deduction": "qbi_deduction", - "taxable_income": "taxable_income", - "income_tax_before_credits": "income_tax_before_credits", - "eitc": "earned_income_credit", - "ctc": "child_tax_credit", - "refundable_ctc": "additional_child_tax_credit", - "snap": "snap_benefit", - "is_medicaid_eligible": "medicaid_eligible", - "ssi": "ssi_benefit", - "tanf": "tanf_benefit", -} - - -# Map from microdata column names (CPS/PUF) to RAC 
variables -MICRODATA_TO_RAC: dict[str, str] = { - # CPS columns - "wage_income": "employment_income", - "self_employment_income": "self_employment_income", - "interest_income": "interest_income", - "dividend_income": "dividend_income", - "rental_income": "rental_income", - "social_security_income": "social_security_income", - "unemployment_compensation": "unemployment_compensation", - "adjusted_gross_income": "adjusted_gross_income", - "total_income": "gross_income", - "head_age": "age", - - # PUF columns (E-codes) - "E00100": "adjusted_gross_income", - "E00200": "employment_income", - "E00300": "interest_income", - "E00600": "dividend_income", - "E00650": "qualified_dividend_income", - "E00900": "self_employment_income", - "E01000": "capital_gains", - "E01500": "pension_income", - "E02300": "unemployment_compensation", - "E02400": "social_security_income", -} - - -def get_rac_for_target(target_name: str) -> RACVariable | None: - """Get RAC variable definition for a target name.""" - return RAC_VARIABLE_MAP.get(target_name) - - -def get_rac_for_pe_variable(pe_variable: str) -> RACVariable | None: - """Get RAC variable for a PolicyEngine variable name.""" - rac_name = POLICYENGINE_TO_RAC.get(pe_variable) - if rac_name: - return RAC_VARIABLE_MAP.get(rac_name) - return None - - -def get_rac_for_microdata_column(column: str) -> RACVariable | None: - """Get RAC variable for a microdata column name.""" - rac_name = MICRODATA_TO_RAC.get(column) - if rac_name: - return RAC_VARIABLE_MAP.get(rac_name) - return None diff --git a/src/microplex_us/targets_database.py b/src/microplex_us/targets_database.py deleted file mode 100644 index 38b10135..00000000 --- a/src/microplex_us/targets_database.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Legacy US calibration targets database models. - -This module is US-specific and remains separate from the canonical -`microplex.targets` abstractions used for cross-country target specs. 
-""" - -from dataclasses import dataclass, field -from enum import Enum - -import numpy as np -import pandas as pd - - -class TargetCategory(Enum): - """Categories of legacy US calibration targets.""" - - AGI_DISTRIBUTION = "agi_distribution" - INCOME_SOURCES = "income_sources" - DEDUCTIONS = "deductions" - TAX_LIABILITY = "tax_liability" - EITC = "eitc" - CTC = "ctc" - ACTC = "actc" - OTHER_CREDITS = "other_credits" - SNAP = "snap" - MEDICAID = "medicaid" - HOUSING = "housing" - SSI = "ssi" - TANF = "tanf" - UNEMPLOYMENT = "unemployment" - POPULATION = "population" - HOUSEHOLD_STRUCTURE = "household_structure" - AGE_DISTRIBUTION = "age_distribution" - EMPLOYMENT = "employment" - - -@dataclass -class Target: - """A legacy US calibration target.""" - - name: str - category: TargetCategory - value: float - year: int - source: str - source_url: str | None = None - geography: str = "US" - state_fips: str | None = None - filing_status: str | None = None - agi_lower: float = -np.inf - agi_upper: float = np.inf - is_count: bool = True - is_taxable_only: bool = False - rac_variable: str | None = None - rac_statute: str | None = None - microdata_column: str | None = None - notes: str | None = None - last_updated: str | None = None - - -@dataclass -class TargetsDatabase: - """Database of legacy US calibration targets.""" - - targets: list[Target] = field(default_factory=list) - _by_category: dict[TargetCategory, list[Target]] = field(default_factory=dict) - _by_geography: dict[str, list[Target]] = field(default_factory=dict) - - def add(self, target: Target): - self.targets.append(target) - if target.category not in self._by_category: - self._by_category[target.category] = [] - self._by_category[target.category].append(target) - if target.geography not in self._by_geography: - self._by_geography[target.geography] = [] - self._by_geography[target.geography].append(target) - - def add_many(self, targets: list[Target]): - for target in targets: - self.add(target) - - def 
get_by_category(self, category: TargetCategory) -> list[Target]: - return self._by_category.get(category, []) - - def get_by_geography(self, geography: str) -> list[Target]: - return self._by_geography.get(geography, []) - - def get_national(self) -> list[Target]: - return self.get_by_geography("US") - - def get_state(self, state_fips: str) -> list[Target]: - return [target for target in self.targets if target.state_fips == state_fips] - - def get_with_rac_mapping(self) -> list[Target]: - return [target for target in self.targets if target.rac_variable is not None] - - def to_dataframe(self) -> pd.DataFrame: - rows = [] - for target in self.targets: - rows.append( - { - "name": target.name, - "category": target.category.value, - "value": target.value, - "year": target.year, - "source": target.source, - "geography": target.geography, - "state_fips": target.state_fips, - "filing_status": target.filing_status, - "agi_lower": target.agi_lower, - "agi_upper": target.agi_upper, - "is_count": target.is_count, - "rac_variable": target.rac_variable, - "rac_statute": target.rac_statute, - "microdata_column": target.microdata_column, - } - ) - return pd.DataFrame(rows) - - def to_calibration_format( - self, - geography: str = "US", - year: int = 2021, - ) -> tuple[dict[str, dict], dict[str, float]]: - marginal_targets: dict[str, dict] = {} - continuous_targets: dict[str, float] = {} - - for target in self.targets: - if target.geography != geography or target.year != year: - continue - if target.microdata_column is None: - continue - if target.is_count: - variable = target.microdata_column - if variable not in marginal_targets: - marginal_targets[variable] = {} - if target.agi_lower != -np.inf or target.agi_upper != np.inf: - category = f"{target.agi_lower:.0f}_to_{target.agi_upper:.0f}" - else: - category = "all" - marginal_targets[variable][category] = target.value - else: - continuous_targets[target.microdata_column] = target.value - - return marginal_targets, 
continuous_targets - - def compare_to_policyengine(self, pe_targets: pd.DataFrame) -> pd.DataFrame: - our_df = self.to_dataframe() - comparison = our_df.merge( - pe_targets, - left_on=["name", "year"], - right_on=["Variable", "Year"], - how="outer", - suffixes=("_policyengine", "_pe"), - ) - comparison["difference"] = comparison["value"] - comparison["Value"] - comparison["pct_difference"] = comparison["difference"] / comparison["Value"] * 100 - return comparison - - def coverage_summary(self) -> dict[str, int]: - return { - category.value: len(self.get_by_category(category)) - for category in TargetCategory - } - - def __len__(self) -> int: - return len(self.targets) - - def __repr__(self) -> str: - coverage = self.coverage_summary() - non_zero = {key: value for key, value in coverage.items() if value > 0} - return f"TargetsDatabase({len(self)} targets across {len(non_zero)} categories)" diff --git a/src/microplex_us/unified_calibration.py b/src/microplex_us/unified_calibration.py deleted file mode 100644 index 7b57e753..00000000 --- a/src/microplex_us/unified_calibration.py +++ /dev/null @@ -1,291 +0,0 @@ -"""Unified multi-target calibration for PE parity.""" - -from dataclasses import dataclass - -import numpy as np -import pandas as pd - -from .pe_targets import PETargets - - -@dataclass -class CalibrationTarget: - """A calibration target.""" - name: str - target_value: float - column: str | None = None # Column to sum for this target - filter_col: str | None = None # Column to filter on - filter_val: str | None = None # Value to filter for - is_count: bool = False # If True, count rows instead of sum - - -class UnifiedCalibrator: - """Calibrate synthetic population to multiple target types. 
- - Supports: - - Geographic targets (CD, state, SLDU) - - Income totals (IRS SOI) - - Benefit program participation/spending - - Demographic distributions - """ - - def __init__( - self, - geographic_targets: dict[str, float] | None = None, - income_targets: bool = True, - benefit_targets: bool = True, - population_targets: bool = True, - ): - """Initialize unified calibrator. - - Args: - geographic_targets: Dict of geography_id -> population target - income_targets: Include IRS SOI income totals - benefit_targets: Include benefit program targets - population_targets: Include Census population targets - """ - self.geographic_targets = geographic_targets or {} - self.include_income = income_targets - self.include_benefits = benefit_targets - self.include_population = population_targets - - self._pe_targets = None - self._all_targets = None - - def _load_pe_targets(self) -> pd.DataFrame: - """Load PolicyEngine targets.""" - if self._pe_targets is None: - pe = PETargets() - self._pe_targets = pe.load_all() - return self._pe_targets - - def build_target_matrix( - self, - df: pd.DataFrame, - weight_col: str = 'weight' - ) -> tuple[np.ndarray, np.ndarray, list[str]]: - """Build design matrix and target vector for calibration. - - Args: - df: Synthetic population DataFrame - weight_col: Name of weight column - - Returns: - Tuple of (design_matrix, target_vector, target_names) - """ - targets = [] - target_names = [] - - n = len(df) - design_rows = [] - - # 1. Geographic targets - if self.geographic_targets: - # Determine geography column - geo_col = None - for col in ['cd_geoid', 'state_fips', 'sldu_geoid', 'sldl_geoid']: - if col in df.columns: - geo_col = col - break - - if geo_col: - for geo_id, target in self.geographic_targets.items(): - # Create indicator vector - indicator = (df[geo_col] == geo_id).astype(float).values - design_rows.append(indicator) - targets.append(target) - target_names.append(f"geo_{geo_id}") - - # 2. 
Income targets from PE - if self.include_income: - pe_df = self._load_pe_targets() - income_df = pe_df[pe_df['category'].str.contains('irs.soi')] - income_national = income_df[income_df['geography'] == 'national'] - - # Map PE target names to our column names - income_map = { - 'employment_income': 'employment_income', - 'self_employment_income': 'self_employment_income', - 'social_security': 'social_security', - 'dividend_income': 'dividend_income', - 'interest_income': 'interest_income', - 'rental_income': 'rental_income', - 'pension_income': 'pension_income', - 'taxable_pension_income': 'taxable_pension_income', - 'ssi': 'ssi', - 'unemployment_compensation': 'unemployment_compensation', - 'long_term_capital_gains': 'long_term_capital_gains', - 'short_term_capital_gains': 'short_term_capital_gains', - 'qualified_dividend_income': 'dividend_income', # Approximate - 'farm_income': 'farm_income', - 'alimony_income': 'alimony_income', - } - - for _, row in income_national.iterrows(): - pe_name = row['name'] - target = row['value'] - - if pe_name in income_map: - col_name = income_map[pe_name] - if col_name in df.columns: - # Use income values directly (will be multiplied by weights) - design_rows.append(df[col_name].fillna(0).values) - targets.append(target) - target_names.append(f"income_{pe_name}") - - # 3. 
Benefit targets from PE - if self.include_benefits: - pe_df = self._load_pe_targets() - - benefit_map = { - 'snap': ('snap', 'gov.cbo'), # CBO SNAP spending - 'ssi': ('ssi', 'gov.cbo'), - 'eitc': ('eitc', 'gov.treasury'), - } - - for var_name, (col_name, cat_prefix) in benefit_map.items(): - if col_name in df.columns: - # Find matching PE target - matching = pe_df[ - (pe_df['category'].str.startswith(cat_prefix)) & - (pe_df['name'].str.lower() == var_name) & - (pe_df['geography'] == 'national') - ] - - if not matching.empty: - target = matching.iloc[0]['value'] - design_rows.append(df[col_name].fillna(0).values) - targets.append(target) - target_names.append(f"benefit_{var_name}") - - # 4. Population targets - if self.include_population: - pe_df = self._load_pe_targets() - - # Total population - pop_row = pe_df[ - (pe_df['category'] == 'gov.census.populations') & - (pe_df['name'] == 'total') - ] - if not pop_row.empty: - # Just count people (weight of 1 each) - design_rows.append(np.ones(n)) - targets.append(pop_row.iloc[0]['value']) - target_names.append("population_total") - - # Convert to arrays - if not design_rows: - raise ValueError("No targets configured") - - design_matrix = np.column_stack(design_rows) - target_vector = np.array(targets) - - return design_matrix, target_vector, target_names - - def calibrate( - self, - df: pd.DataFrame, - weight_col: str = 'weight', - max_iter: int = 100, - tol: float = 1e-6, - bounds: tuple[float, float] = (0.1, 10.0) - ) -> pd.DataFrame: - """Calibrate weights using iterative proportional fitting. 
- - Args: - df: Synthetic population DataFrame - weight_col: Name of weight column - max_iter: Maximum iterations - tol: Convergence tolerance - bounds: (min_factor, max_factor) bounds on weight adjustments - - Returns: - DataFrame with calibrated weights - """ - df = df.copy() - - # Build target matrix - X, targets, names = self.build_target_matrix(df, weight_col) - - n_samples, n_targets = X.shape - print(f"Calibrating {n_samples:,} samples to {n_targets} targets") - - # Initialize weights - if weight_col in df.columns: - weights = df[weight_col].values.copy() - else: - weights = np.ones(n_samples) - - # IPF iteration - for iteration in range(max_iter): - old_weights = weights.copy() - - for j in range(n_targets): - # Current weighted sum for target j - current = np.sum(weights * X[:, j]) - - if current > 0: - # Adjustment factor - factor = targets[j] / current - - # Apply bounded adjustment to relevant samples - mask = X[:, j] > 0 - adjustment = np.clip(factor, bounds[0], bounds[1]) - weights[mask] *= adjustment - - # Check convergence - if np.max(np.abs(weights - old_weights) / (old_weights + 1e-10)) < tol: - print(f"Converged after {iteration + 1} iterations") - break - - # Compute final errors - print(f"\n{'Target':<40} {'Computed':>15} {'Target':>15} {'Error':>10}") - print("-" * 85) - - for j, name in enumerate(names): - computed = np.sum(weights * X[:, j]) - target = targets[j] - error = abs(computed - target) / target * 100 - - if target > 1e9: - comp_str = f"${computed/1e9:.1f}B" - tgt_str = f"${target/1e9:.1f}B" - elif target > 1e6: - comp_str = f"{computed/1e6:.1f}M" - tgt_str = f"{target/1e6:.1f}M" - else: - comp_str = f"{computed:,.0f}" - tgt_str = f"{target:,.0f}" - - print(f"{name:<40} {comp_str:>15} {tgt_str:>15} {error:>9.2f}%") - - df['calibrated_weight'] = weights - return df - - -def calibrate_to_pe_targets( - df: pd.DataFrame, - geo_targets: dict[str, float] | None = None, - include_income: bool = True, - include_benefits: bool = True, - 
**kwargs -) -> pd.DataFrame: - """Convenience function to calibrate to PE targets. - - Args: - df: Synthetic population DataFrame with income/benefit columns - geo_targets: Optional geographic targets (CD population, etc.) - include_income: Include IRS SOI income targets - include_benefits: Include benefit program targets - **kwargs: Additional args passed to calibrate() - - Returns: - DataFrame with calibrated_weight column - """ - calibrator = UnifiedCalibrator( - geographic_targets=geo_targets, - income_targets=include_income, - benefit_targets=include_benefits, - ) - - return calibrator.calibrate(df, **kwargs) diff --git a/src/microplex_us/validation/__init__.py b/src/microplex_us/validation/__init__.py deleted file mode 100644 index 90318010..00000000 --- a/src/microplex_us/validation/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Validation utilities for microplex. - -This module provides tools for validating synthetic microdata against -administrative targets (IRS SOI, SNAP, etc.). 
-""" - -from microplex_us.validation.baseline import ( - BaselineComparison, - MetricComparison, - compute_baseline_comparison, - export_comparison_json, -) -from microplex_us.validation.soi import ( - AGI_BRACKETS, - FILING_STATUSES, - SOITargets, - ValidationResult, - compute_validation_metrics, - load_soi_targets, - validate_against_soi, -) -from microplex_us.validation.soi import ( - get_available_years as get_soi_years, -) - -__all__ = [ - # SOI validation - "AGI_BRACKETS", - "FILING_STATUSES", - "SOITargets", - "get_soi_years", - "load_soi_targets", - "compute_validation_metrics", - "ValidationResult", - "validate_against_soi", - # Baseline comparison - "MetricComparison", - "BaselineComparison", - "compute_baseline_comparison", - "export_comparison_json", -] diff --git a/src/microplex_us/validation/baseline.py b/src/microplex_us/validation/baseline.py deleted file mode 100644 index 4d5ea354..00000000 --- a/src/microplex_us/validation/baseline.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -Baseline comparison of CPS microdata against SOI targets. - -This module quantifies the gaps BEFORE any calibration, establishing -a baseline understanding of where CPS underreports relative to -administrative tax data. 
- -Key gaps documented: -- Capital gains: ~$1.2T in SOI, $0 in CPS -- Interest/dividends: underreported ~40% -- High-income returns: underrepresented in CPS -""" - -from dataclasses import dataclass, field -from typing import Any - -import polars as pl - -from microplex_us.data_sources.cps_transform import TransformedDataset -from microplex_us.validation.soi import SOITargets - - -@dataclass -class MetricComparison: - """Comparison of a single metric between CPS and SOI.""" - - name: str - category: str # "aggregate", "by_filing_status", "by_agi_bracket" - cps_value: float - soi_value: float - unit: str # "count", "dollars", "rate" - statute_ref: str | None = None - - @property - def pct_error(self) -> float | None: - """Percentage error: (CPS - SOI) / SOI.""" - if self.soi_value == 0: - return None - return (self.cps_value - self.soi_value) / abs(self.soi_value) - - @property - def abs_error(self) -> float: - """Absolute error.""" - return abs(self.cps_value - self.soi_value) - - def to_dict(self) -> dict[str, Any]: - """Convert to dictionary.""" - return { - "name": self.name, - "category": self.category, - "cps_value": self.cps_value, - "soi_value": self.soi_value, - "pct_error": self.pct_error, - "unit": self.unit, - "statute_ref": self.statute_ref, - } - - -@dataclass -class BaselineComparison: - """Baseline comparison of CPS vs SOI before calibration.""" - - cps_year: int - soi_year: int - metrics: dict[str, MetricComparison] = field(default_factory=dict) - coverage_gaps: list[dict] = field(default_factory=list) - - def summary(self) -> dict[str, Any]: - """Generate summary statistics.""" - errors = [ - abs(m.pct_error) - for m in self.metrics.values() - if m.pct_error is not None - ] - - if not errors: - return { - "n_metrics": 0, - "mean_abs_error": None, - "max_abs_error": None, - "worst_metric": None, - } - - worst = max( - [(k, abs(m.pct_error)) for k, m in self.metrics.items() if m.pct_error is not None], - key=lambda x: x[1], - ) - - return { - 
"n_metrics": len(errors), - "mean_abs_error": sum(errors) / len(errors), - "max_abs_error": max(errors), - "worst_metric": worst[0], - } - - -def compute_baseline_comparison( - transformed: TransformedDataset, - soi_targets: SOITargets, -) -> BaselineComparison: - """ - Compute baseline comparison between CPS and SOI. - - This establishes the gap BEFORE calibration, showing: - - How many returns CPS represents vs SOI - - How much income CPS captures vs SOI - - Where the major shortfalls are - - Args: - transformed: Transformed CPS dataset with tax units - soi_targets: SOI targets for comparison - - Returns: - BaselineComparison with all metric comparisons - """ - comparison = BaselineComparison( - cps_year=transformed.year, - soi_year=soi_targets.year, - ) - - tax_units = transformed.tax_units - - # Total returns (weighted sum of tax units) - cps_returns = float(tax_units["weight"].sum()) - comparison.metrics["total_returns"] = MetricComparison( - name="total_returns", - category="aggregate", - cps_value=cps_returns, - soi_value=float(soi_targets.total_returns), - unit="count", - statute_ref="26 USC 6012 - Returns required", - ) - - # Total AGI - if "agi_proxy" in tax_units.columns: - cps_agi = float((tax_units["weight"] * tax_units["agi_proxy"]).sum()) - comparison.metrics["total_agi"] = MetricComparison( - name="total_agi", - category="aggregate", - cps_value=cps_agi, - soi_value=float(soi_targets.total_agi), - unit="dollars", - statute_ref="26 USC 62(a) - Adjusted gross income defined", - ) - - # Total earned income (only if SOI has wages data) - if "earned_income" in tax_units.columns and soi_targets.total_wages is not None: - cps_earned = float((tax_units["weight"] * tax_units["earned_income"]).sum()) - comparison.metrics["total_earned_income"] = MetricComparison( - name="total_earned_income", - category="aggregate", - cps_value=cps_earned, - soi_value=float(soi_targets.total_wages), - unit="dollars", - statute_ref="26 USC 32(c)(2) - Earned income defined", - 
) - - # Returns by filing status - if "filing_status" in tax_units.columns: - status_counts = ( - tax_units.group_by("filing_status") - .agg(pl.col("weight").sum().alias("count")) - ) - - for status, soi_count in soi_targets.returns_by_filing_status.items(): - status_df = status_counts.filter(pl.col("filing_status") == status) - cps_count = float(status_df["count"][0]) if len(status_df) > 0 else 0 - - comparison.metrics[f"returns_{status}"] = MetricComparison( - name=f"returns_{status}", - category="by_filing_status", - cps_value=cps_count, - soi_value=float(soi_count), - unit="count", - statute_ref="26 USC 1, 2 - Filing status", - ) - - # Document known coverage gaps from the transform - if hasattr(transformed, "coverage_report") and "gaps" in transformed.coverage_report: - comparison.coverage_gaps = transformed.coverage_report["gaps"] - else: - # Default gaps if not in coverage report - comparison.coverage_gaps = _get_default_coverage_gaps() - - return comparison - - -def _get_default_coverage_gaps() -> list[dict]: - """Return default coverage gaps for CPS vs SOI.""" - return [ - { - "variable": "agi_proxy", - "component": "capital_gains", - "statute_ref": "26 USC 1222", - "impact": "high", - "notes": "CPS does not collect capital gains. 
SOI 2021: ~$1.2T.", - }, - { - "variable": "agi_proxy", - "component": "above_line_deductions", - "statute_ref": "26 USC 62(a)(7)", - "impact": "medium", - "notes": "IRA contributions, student loan interest not in CPS.", - }, - { - "variable": "filing_status", - "component": "head_of_household", - "statute_ref": "26 USC 2(b)", - "impact": "medium", - "notes": "CPS married status doesn't map directly to HoH eligibility.", - }, - { - "variable": "interest_income", - "component": "underreporting", - "statute_ref": "26 USC 61(a)(4)", - "impact": "medium", - "notes": "Survey underreporting of interest income ~40%.", - }, - { - "variable": "dividend_income", - "component": "underreporting", - "statute_ref": "26 USC 61(a)(7)", - "impact": "medium", - "notes": "Survey underreporting of dividend income ~40%.", - }, - ] - - -def export_comparison_json(comparison: BaselineComparison) -> dict[str, Any]: - """ - Export comparison to JSON-serializable dictionary. - - Format designed for dashboard visualization. - """ - return { - "cps_year": comparison.cps_year, - "soi_year": comparison.soi_year, - "metrics": [m.to_dict() for m in comparison.metrics.values()], - "summary": comparison.summary(), - "coverage_gaps": comparison.coverage_gaps, - } diff --git a/src/microplex_us/validation/downstream.py b/src/microplex_us/validation/downstream.py deleted file mode 100644 index 19091e92..00000000 --- a/src/microplex_us/validation/downstream.py +++ /dev/null @@ -1,227 +0,0 @@ -"""Downstream tax-benefit aggregate validation (paper reviewer response B2). - -Input-target validation (see ``soi.py``, ``baseline.py``) asks whether -the calibrated synthetic frame's marginal sums match administrative -totals on the *variables the calibrator was told to target*. 
-Downstream validation asks the different, stricter question: when the -calibrated frame is ingested by ``policyengine_us.Microsimulation``, -do the *computed policy outputs* — federal income tax, EITC, CTC, -SNAP, SSI, ACA PTC — match administrative aggregates? - -This module contains: - -- ``DownstreamBenchmark`` record (name, computed, benchmark, unit, source). -- ``DOWNSTREAM_BENCHMARKS_2024`` canonical 2024 benchmark set. Each - record is sourced to an IRS / USDA / SSA / CMS / CBO publication. -- ``compute_downstream_aggregates(dataset_path, period)`` runs the - simulation and returns a dict of variable → weighted sum. -- ``compute_downstream_comparison(aggregates, benchmarks)`` joins - computed values to benchmarks and returns per-variable errors. - -Benchmark numbers are rounded publicly-reported totals; each has a -citation. Updates should be traceable to the cited source. -""" - -from __future__ import annotations - -from collections.abc import Iterable -from dataclasses import dataclass -from pathlib import Path - -import numpy as np - - -@dataclass(frozen=True) -class DownstreamBenchmark: - """One external-benchmark comparison. - - ``benchmark`` is the published external aggregate (e.g. IRS SOI - total EITC disbursed 2024). ``computed`` is the aggregate computed - on the calibrated synthetic frame by ``policyengine_us``. 
- """ - - name: str - computed: float - benchmark: float - unit: str - source: str - - @property - def abs_error(self) -> float: - return self.computed - self.benchmark - - @property - def rel_error(self) -> float | None: - if self.benchmark == 0: - return None - return (self.computed - self.benchmark) / self.benchmark - - def to_dict(self) -> dict[str, object]: - return { - "name": self.name, - "computed": self.computed, - "benchmark": self.benchmark, - "unit": self.unit, - "source": self.source, - "abs_error": self.abs_error, - "rel_error": self.rel_error, - } - - -@dataclass(frozen=True) -class DownstreamBenchmarkSpec: - """A benchmark definition without a computed value attached.""" - - name: str - benchmark: float - unit: str - source: str - - -DOWNSTREAM_BENCHMARKS_2024: tuple[DownstreamBenchmarkSpec, ...] = ( - DownstreamBenchmarkSpec( - name="income_tax", - benchmark=2_400_000_000_000.0, - unit="USD", - source=( - "IRS SOI 2022 total federal individual income tax liability " - "~$2.22T; CBO 2024 projection ~$2.4T" - ), - ), - DownstreamBenchmarkSpec( - name="eitc", - benchmark=64_000_000_000.0, - unit="USD", - source="IRS SOI 2023 EITC disbursed ~$64B (Table 2.5)", - ), - DownstreamBenchmarkSpec( - name="ctc", - benchmark=115_000_000_000.0, - unit="USD", - source=( - "IRS SOI 2023 CTC disbursed ~$115B (pre-OBBBA CTC of $2,000 " - "per qualifying child)" - ), - ), - DownstreamBenchmarkSpec( - name="snap", - benchmark=100_000_000_000.0, - unit="USD", - source="USDA FNS FY2024 SNAP benefits total ~$100B", - ), - DownstreamBenchmarkSpec( - name="ssi", - benchmark=66_000_000_000.0, - unit="USD", - source="SSA SSI Annual Statistical Report 2024 ~$66B total payments", - ), - DownstreamBenchmarkSpec( - name="aca_ptc", - benchmark=60_000_000_000.0, - unit="USD", - source=( - "CMS/IRS ACA Advance Premium Tax Credit & reconciled PTC " - "2024 ~$60B (IRA-enhanced subsidies in effect)" - ), - ), -) - -ENTITY_WEIGHT_VARIABLES: dict[str, str] = { - "household": 
"household_weight", - "person": "person_weight", - "tax_unit": "tax_unit_weight", - "spm_unit": "spm_unit_weight", - "family": "family_weight", - "marital_unit": "marital_unit_weight", -} - - -def compute_downstream_comparison( - aggregates: dict[str, float], - benchmarks: Iterable[DownstreamBenchmarkSpec], -) -> dict[str, DownstreamBenchmark]: - """Join computed aggregates to their external benchmarks. - - Variables in ``aggregates`` without a matching benchmark are - silently omitted — they're either not in the benchmark set or the - caller passed extra diagnostic values. - """ - benchmark_by_name = {spec.name: spec for spec in benchmarks} - result: dict[str, DownstreamBenchmark] = {} - for name, computed in aggregates.items(): - spec = benchmark_by_name.get(name) - if spec is None: - continue - result[name] = DownstreamBenchmark( - name=name, - computed=float(computed), - benchmark=spec.benchmark, - unit=spec.unit, - source=spec.source, - ) - return result - - -def _coerce_simulation_values(values: object) -> np.ndarray: - raw = getattr(values, "values", values) - return np.asarray(raw, dtype=float) - - -def compute_downstream_weighted_aggregate( - simulation: object, - variable: str, - period: int = 2024, -) -> float: - """Compute one entity-weighted downstream aggregate from a Microsimulation.""" - - tax_benefit_system = getattr(simulation, "tax_benefit_system", None) - if tax_benefit_system is None: - raise ValueError("Microsimulation is missing tax_benefit_system metadata") - entity = tax_benefit_system.get_variable(variable).entity - entity_key = getattr(entity, "key", None) - weight_variable = ENTITY_WEIGHT_VARIABLES.get(entity_key) - if weight_variable is None: - raise ValueError( - f"Unsupported entity {entity_key!r} for downstream aggregate {variable!r}" - ) - - values = _coerce_simulation_values(simulation.calculate(variable, period)) - weights = _coerce_simulation_values(simulation.calculate(weight_variable, period)) - if len(values) != len(weights): 
- raise ValueError( - f"Downstream aggregate {variable!r} length {len(values)} does not match " - f"{weight_variable!r} length {len(weights)}" - ) - return float(np.dot(values, weights)) - - -def compute_downstream_aggregates( - dataset_path: str | Path, - period: int = 2024, - variables: Iterable[str] = ( - "income_tax", - "eitc", - "ctc", - "snap", - "ssi", - "aca_ptc", - ), -) -> dict[str, float]: - """Load a PolicyEngine-US dataset and compute weighted sums for ``variables``. - - Returns a dict of variable → weighted aggregate (float). Requires - ``policyengine_us`` to be installed. - """ - # Import lazily so the rest of this module (benchmark records, - # comparison function) stays importable in environments without PE. - from policyengine_us import Microsimulation # noqa: PLC0415 - - simulation = Microsimulation(dataset=str(dataset_path)) - aggregates: dict[str, float] = {} - for variable in variables: - aggregates[variable] = compute_downstream_weighted_aggregate( - simulation, - variable, - period, - ) - return aggregates diff --git a/src/microplex_us/validation/soi.py b/src/microplex_us/validation/soi.py deleted file mode 100644 index 923b10fe..00000000 --- a/src/microplex_us/validation/soi.py +++ /dev/null @@ -1,598 +0,0 @@ -""" -IRS Statistics of Income (SOI) validation targets. - -SOI provides authoritative aggregate statistics on individual income tax returns. -We use these as calibration targets to ensure synthetic microdata matches -published administrative totals. 
- -Data source: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics -""" - -from dataclasses import dataclass -import os -import sqlite3 -from pathlib import Path - -import polars as pl - -# AGI brackets used in SOI tables (in dollars) -# Format: (lower_bound, upper_bound) -AGI_BRACKETS: list[tuple[float, float]] = [ - (float("-inf"), 1), # Under $1 (includes losses) - (1, 5_000), - (5_000, 10_000), - (10_000, 15_000), - (15_000, 20_000), - (20_000, 25_000), - (25_000, 30_000), - (30_000, 40_000), - (40_000, 50_000), - (50_000, 75_000), - (75_000, 100_000), - (100_000, 200_000), - (200_000, 500_000), - (500_000, 1_000_000), - (1_000_000, 1_500_000), - (1_500_000, 2_000_000), - (2_000_000, 5_000_000), - (5_000_000, 10_000_000), - (10_000_000, float("inf")), -] - -# Filing statuses -FILING_STATUSES: list[str] = [ - "single", - "married_joint", - "married_separate", - "head_of_household", - "qualifying_widow", -] - -# SOI data by year (from IRS Table 1.1) -# Source: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-publication-1304-complete-report -_SOI_DATA: dict[int, dict] = { - 2021: { - "total_returns": 153_774_296, - "total_agi": 14_447_858_000_000, # $14.4 trillion - "returns_by_agi_bracket": { - "under_1": 13_276_584, - "1_to_5k": 8_848_458, - "5k_to_10k": 8_844_285, - "10k_to_15k": 9_547_842, - "15k_to_20k": 8_857_890, - "20k_to_25k": 8_146_626, - "25k_to_30k": 7_253_485, - "30k_to_40k": 12_547_123, - "40k_to_50k": 10_347_252, - "50k_to_75k": 18_892_456, - "75k_to_100k": 13_857_425, - "100k_to_200k": 21_758_943, - "200k_to_500k": 8_547_823, - "500k_to_1m": 1_847_234, - "1m_to_1_5m": 478_234, - "1_5m_to_2m": 198_523, - "2m_to_5m": 324_567, - "5m_to_10m": 89_234, - "10m_plus": 57_812, - }, - "agi_by_bracket": { - "under_1": -82_458_000_000, - "1_to_5k": 28_547_000_000, - "5k_to_10k": 66_458_000_000, - "10k_to_15k": 119_547_000_000, - "15k_to_20k": 155_478_000_000, - "20k_to_25k": 183_547_000_000, - "25k_to_30k": 
199_875_000_000, - "30k_to_40k": 437_548_000_000, - "40k_to_50k": 465_478_000_000, - "50k_to_75k": 1_175_478_000_000, - "75k_to_100k": 1_198_547_000_000, - "100k_to_200k": 3_047_856_000_000, - "200k_to_500k": 2_547_896_000_000, - "500k_to_1m": 1_247_856_000_000, - "1m_to_1_5m": 578_965_000_000, - "1_5m_to_2m": 345_678_000_000, - "2m_to_5m": 947_856_000_000, - "5m_to_10m": 612_458_000_000, - "10m_plus": 1_171_148_000_000, - }, - "returns_by_filing_status": { - "single": 76_854_234, - "married_joint": 54_478_234, - "married_separate": 3_547_823, - "head_of_household": 17_847_234, - "qualifying_widow": 1_046_771, - }, - }, - 2020: { - "total_returns": 150_344_285, - "total_agi": 12_534_856_000_000, - "returns_by_agi_bracket": { - "under_1": 14_547_234, - "1_to_5k": 9_234_567, - "5k_to_10k": 9_123_456, - "10k_to_15k": 9_876_543, - "15k_to_20k": 9_234_567, - "20k_to_25k": 8_456_789, - "25k_to_30k": 7_654_321, - "30k_to_40k": 12_876_543, - "40k_to_50k": 10_654_321, - "50k_to_75k": 18_234_567, - "75k_to_100k": 13_234_567, - "100k_to_200k": 19_876_543, - "200k_to_500k": 6_234_567, - "500k_to_1m": 1_234_567, - "1m_to_1_5m": 345_678, - "1_5m_to_2m": 156_789, - "2m_to_5m": 245_678, - "5m_to_10m": 67_890, - "10m_plus": 45_618, - }, - "agi_by_bracket": { - "under_1": -98_765_000_000, - "1_to_5k": 24_567_000_000, - "5k_to_10k": 58_765_000_000, - "10k_to_15k": 110_234_000_000, - "15k_to_20k": 145_678_000_000, - "20k_to_25k": 171_234_000_000, - "25k_to_30k": 187_654_000_000, - "30k_to_40k": 398_765_000_000, - "40k_to_50k": 428_765_000_000, - "50k_to_75k": 1_087_654_000_000, - "75k_to_100k": 1_098_765_000_000, - "100k_to_200k": 2_765_432_000_000, - "200k_to_500k": 1_876_543_000_000, - "500k_to_1m": 834_567_000_000, - "1m_to_1_5m": 423_456_000_000, - "1_5m_to_2m": 271_234_000_000, - "2m_to_5m": 723_456_000_000, - "5m_to_10m": 467_890_000_000, - "10m_plus": 958_622_000_000, - }, - "returns_by_filing_status": { - "single": 75_234_567, - "married_joint": 52_456_789, - 
"married_separate": 3_234_567, - "head_of_household": 17_456_789, - "qualifying_widow": 961_573, - }, - }, -} - - -@dataclass -class SOITargets: - """Container for SOI validation targets.""" - - year: int - total_returns: int - total_agi: int # In dollars - - returns_by_agi_bracket: dict[str, int] - agi_by_bracket: dict[str, int] - returns_by_filing_status: dict[str, int] - - # Optional additional targets - total_wages: int | None = None - total_dividends: int | None = None - total_interest: int | None = None - total_capital_gains: int | None = None - - def is_consistent(self, tolerance: float = 0.01) -> bool: - """Check if targets are internally consistent.""" - # Returns by bracket should sum to total - if self.returns_by_agi_bracket: - bracket_sum = sum(self.returns_by_agi_bracket.values()) - if self.total_returns and abs(bracket_sum - self.total_returns) / self.total_returns > tolerance: - return False - - # AGI by bracket should sum to total - if self.agi_by_bracket: - agi_sum = sum(self.agi_by_bracket.values()) - if self.total_agi and abs(agi_sum - self.total_agi) / abs(self.total_agi) > tolerance: - return False - - # Filing status should sum to total - if self.returns_by_filing_status: - status_sum = sum(self.returns_by_filing_status.values()) - if self.total_returns and abs(status_sum - self.total_returns) / self.total_returns > tolerance: - return False - - return True - - def to_dict(self) -> dict: - """Convert to flat dictionary for validation.""" - result = { - "total_returns": self.total_returns, - "total_agi": self.total_agi, - } - - for bracket, count in self.returns_by_agi_bracket.items(): - result[f"returns_{bracket}"] = count - - for bracket, agi in self.agi_by_bracket.items(): - result[f"agi_{bracket}"] = agi - - for status, count in self.returns_by_filing_status.items(): - result[f"returns_{status}"] = count - - return result - - -def get_available_years() -> list[int]: - """Return list of years with SOI data available.""" - return 
sorted(_SOI_DATA.keys()) - - -def _agi_bracket_label(lower: float, upper: float) -> str: - if lower == float("-inf"): - if upper == 1: - return "under_1" - return f"under_{int(upper)}" - if upper == float("inf"): - if lower == 10_000_000: - return "10m_plus" - if lower >= 1_000_000: - lower_m = lower / 1_000_000 - if float(lower_m).is_integer(): - return f"{int(lower_m)}m_plus" - return f"{lower_m:g}m_plus" - if lower >= 1_000: - return f"{int(lower)}_plus" - return f"{int(lower)}_plus" - if lower == float("-inf") and upper == 1: - return "under_1" - if lower == 1 and upper == 5_000: - return "1_to_5k" - if lower == 5_000 and upper == 10_000: - return "5k_to_10k" - if lower == 10_000 and upper == 15_000: - return "10k_to_15k" - if lower == 15_000 and upper == 20_000: - return "15k_to_20k" - if lower == 20_000 and upper == 25_000: - return "20k_to_25k" - if lower == 25_000 and upper == 30_000: - return "25k_to_30k" - if lower == 30_000 and upper == 40_000: - return "30k_to_40k" - if lower == 40_000 and upper == 50_000: - return "40k_to_50k" - if lower == 50_000 and upper == 75_000: - return "50k_to_75k" - if lower == 75_000 and upper == 100_000: - return "75k_to_100k" - if lower == 100_000 and upper == 200_000: - return "100k_to_200k" - if lower == 200_000 and upper == 500_000: - return "200k_to_500k" - if lower == 500_000 and upper == 1_000_000: - return "500k_to_1m" - if lower == 1_000_000 and upper == 1_500_000: - return "1m_to_1_5m" - if lower == 1_500_000 and upper == 2_000_000: - return "1_5m_to_2m" - if lower == 2_000_000 and upper == 5_000_000: - return "2m_to_5m" - if lower == 5_000_000 and upper == 10_000_000: - return "5m_to_10m" - if lower == 10_000_000 and upper == float("inf"): - return "10m_plus" - return f"{int(lower)}_to_{int(upper)}" - - -def _load_soi_targets_from_db(year: int, targets_db: str | Path) -> SOITargets: - db_path = Path(targets_db) - if not db_path.exists(): - raise FileNotFoundError(f"PolicyEngine targets DB not found: {db_path}") - 
- conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - try: - rows = conn.execute( - """ - SELECT - t.target_id, - t.variable, - t.value, - t.period, - t.stratum_id, - sc.constraint_variable, - sc.operation, - sc.value AS constraint_value - FROM targets t - JOIN strata s ON t.stratum_id = s.stratum_id - LEFT JOIN stratum_constraints sc - ON s.stratum_id = sc.stratum_id - WHERE t.active = 1 - AND t.reform_id = 0 - AND t.period <= ? - AND t.variable IN ('adjusted_gross_income', 'person_count', 'tax_unit_count') - """, - (year,), - ).fetchall() - finally: - conn.close() - - targets_by_id: dict[int, dict[str, object]] = {} - for row in rows: - target_id = int(row["target_id"]) - target = targets_by_id.setdefault( - target_id, - { - "target_id": target_id, - "variable": row["variable"], - "value": float(row["value"]), - "period": int(row["period"]), - "stratum_id": int(row["stratum_id"]), - "constraints": [], - }, - ) - if row["constraint_variable"] is not None: - target["constraints"].append( - { - "variable": row["constraint_variable"], - "operation": row["operation"], - "value": row["constraint_value"], - } - ) - - best_targets: dict[tuple[int, str], dict[str, object]] = {} - for target in targets_by_id.values(): - key = (int(target["stratum_id"]), str(target["variable"])) - existing = best_targets.get(key) - if existing is None or int(target["period"]) > int(existing["period"]): - best_targets[key] = target - - total_agi = None - returns_by_agi_bracket: dict[str, int] = {} - agi_by_bracket: dict[str, int] = {} - returns_by_filing_status: dict[str, int] = {} - - for target in best_targets.values(): - constraints = target["constraints"] - if not constraints: - continue - - constraint_vars = {c["variable"] for c in constraints} - lower = float("-inf") - upper = float("inf") - has_agi_bounds = False - for c in constraints: - if c["variable"] == "adjusted_gross_income": - has_agi_bounds = True - value = float(c["value"]) - if c["operation"] in (">=", ">"): 
- lower = max(lower, value) - elif c["operation"] in ("<=", "<"): - upper = min(upper, value) - - has_state = "state_fips" in constraint_vars - has_district = "congressional_district_geoid" in constraint_vars - is_filer_slice = "tax_unit_is_filer" in constraint_vars - - if target["variable"] == "adjusted_gross_income" and not has_agi_bounds: - if is_filer_slice and not has_state and not has_district: - total_agi = int(float(target["value"])) - continue - - if has_district: - continue - - if has_agi_bounds: - label = _agi_bracket_label(lower, upper) - if target["variable"] == "tax_unit_count" and has_state: - returns_by_agi_bracket[label] = returns_by_agi_bracket.get(label, 0) + int( - float(target["value"]) - ) - continue - if target["variable"] == "adjusted_gross_income" and has_state: - agi_by_bracket[label] = agi_by_bracket.get(label, 0) + int( - float(target["value"]) - ) - continue - if target["variable"] == "person_count" and is_filer_slice and not has_state: - returns_by_agi_bracket[label] = int(float(target["value"])) - continue - - if target["variable"] == "tax_unit_count" and "filing_status" in constraint_vars: - status_value = next( - c["value"] for c in constraints if c["variable"] == "filing_status" - ) - returns_by_filing_status[str(status_value)] = int(float(target["value"])) - - total_returns = sum(returns_by_agi_bracket.values()) if returns_by_agi_bracket else 0 - if total_agi is None: - total_agi = sum(agi_by_bracket.values()) if agi_by_bracket else 0 - - return SOITargets( - year=year, - total_returns=total_returns, - total_agi=total_agi, - returns_by_agi_bracket=returns_by_agi_bracket, - agi_by_bracket=agi_by_bracket, - returns_by_filing_status=returns_by_filing_status, - ) - - -def load_soi_targets(year: int, *, targets_db: str | Path | None = None) -> SOITargets: - """ - Load SOI targets for a given year. 
- - Args: - year: Tax year (e.g., 2021) - - Returns: - SOITargets with published aggregates - - Raises: - ValueError: If year not available - """ - resolved_db = targets_db - if resolved_db is None: - resolved_db = os.getenv("MICROPLEX_POLICYENGINE_TARGETS_DB") or os.getenv( - "POLICYENGINE_TARGETS_DB" - ) - - if resolved_db is not None: - return _load_soi_targets_from_db(year, resolved_db) - - if year not in _SOI_DATA: - available = ", ".join(str(y) for y in sorted(_SOI_DATA.keys())) - raise ValueError(f"SOI data for {year} not available. Available years: {available}") - - data = _SOI_DATA[year] - - return SOITargets( - year=year, - total_returns=data["total_returns"], - total_agi=data["total_agi"], - returns_by_agi_bracket=data["returns_by_agi_bracket"], - agi_by_bracket=data["agi_by_bracket"], - returns_by_filing_status=data["returns_by_filing_status"], - ) - - -def compute_validation_metrics( - simulated: dict[str, float], - targets: dict[str, float], - weights: dict[str, float] | None = None, -) -> dict[str, float]: - """ - Compute validation metrics comparing simulated to targets. 
- - Args: - simulated: Simulated aggregate values - targets: Target aggregate values - weights: Optional weights for each metric - - Returns: - Dictionary of error metrics - """ - metrics = {} - errors = [] - weighted_errors = [] - - for key in targets: - if key not in simulated: - continue - - target_val = targets[key] - sim_val = simulated[key] - - if target_val == 0: - if sim_val == 0: - pct_error = 0.0 - else: - pct_error = float("inf") - else: - pct_error = (sim_val - target_val) / abs(target_val) - - metrics[f"{key}_error"] = pct_error - errors.append(abs(pct_error)) - - weight = weights.get(key, 1.0) if weights else 1.0 - weighted_errors.append(abs(pct_error) * weight) - - # Summary statistics - if errors: - metrics["mean_absolute_pct_error"] = sum(errors) / len(errors) - metrics["max_absolute_pct_error"] = max(errors) - - if weighted_errors and weights: - total_weight = sum(weights.get(k, 1.0) for k in targets if k in simulated) - if total_weight > 0: - metrics["weighted_mape"] = sum(weighted_errors) / total_weight - - return metrics - - -@dataclass -class ValidationResult: - """Result of validating microdata against SOI targets.""" - - simulated: dict[str, float] - targets: dict[str, float] - errors: dict[str, float] - year: int - - def summary(self) -> dict: - """Generate summary of validation results.""" - abs_errors = [abs(v) for k, v in self.errors.items() if k.endswith("_error")] - - if not abs_errors: - return {"status": "no_metrics", "pass": False} - - max_error = max(abs_errors) - mean_error = sum(abs_errors) / len(abs_errors) - - # Find worst metric - worst_metric = max( - [(k, abs(v)) for k, v in self.errors.items() if k.endswith("_error")], - key=lambda x: x[1], - )[0].replace("_error", "") - - # Pass if all errors under 5% - threshold = 0.05 - passed = all(e < threshold for e in abs_errors) - - return { - "status": "pass" if passed else "fail", - "pass": passed, - "max_error": max_error, - "mean_error": mean_error, - "worst_metric": 
worst_metric, - "n_metrics": len(abs_errors), - "threshold": threshold, - } - - -def validate_against_soi( - microdata: pl.DataFrame, - targets: SOITargets, - weight_col: str = "weight", - agi_col: str = "agi", - filing_status_col: str = "filing_status", -) -> ValidationResult: - """ - Validate microdata against SOI targets. - - Args: - microdata: DataFrame with individual records - targets: SOI targets to validate against - weight_col: Column name for sample weights - agi_col: Column name for AGI - filing_status_col: Column name for filing status - - Returns: - ValidationResult with simulated values, targets, and errors - """ - simulated = {} - - # Total returns (sum of weights) - simulated["total_returns"] = float(microdata[weight_col].sum()) - - # Total AGI - if agi_col in microdata.columns: - simulated["total_agi"] = float((microdata[weight_col] * microdata[agi_col]).sum()) - - # Returns by filing status - if filing_status_col in microdata.columns: - status_counts = ( - microdata.group_by(filing_status_col) - .agg(pl.col(weight_col).sum().alias("count")) - ) - for row in status_counts.iter_rows(named=True): - status = row[filing_status_col] - simulated[f"returns_{status}"] = float(row["count"]) - - # Compute errors - target_dict = targets.to_dict() - errors = compute_validation_metrics(simulated, target_dict) - - return ValidationResult( - simulated=simulated, - targets=target_dict, - errors=errors, - year=targets.year, - ) diff --git a/src/microplex_us/variables.py b/src/microplex_us/variables.py deleted file mode 100644 index 68fb7f5b..00000000 --- a/src/microplex_us/variables.py +++ /dev/null @@ -1,1121 +0,0 @@ -"""Helpers for working with atomic vs derived variables and donor specs.""" - -from __future__ import annotations - -from collections import defaultdict -from collections.abc import Callable, Iterable, Mapping -from dataclasses import dataclass, field -from enum import Enum - -import numpy as np -import pandas as pd -from microplex.core import 
EntityType -from microplex.core.semantics import ( - FrameSemanticCheck, - FrameSemanticCheckReport, - FrameSemanticTransform, - SemanticTransformStage, - apply_frame_semantic_transforms, - evaluate_frame_semantic_checks, -) - -try: - from microplex.core import SourceVariableCapability -except ImportError: - - @dataclass(frozen=True) - class SourceVariableCapability: - """Compatibility shim for older core branches.""" - - authoritative: bool = True - usable_as_condition: bool = True - notes: str | None = None - - -class DonorMatchStrategy(Enum): - """How donor-generated scores should be mapped back onto donor support.""" - - RANK = "rank" - - -class VariableSupportFamily(Enum): - """Statistical support family for one variable.""" - - CONTINUOUS = "continuous" - SUPPORT_SENSITIVE = "support_sensitive" - BOUNDED_SHARE = "bounded_share" - - -class ConditionScoreMode(Enum): - """How donor condition variables should be scored for one target family.""" - - VALUE_ONLY = "value_only" - VALUE_AND_SUPPORT = "value_and_support" - - -class ProjectionAggregation(Enum): - """How person-native features should be projected onto a group entity.""" - - FIRST = "first" - SUM = "sum" - MAX = "max" - MEAN = "mean" - - -PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS = ( - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", -) -PUF_IRS_TAX_PREFERRED_CONDITION_VARS = ( - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", -) -# Keep PE-aligned PUF tax-leaf conditioning structural by default. -PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS: tuple[str, ...] = () -# Explicit challenger widening for live PUF tax-leaf blocks. These vars are only -# meant for opt-in experiments that retain the PE structural backbone while -# layering in a narrow source-native raw-overlap surface. 
-PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS = ( - "self_employment_income", - "rental_income", - "social_security_retirement", -) -PUF_PENSION_CHALLENGER_SHARED_CONDITION_VARS = ( - "social_security_retirement", - "social_security_disability", - "unemployment_compensation", -) -PUF_PARTNERSHIP_CHALLENGER_SHARED_CONDITION_VARS = ( - "self_employment_income", - "rental_income", - "alimony_income", -) -RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS = ( - "state_fips", - "tenure", - "age", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "income", - "employment_income", - "self_employment_income", - "real_estate_taxes", -) - - -@dataclass(frozen=True) -class DonorImputationBlockSpec: - """Declarative donor-model spec for one imputation block.""" - - model_variables: tuple[str, ...] - restored_variables: tuple[str, ...] - native_entity: EntityType = EntityType.PERSON - condition_entities: tuple[EntityType, ...] = () - match_strategies: Mapping[str, DonorMatchStrategy] = field(default_factory=dict) - prepare_frame: Callable[[pd.DataFrame], pd.DataFrame] | None = None - restore_frame: Callable[[pd.DataFrame], pd.DataFrame] | None = None - - def strategy_for(self, variable_name: str) -> DonorMatchStrategy: - return self.match_strategies.get(variable_name, DonorMatchStrategy.RANK) - - -@dataclass(frozen=True) -class VariableSemanticSpec: - """Declarative semantics for variables that can be derived from an atomic basis.""" - - native_entity: EntityType = EntityType.PERSON - condition_entities: tuple[EntityType, ...] = () - projection_aggregation: ProjectionAggregation = ProjectionAggregation.FIRST - support_family: VariableSupportFamily = VariableSupportFamily.CONTINUOUS - derived_from: tuple[str, ...] = () - donor_match_strategy: DonorMatchStrategy = DonorMatchStrategy.RANK - donor_transform: FrameSemanticTransform | None = None - donor_check: FrameSemanticCheck | None = None - preferred_condition_vars: tuple[str, ...] 
= () - supplemental_shared_condition_vars: tuple[str, ...] = () - challenger_shared_condition_vars: tuple[str, ...] = () - notes: str | None = None - - def is_redundant_given(self, variable_names: Iterable[str]) -> bool: - """Return whether this variable is redundant given the observed variables.""" - if not self.derived_from: - return False - available = set(variable_names) - return set(self.derived_from).issubset(available) - - @property - def condition_score_mode(self) -> ConditionScoreMode: - if self.support_family is VariableSupportFamily.SUPPORT_SENSITIVE: - return ConditionScoreMode.VALUE_AND_SUPPORT - return ConditionScoreMode.VALUE_ONLY - - @property - def allowed_condition_entities(self) -> tuple[EntityType, ...]: - if self.condition_entities: - return self.condition_entities - if self.native_entity is EntityType.PERSON: - record_entity = getattr(EntityType, "RECORD", None) - return tuple(entity for entity in EntityType if entity is not record_entity) - return (EntityType.HOUSEHOLD, self.native_entity) - - -def zero_minor_employment_income(frame: pd.DataFrame) -> pd.DataFrame: - """Enforce zero employment income for minors on donor-integrated seed frames.""" - if "employment_income" not in frame.columns or "age" not in frame.columns: - return frame - ages = pd.to_numeric(frame["age"], errors="coerce") - if ages.isna().all(): - return frame - result = frame.copy() - minor_mask = ages.lt(18).fillna(False) - if not minor_mask.any(): - return result - result["employment_income"] = ( - pd.to_numeric(result["employment_income"], errors="coerce") - .fillna(0.0) - .astype(float) - ) - result.loc[minor_mask, "employment_income"] = 0.0 - return result - - -def suppress_retired_senior_employment_income_without_esi( - frame: pd.DataFrame, -) -> pd.DataFrame: - """Suppress donor-overridden wage income for retired seniors without ESI.""" - required_columns = {"employment_income", "age", "has_esi"} - if not required_columns.issubset(frame.columns): - return frame - ages 
= pd.to_numeric(frame["age"], errors="coerce") - if ages.isna().all(): - return frame - social_security_income = social_security_retirement_compatible_amount(frame) - has_esi = ( - pd.to_numeric(frame["has_esi"], errors="coerce") - .fillna(0.0) - .astype(float) - .gt(0.0) - ) - retired_senior_mask = ( - ages.ge(65).fillna(False) & social_security_income.gt(0.0) & ~has_esi - ) - if not retired_senior_mask.any(): - return frame - result = frame.copy() - result["employment_income"] = ( - pd.to_numeric(result["employment_income"], errors="coerce") - .fillna(0.0) - .astype(float) - ) - result.loc[retired_senior_mask, "employment_income"] = 0.0 - return result - - -def normalize_employment_income_donor_values(frame: pd.DataFrame) -> pd.DataFrame: - """Apply donor-side employment income semantic guards in a stable order.""" - adjusted = normalize_social_security_columns(frame) - adjusted = zero_minor_employment_income(adjusted) - return suppress_retired_senior_employment_income_without_esi(adjusted) - - -def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: - """Return rows where minors still carry positive employment income.""" - if "employment_income" not in frame.columns or "age" not in frame.columns: - return pd.Series(False, index=frame.index, dtype=bool) - ages = pd.to_numeric(frame["age"], errors="coerce") - income = pd.to_numeric(frame["employment_income"], errors="coerce").fillna(0.0) - return ages.lt(18).fillna(False) & income.gt(0.0) - - -VARIABLE_SEMANTIC_SPECS: dict[str, VariableSemanticSpec] = { - "age": VariableSemanticSpec( - projection_aggregation=ProjectionAggregation.MAX, - ), - "income": VariableSemanticSpec( - projection_aggregation=ProjectionAggregation.SUM, - ), - "state_fips": VariableSemanticSpec(native_entity=EntityType.HOUSEHOLD), - "tenure": VariableSemanticSpec(native_entity=EntityType.HOUSEHOLD), - "state": VariableSemanticSpec(native_entity=EntityType.HOUSEHOLD), - "household_vehicles_owned": VariableSemanticSpec( - 
native_entity=EntityType.HOUSEHOLD, - projection_aggregation=ProjectionAggregation.MAX, - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="Household vehicle count from the SIPP asset donor.", - ), - "household_vehicles_value": VariableSemanticSpec( - native_entity=EntityType.HOUSEHOLD, - projection_aggregation=ProjectionAggregation.MAX, - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="Household vehicle value from the SIPP asset donor.", - ), - "dividend_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - derived_from=( - "qualified_dividend_income", - "non_qualified_dividend_income", - ), - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=( - PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ), - notes="Dividend totals are derived from the qualified and non-qualified atomic basis.", - ), - "ordinary_dividend_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - derived_from=( - "qualified_dividend_income", - "non_qualified_dividend_income", - ), - notes="Ordinary dividend totals are derived from the qualified and non-qualified atomic basis.", - ), - "qualified_dividend_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - 
challenger_shared_condition_vars=( - PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ), - ), - "non_qualified_dividend_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=( - PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ), - ), - "taxable_interest_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=( - PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ), - ), - "tax_exempt_interest_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - ), - "taxable_pension_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=PUF_PENSION_CHALLENGER_SHARED_CONDITION_VARS, - ), - "taxable_social_security": 
VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - ), - "state_income_tax_paid": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "real_estate_tax_paid": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "mortgage_interest_paid": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "charitable_cash": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "charitable_noncash": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "student_loan_interest": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - ), - "ira_deduction": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), - "health_savings_account_ald": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - 
EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - ), - "self_employed_health_insurance_ald": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - ), - "self_employed_pension_contribution_ald": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - ), - "qualified_dividend_share": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.BOUNDED_SHARE, - ), - "tax_unit_partnership_s_corp_income": VariableSemanticSpec( - native_entity=EntityType.TAX_UNIT, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=( - PUF_PARTNERSHIP_CHALLENGER_SHARED_CONDITION_VARS - ), - ), - "employment_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - donor_transform=FrameSemanticTransform( - name="normalize_employment_income_donor_values", - required_columns=("employment_income", "age"), - transform_frame=normalize_employment_income_donor_values, - stage=SemanticTransformStage.POST_DONOR_INTEGRATION, - notes=( - "Employment 
income donor overrides should not assign positive wages " - "to minors and should suppress implausible retired-senior wages " - "when retirement Social Security is present without ESI." - ), - ), - donor_check=FrameSemanticCheck( - name="minor_positive_employment_income", - required_columns=("employment_income", "age"), - violation_mask=minor_positive_employment_income_mask, - stage=SemanticTransformStage.POST_DONOR_INTEGRATION, - notes="Minors should not retain positive donor-overridden wage income.", - ), - notes=( - "Employment income donor overrides should respect basic wage support " - "semantics for minors and retired seniors." - ), - ), - "self_employment_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - notes="Self-employment income is signed and must preserve losses.", - ), - "rental_income_positive": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS, - notes=( - "Positive rental-income support should track geography and property-like " - "predictors instead of generic labor-income conditioning." - ), - ), - "rental_income_negative": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS, - notes=( - "Rental-loss support should track geography and property-like " - "predictors instead of generic labor-income conditioning." 
- ), - ), - "partnership_s_corp_income": VariableSemanticSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - challenger_shared_condition_vars=( - PUF_PARTNERSHIP_CHALLENGER_SHARED_CONDITION_VARS - ), - ), - "has_medicaid": VariableSemanticSpec( - projection_aggregation=ProjectionAggregation.MAX, - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="Binary proxy for Medicaid participation on the CPS scaffold.", - ), - "public_assistance": VariableSemanticSpec( - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="Public assistance amounts are sparse and should preserve support.", - ), - "ssi": VariableSemanticSpec( - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="SSI amounts are sparse and should preserve support.", - ), - "social_security": VariableSemanticSpec( - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - notes="Reported Social Security amounts are sparse and support-sensitive.", - ), - "snap": VariableSemanticSpec( - native_entity=EntityType.SPM_UNIT, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.SPM_UNIT, - ), - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - ), -} - -# SCF net-worth component leaves (G1). Person-entity, zero-inflated positive -# magnitudes (debt leaves are stored as positive balances; the -1 sign is -# applied only at net-worth reconciliation, mirroring eCPS -# NET_WORTH_COMPONENT_SIGNS). Imputed from the SCF donor on SCF_PREDICTORS -# (age, is_female, cps_race, is_married, own_children_in_household, -# employment_income, interest_dividend_income, social_security_pension_income). -SCF_NET_WORTH_COMPONENT_LEAVES: tuple[str, ...] 
= ( - "scf_certificates_of_deposit", - "scf_savings_bonds", - "scf_retirement_assets", - "scf_cash_value_life_insurance", - "scf_other_managed_assets", - "scf_other_financial_assets", - "scf_primary_residence_value", - "scf_other_residential_real_estate", - "scf_nonresidential_real_estate_equity", - "scf_business_equity", - "scf_other_nonfinancial_assets", - "scf_mortgage_debt", - "scf_other_residential_debt", - "scf_other_lines_of_credit", - "scf_credit_card_debt", - "scf_vehicle_installment_debt", - "scf_student_loan_debt", - "scf_other_installment_debt", - "scf_other_debt", -) -SCF_COMPONENT_PREFERRED_CONDITION_VARS: tuple[str, ...] = ( - "age", - "is_female", - "cps_race", - "is_married", - "own_children_in_household", - "employment_income", - "interest_dividend_income", - "social_security_pension_income", -) -for _scf_component_leaf in SCF_NET_WORTH_COMPONENT_LEAVES: - VARIABLE_SEMANTIC_SPECS[_scf_component_leaf] = VariableSemanticSpec( - native_entity=EntityType.PERSON, - support_family=VariableSupportFamily.SUPPORT_SENSITIVE, - preferred_condition_vars=SCF_COMPONENT_PREFERRED_CONDITION_VARS, - notes="SCF balance-sheet component leaf; positive magnitude.", - ) - -DIVIDEND_COMPONENT_COLUMNS = ( - "qualified_dividend_income", - "non_qualified_dividend_income", -) -DIVIDEND_TOTAL_COLUMNS = ( - "ordinary_dividend_income", - "dividend_income", -) -DIVIDEND_SHARE_COLUMN = "qualified_dividend_share" -DIVIDEND_COMPOSITION_MODEL_COLUMNS = ( - "dividend_income", - DIVIDEND_SHARE_COLUMN, -) -SOCIAL_SECURITY_COMPONENT_COLUMNS = ( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", -) -SOCIAL_SECURITY_UNCLASSIFIED_COLUMN = "social_security_unclassified" - - -def _nonnegative_series(frame: pd.DataFrame, column: str) -> pd.Series: - if column not in frame.columns: - return pd.Series(0.0, index=frame.index, dtype=float) - return ( - pd.to_numeric(frame[column], errors="coerce") - .fillna(0.0) - 
.clip(lower=0.0) - .astype(float) - ) - - -# Share of a dividend total that is qualified when no observed qualified/ -# non-qualified breakdown is available (e.g. CPS DIV_VAL, which reports only a -# total). Basis: SOI 2015 PUF E00650/E00600 = $204.0B/$260.9B = 0.782 qualified. -# Splitting an unsplit total by this share avoids zeroing -# qualified_dividend_income on every CPS-native dividend row (which previously -# dumped 100% into non-qualified and inverted the national qualified vs -# non-qualified split relative to the SOI targets). -UNSPLIT_DIVIDEND_QUALIFIED_SHARE = 0.78 - - -def normalize_dividend_columns(frame: pd.DataFrame) -> pd.DataFrame: - """Normalize dividends onto an atomic basis, then derive totals.""" - result = frame.copy() - qualified = _nonnegative_series(result, "qualified_dividend_income") - non_qualified = _nonnegative_series(result, "non_qualified_dividend_income") - ordinary_total = _nonnegative_series(result, "ordinary_dividend_income") - dividend_total = _nonnegative_series(result, "dividend_income") - if "ordinary_dividend_income" in result.columns: - total = ordinary_total.where(ordinary_total.ne(0.0), dividend_total) - else: - total = dividend_total - - has_qualified = "qualified_dividend_income" in result.columns - has_non_qualified = "non_qualified_dividend_income" in result.columns - - if has_qualified and has_non_qualified: - component_total = qualified + non_qualified - total_only = component_total.eq(0.0) & total.gt(0.0) - # Allocate an unsplit total by the SOI qualified share rather than - # defaulting the whole amount to non-qualified. 
- qualified = qualified.where( - ~total_only, total * UNSPLIT_DIVIDEND_QUALIFIED_SHARE - ) - non_qualified = non_qualified.where( - ~total_only, total * (1.0 - UNSPLIT_DIVIDEND_QUALIFIED_SHARE) - ) - component_total = qualified + non_qualified - normalized_total = component_total.where(component_total.ne(0.0), total) - elif has_qualified: - normalized_total = np.maximum( - total.to_numpy(dtype=float), qualified.to_numpy(dtype=float) - ) - non_qualified = pd.Series( - normalized_total - qualified.to_numpy(dtype=float), - index=result.index, - dtype=float, - ) - normalized_total = pd.Series(normalized_total, index=result.index, dtype=float) - elif has_non_qualified: - normalized_total = np.maximum( - total.to_numpy(dtype=float), - non_qualified.to_numpy(dtype=float), - ) - qualified = pd.Series( - normalized_total - non_qualified.to_numpy(dtype=float), - index=result.index, - dtype=float, - ) - normalized_total = pd.Series(normalized_total, index=result.index, dtype=float) - else: - normalized_total = total.astype(float) - qualified = normalized_total * UNSPLIT_DIVIDEND_QUALIFIED_SHARE - non_qualified = normalized_total * (1.0 - UNSPLIT_DIVIDEND_QUALIFIED_SHARE) - - result["qualified_dividend_income"] = qualified.astype(float) - result["non_qualified_dividend_income"] = non_qualified.astype(float) - result["ordinary_dividend_income"] = normalized_total.astype(float) - result["dividend_income"] = normalized_total.astype(float) - return result - - -def normalize_social_security_columns(frame: pd.DataFrame) -> pd.DataFrame: - """Normalize Social Security onto an explicit component basis. - - Preserve any observed component columns and store any remaining gross - Social Security residual as an explicit unclassified amount. 
- """ - result = frame.copy() - component_series = { - column: _nonnegative_series(result, column) - for column in SOCIAL_SECURITY_COMPONENT_COLUMNS - } - component_sum = sum( - component_series.values(), start=pd.Series(0.0, index=result.index) - ) - existing_unclassified = _nonnegative_series( - result, SOCIAL_SECURITY_UNCLASSIFIED_COLUMN - ) - - if "social_security" in result.columns: - observed_total = _nonnegative_series(result, "social_security") - else: - observed_total = _nonnegative_series(result, "gross_social_security") - normalized_total = pd.Series( - np.maximum( - observed_total.to_numpy(dtype=float), - (component_sum + existing_unclassified).to_numpy(dtype=float), - ), - index=result.index, - dtype=float, - ) - unclassified = pd.Series( - np.maximum( - normalized_total.to_numpy(dtype=float) - - component_sum.to_numpy(dtype=float), - 0.0, - ), - index=result.index, - dtype=float, - ) - - for column, values in component_series.items(): - result[column] = values.astype(float) - result[SOCIAL_SECURITY_UNCLASSIFIED_COLUMN] = unclassified.astype(float) - result["social_security"] = normalized_total.astype(float) - return result - - -def social_security_retirement_compatible_amount(frame: pd.DataFrame) -> pd.Series: - """Return the PE-compatible retirement component amount. - - PolicyEngine models total Social Security as the sum of the four component - variables. Until we have a better backward allocator, treat any - unclassified residual as retirement at compatibility points. 
- """ - retirement = _nonnegative_series(frame, "social_security_retirement") - unclassified = _nonnegative_series(frame, SOCIAL_SECURITY_UNCLASSIFIED_COLUMN) - return (retirement + unclassified).astype(float) - - -def add_dividend_composition_features(frame: pd.DataFrame) -> pd.DataFrame: - """Add dividend total/share features derived from the atomic basis.""" - result = normalize_dividend_columns(frame) - total = _nonnegative_series(result, "dividend_income") - qualified = _nonnegative_series(result, "qualified_dividend_income") - share_values = np.divide( - qualified.to_numpy(dtype=float), - total.to_numpy(dtype=float), - out=np.zeros(len(result), dtype=float), - where=total.to_numpy(dtype=float) > 0.0, - ) - result[DIVIDEND_SHARE_COLUMN] = pd.Series( - np.clip(share_values, 0.0, 1.0), - index=result.index, - dtype=float, - ) - return result - - -def restore_dividend_components_from_composition(frame: pd.DataFrame) -> pd.DataFrame: - """Reconstruct dividend components from total + qualified share.""" - result = frame.copy() - total = _nonnegative_series(result, "dividend_income") - share = ( - pd.to_numeric(result.get(DIVIDEND_SHARE_COLUMN, 0.0), errors="coerce") - .fillna(0.0) - .clip(lower=0.0, upper=1.0) - .astype(float) - ) - qualified = pd.Series( - total.to_numpy(dtype=float) * share.to_numpy(dtype=float), - index=result.index, - dtype=float, - ) - non_qualified = pd.Series( - total.to_numpy(dtype=float) - qualified.to_numpy(dtype=float), - index=result.index, - dtype=float, - ) - result["qualified_dividend_income"] = qualified - result["non_qualified_dividend_income"] = non_qualified - result["ordinary_dividend_income"] = total - result["dividend_income"] = total - if DIVIDEND_SHARE_COLUMN in result.columns: - result = result.drop(columns=[DIVIDEND_SHARE_COLUMN]) - return result - - -DIVIDEND_DONOR_BLOCK_SPEC = DonorImputationBlockSpec( - native_entity=EntityType.PERSON, - condition_entities=( - EntityType.PERSON, - EntityType.HOUSEHOLD, - 
EntityType.TAX_UNIT, - ), - model_variables=DIVIDEND_COMPOSITION_MODEL_COLUMNS, - restored_variables=DIVIDEND_COMPONENT_COLUMNS, - match_strategies={DIVIDEND_SHARE_COLUMN: DonorMatchStrategy.RANK}, - prepare_frame=add_dividend_composition_features, - restore_frame=restore_dividend_components_from_composition, -) - -DONOR_CHAIN_PRIORITY: tuple[str, ...] = ( - "income", - "employment_income", - "employment_income_before_lsr", - "self_employment_income", - "self_employment_income_before_lsr", - "sstb_self_employment_income", - "sstb_self_employment_income_before_lsr", - "partnership_s_corp_income", - "tax_unit_partnership_s_corp_income", - "rental_income", - "rental_income_positive", - "rental_income_negative", - "farm_income", - "farm_operations_income", - "taxable_interest_income", - "tax_exempt_interest_income", - "dividend_income", - "ordinary_dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - DIVIDEND_SHARE_COLUMN, - "short_term_capital_gains", - "long_term_capital_gains", - "net_capital_gains", - "social_security", - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - "taxable_social_security", - "pension_income", - "taxable_pension_income", - "tax_exempt_pension_income", - "unemployment_compensation", - "taxable_unemployment_compensation", - "health_savings_account_ald", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - "ira_deduction", - "student_loan_interest", - "state_income_tax_paid", - "real_estate_tax_paid", - "mortgage_interest_paid", - "charitable_cash", - "charitable_noncash", -) -_DONOR_CHAIN_PRIORITY_INDEX = { - variable: index for index, variable in enumerate(DONOR_CHAIN_PRIORITY) -} - - -def _donor_chain_sort_key(variable_name: str) -> 
tuple[int, str]: - return (_DONOR_CHAIN_PRIORITY_INDEX.get(variable_name, 10_000), variable_name) - - -def variable_semantic_spec_for(variable_name: str) -> VariableSemanticSpec: - """Return semantic metadata for one variable.""" - return VARIABLE_SEMANTIC_SPECS.get(variable_name, VariableSemanticSpec()) - - -def score_donor_condition_var( - condition_series: pd.Series, - target_series_list: Iterable[pd.Series], - *, - score_modes: Iterable[ConditionScoreMode], -) -> float: - """Score one shared conditioning variable against one donor target block.""" - condition = pd.to_numeric( - condition_series, - errors="coerce", - ).replace([np.inf, -np.inf], np.nan) - if condition.dropna().nunique() <= 1: - return 0.0 - - include_support = ConditionScoreMode.VALUE_AND_SUPPORT in set(score_modes) - best_score = 0.0 - for target_series in target_series_list: - target = pd.to_numeric( - target_series, - errors="coerce", - ).replace([np.inf, -np.inf], np.nan) - aligned = pd.concat( - [condition.rename("condition"), target.rename("target")], - axis=1, - ).dropna() - if len(aligned) < 3 or aligned["target"].nunique() <= 1: - continue - - value_correlation = aligned["condition"].corr( - aligned["target"], - method="spearman", - ) - if pd.notna(value_correlation): - best_score = max(best_score, abs(float(value_correlation))) - - if not include_support: - continue - support = (aligned["target"].abs() > 0).astype(float) - if 0.0 < float(support.mean()) < 1.0: - support_correlation = aligned["condition"].corr( - support, - method="spearman", - ) - if pd.notna(support_correlation): - best_score = max(best_score, abs(float(support_correlation))) - - return best_score - - -def is_condition_var_compatible_with_entity( - condition_variable: str, - *, - target_entity: EntityType, -) -> bool: - """Return whether a condition variable is semantically compatible with a target entity.""" - condition_entity = variable_semantic_spec_for(condition_variable).native_entity - allowed_entities = 
VariableSemanticSpec( - native_entity=target_entity - ).allowed_condition_entities - return condition_entity in set(allowed_entities) - - -def resolve_condition_entities_for_targets( - target_variables: Iterable[str], -) -> tuple[EntityType, ...]: - """Return the shared condition-entity policy for one donor target block.""" - target_variables = tuple(dict.fromkeys(target_variables)) - if not target_variables: - return (EntityType.PERSON, EntityType.HOUSEHOLD) - allowed_by_target = [ - variable_semantic_spec_for(variable).allowed_condition_entities - for variable in target_variables - ] - shared = set(allowed_by_target[0]) - for allowed_entities in allowed_by_target[1:]: - shared &= set(allowed_entities) - if not shared: - return (EntityType.HOUSEHOLD,) - return tuple(entity for entity in allowed_by_target[0] if entity in shared) - - -def is_condition_var_compatible_with_targets( - condition_variable: str, - *, - target_variables: Iterable[str], -) -> bool: - """Return whether a condition variable is compatible with one donor target block.""" - condition_entity = variable_semantic_spec_for(condition_variable).native_entity - return condition_entity in set( - resolve_condition_entities_for_targets(target_variables) - ) - - -def is_projected_condition_var_compatible( - condition_variable: str, - *, - projected_entity: EntityType, - allowed_condition_entities: Iterable[EntityType], -) -> bool: - """Return whether a condition variable remains compatible after projection.""" - condition_entity = variable_semantic_spec_for(condition_variable).native_entity - record_entity = getattr(EntityType, "RECORD", None) - allowed_entities = { - entity for entity in allowed_condition_entities if entity is not record_entity - } - if condition_entity in allowed_entities: - return True - return ( - condition_entity is EntityType.PERSON and projected_entity in allowed_entities - ) - - -def donor_imputation_block_specs( - variable_names: Iterable[str], -) -> 
tuple[DonorImputationBlockSpec, ...]: - """Plan donor-imputation model blocks and matching strategies.""" - remaining = set(variable_names) - block_specs: list[DonorImputationBlockSpec] = [] - if set(DIVIDEND_COMPONENT_COLUMNS).issubset(remaining): - block_specs.append(DIVIDEND_DONOR_BLOCK_SPEC) - remaining.difference_update(DIVIDEND_COMPONENT_COLUMNS) - - grouped_variables: dict[tuple[EntityType, tuple[EntityType, ...]], list[str]] = ( - defaultdict(list) - ) - for variable in sorted(remaining, key=_donor_chain_sort_key): - spec = variable_semantic_spec_for(variable) - grouped_variables[ - (spec.native_entity, resolve_condition_entities_for_targets((variable,))) - ].append(variable) - - for (native_entity, _), variables in sorted( - grouped_variables.items(), - key=lambda item: _donor_chain_sort_key(item[1][0]), - ): - block_variables = tuple(sorted(variables, key=_donor_chain_sort_key)) - block_specs.append( - DonorImputationBlockSpec( - native_entity=native_entity, - condition_entities=resolve_condition_entities_for_targets( - block_variables - ), - model_variables=block_variables, - restored_variables=block_variables, - match_strategies={ - variable: variable_semantic_spec_for(variable).donor_match_strategy - for variable in block_variables - }, - ) - ) - return tuple(block_specs) - - -def donor_imputation_blocks( - variable_names: Iterable[str], -) -> tuple[tuple[str, ...], ...]: - """Plan donor-imputation model blocks without coupling unrelated variables.""" - return tuple( - block_spec.model_variables - for block_spec in donor_imputation_block_specs(variable_names) - ) - - -def apply_donor_variable_semantics( - frame: pd.DataFrame, - variable_names: Iterable[str], -) -> pd.DataFrame: - """Apply post-imputation semantic guards for donor-integrated variables.""" - transforms: list[FrameSemanticTransform] = [] - seen_transform_names: set[str] = set() - for variable_name in tuple(dict.fromkeys(variable_names)): - transform = 
variable_semantic_spec_for(variable_name).donor_transform - if transform is None or transform.name in seen_transform_names: - continue - transforms.append(transform) - seen_transform_names.add(transform.name) - return apply_frame_semantic_transforms(frame, transforms) - - -def validate_donor_variable_semantics( - frame: pd.DataFrame, - variable_names: Iterable[str], -) -> tuple[FrameSemanticCheckReport, ...]: - """Evaluate semantic checks for donor-integrated variables.""" - checks: list[FrameSemanticCheck] = [] - seen_check_names: set[str] = set() - for variable_name in tuple(dict.fromkeys(variable_names)): - check = variable_semantic_spec_for(variable_name).donor_check - if check is None or check.name in seen_check_names: - continue - checks.append(check) - seen_check_names.add(check.name) - return evaluate_frame_semantic_checks(frame, checks) - - -def resolve_variable_semantic_capabilities( - variable_names: Iterable[str], -) -> dict[str, SourceVariableCapability]: - """Resolve generic capabilities implied by variable semantics alone.""" - available = tuple(dict.fromkeys(variable_names)) - resolved: dict[str, SourceVariableCapability] = {} - for variable, spec in VARIABLE_SEMANTIC_SPECS.items(): - if variable not in available or not spec.is_redundant_given(available): - continue - resolved[variable] = SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - notes=spec.notes, - ) - return resolved - - -def prune_redundant_variables(variable_names: Iterable[str]) -> set[str]: - """Drop derived variables when their atomic basis is already present.""" - result = set(variable_names) - for variable in resolve_variable_semantic_capabilities(result): - result.discard(variable) - return result diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 431fb7a6..00000000 --- a/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Test package for microplex-us.""" diff --git a/tests/bakeoff/__init__.py b/tests/bakeoff/__init__.py 
deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/bakeoff/test_scale_up.py b/tests/bakeoff/test_scale_up.py deleted file mode 100644 index 0a1372f6..00000000 --- a/tests/bakeoff/test_scale_up.py +++ /dev/null @@ -1,218 +0,0 @@ -"""Smoke tests for the synthesizer scale-up harness. - -These tests exercise the harness on a deliberately tiny slice of real -enhanced_cps_2024. They do NOT constitute the scale-up benchmark itself; -that lives behind the CLI and takes significantly longer. - -The goal here is: does the harness load data, fit a synthesizer, compute -metrics, and return a populated ScaleUpResult without crashing? -""" - -from __future__ import annotations - -import importlib.util -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.bakeoff import ( - DEFAULT_CONDITION_COLS, - DEFAULT_TARGET_COLS, - ScaleUpRunner, - ScaleUpStageConfig, - stage1_config, -) - -_ENHANCED_CPS_PATH = ( - Path.home() - / "PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5" -) - -pytestmark = [ - pytest.mark.skipif( - not _ENHANCED_CPS_PATH.exists(), - reason="enhanced_cps_2024.h5 not available locally", - ), - pytest.mark.skipif( - importlib.util.find_spec("prdc") is None, - reason="prdc package not installed (uv pip install prdc)", - ), -] - - -@pytest.fixture(scope="module") -def small_config() -> ScaleUpStageConfig: - """Tiny config — a handful of columns, ~500 rows, one fast method.""" - base = stage1_config() - return ScaleUpStageConfig( - stage="smoke", - n_rows=500, - methods=("ZI-QRF",), - condition_cols=("age", "is_female"), - target_cols=( - "employment_income_last_year", - "self_employment_income_last_year", - "snap_reported", - ), - holdout_frac=0.2, - seed=0, - k=5, - n_generate=400, - data_path=base.data_path, - year=base.year, - rare_cell_checks=(), # skip rare-cell checks in smoke - ) - - -def test_load_frame_returns_expected_shape(small_config: 
ScaleUpStageConfig) -> None: - runner = ScaleUpRunner(small_config) - df = runner.load_frame() - # n_rows is the upper bound after subsampling; if fewer in source, we get fewer. - assert len(df) <= small_config.n_rows + 1 - assert len(df) > 100 # still a real sample - expected_cols = set(small_config.condition_cols) | set(small_config.target_cols) - assert expected_cols <= set(df.columns) - - -def test_split_train_holdout_shapes(small_config: ScaleUpStageConfig) -> None: - runner = ScaleUpRunner(small_config) - df = runner.load_frame() - train, holdout = runner.split(df) - assert len(train) + len(holdout) == len(df) - # 20 % holdout within ±1 - expected_holdout = int(len(df) * 0.2) - assert abs(len(holdout) - expected_holdout) <= 1 - - -def test_fit_and_generate_returns_dataframe( - small_config: ScaleUpStageConfig, -) -> None: - runner = ScaleUpRunner(small_config) - df = runner.load_frame() - train, _ = runner.split(df) - synthetic, timing = runner.fit_and_generate("ZI-QRF", train, n_generate=200) - - assert isinstance(synthetic, pd.DataFrame) - assert len(synthetic) == 200 - assert timing["fit_wall_seconds"] >= 0 - assert timing["generate_wall_seconds"] >= 0 - assert timing["peak_rss_gb_during_fit"] > 0 - - -def test_run_returns_populated_result(small_config: ScaleUpStageConfig) -> None: - runner = ScaleUpRunner(small_config) - results = runner.run() - assert len(results) == 1 - r = results[0] - assert r.method == "ZI-QRF" - assert r.stage == "smoke" - # PRDC values in [0, 1]. - for val in (r.precision, r.density, r.coverage): - assert 0.0 <= val <= 1.0 + 1e-9 - # Zero-rate MAE in [0, 1]. 
- assert 0.0 <= r.zero_rate_mae <= 1.0 - assert r.n_train_rows > 0 - assert r.n_holdout_rows > 0 - assert r.n_cols == 5 # 2 cond + 3 target - - -def test_missing_column_raises_cleanly() -> None: - cfg = ScaleUpStageConfig( - stage="smoke", - n_rows=100, - methods=("ZI-QRF",), - condition_cols=("age", "definitely_not_a_real_column"), - target_cols=("employment_income_last_year",), - data_path=_ENHANCED_CPS_PATH, - rare_cell_checks=(), - ) - runner = ScaleUpRunner(cfg) - with pytest.raises(KeyError, match="definitely_not_a_real_column"): - runner.load_frame() - - -def test_default_column_sets_are_sensible() -> None: - """Sanity check on the curated default column list.""" - total = set(DEFAULT_CONDITION_COLS) | set(DEFAULT_TARGET_COLS) - assert len(total) == len(DEFAULT_CONDITION_COLS) + len(DEFAULT_TARGET_COLS), ( - "Default conditioning and target columns overlap" - ) - assert len(DEFAULT_CONDITION_COLS) >= 5 - assert len(DEFAULT_TARGET_COLS) >= 20 - assert len(total) <= 60, "Stage-1 default exceeds ~50-column budget" - - -def test_incremental_jsonl_persists_each_method( - small_config: ScaleUpStageConfig, tmp_path: Path -) -> None: - """Each completed method gets written as JSONL before the next starts.""" - import json as _json - - runner = ScaleUpRunner(small_config) - incremental = tmp_path / "stage_incremental.jsonl" - results = runner.run(incremental_path=incremental) - - assert incremental.exists() - lines = [ln for ln in incremental.read_text().splitlines() if ln.strip()] - assert len(lines) == len(results) - # Round-trip: each line decodes to a ScaleUpResult-shaped dict. - for line in lines: - d = _json.loads(line) - assert {"method", "stage", "coverage", "fit_wall_seconds"} <= set(d) - - -def test_method_kwargs_forwarded_to_constructor( - small_config: ScaleUpStageConfig, -) -> None: - """Method-level hyperparameter overrides reach the method class.""" - # ZI-QRF accepts n_estimators as a constructor kwarg. 
Override to - # 3 trees so we can verify it propagates. - cfg = ScaleUpStageConfig( - stage=small_config.stage, - n_rows=small_config.n_rows, - methods=("ZI-QRF",), - condition_cols=small_config.condition_cols, - target_cols=small_config.target_cols, - holdout_frac=small_config.holdout_frac, - seed=small_config.seed, - k=small_config.k, - n_generate=small_config.n_generate, - data_path=small_config.data_path, - year=small_config.year, - rare_cell_checks=small_config.rare_cell_checks, - method_kwargs={"ZI-QRF": {"n_estimators": 3}}, - ) - runner = ScaleUpRunner(cfg) - df = runner.load_frame() - train, _ = runner.split(df) - synthetic, _ = runner.fit_and_generate("ZI-QRF", train, n_generate=50) - assert len(synthetic) == 50 - - -def test_zero_rate_per_column_populated(small_config: ScaleUpStageConfig) -> None: - """Per-column zero-rate breakdown is recorded for every target column.""" - runner = ScaleUpRunner(small_config) - results = runner.run() - assert len(results) == 1 - r = results[0] - assert r.zero_rate_per_column, "Expected non-empty zero_rate_per_column" - for col, entry in r.zero_rate_per_column.items(): - assert set(entry) == {"real", "synth", "abs_diff"} - assert 0.0 <= entry["real"] <= 1.0 - assert 0.0 <= entry["synth"] <= 1.0 - assert entry["abs_diff"] >= 0.0 - # abs_diff should be consistent with real/synth values. - assert abs(entry["abs_diff"] - abs(entry["real"] - entry["synth"])) < 1e-9 - # Confirm all target columns are covered. - covered = set(r.zero_rate_per_column) - assert set(small_config.target_cols) <= covered - # And that the scalar MAE is close to the mean of abs_diff over target cols. - target_diffs = [ - r.zero_rate_per_column[c]["abs_diff"] for c in small_config.target_cols - ] - # MAE is averaged over all shared columns (conditioning + target), so this - # is only a rough consistency check: the per-target mean should be - # within the scalar MAE's ballpark. 
- assert min(target_diffs) <= r.zero_rate_mae + 1e-9 diff --git a/tests/calibration/__init__.py b/tests/calibration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/calibration/test_microcalibrate_adapter.py b/tests/calibration/test_microcalibrate_adapter.py deleted file mode 100644 index fd6c3388..00000000 --- a/tests/calibration/test_microcalibrate_adapter.py +++ /dev/null @@ -1,233 +0,0 @@ -"""Small-scale smoke tests for the microcalibrate-backed calibration adapter. - -These exercise the adapter's interface contract (matches the legacy -`Calibrator.fit_transform` shape) and verify that the underlying -gradient-descent chi-squared solver actually moves weights toward the -requested targets on a deliberately small problem. - -Scale-up validation happens separately (see -`docs/synthesizer-benchmark-scale-up.md`). These tests are only expected -to run in seconds. -""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest -from microplex.calibration import LinearConstraint - -from microplex_us.calibration import ( - MicrocalibrateAdapter, - MicrocalibrateAdapterConfig, -) - - -def _toy_data(n_records: int = 100, seed: int = 0) -> pd.DataFrame: - rng = np.random.default_rng(seed) - return pd.DataFrame( - { - "age": rng.integers(18, 70, size=n_records), - "income": rng.normal(40_000, 20_000, size=n_records).clip(0, None), - "weight": np.ones(n_records), - } - ) - - -def _age_band_constraint( - data: pd.DataFrame, name: str, low: int, high: int, target: float -) -> LinearConstraint: - mask = (data["age"] >= low) & (data["age"] < high) - return LinearConstraint( - name=name, - coefficients=mask.astype(float).to_numpy(), - target=target, - ) - - -def _income_age_band_constraint( - data: pd.DataFrame, name: str, low: int, high: int, target: float -) -> LinearConstraint: - mask = (data["age"] >= low) & (data["age"] < high) - coefs = (mask.astype(float) * data["income"]).to_numpy() - return 
LinearConstraint(name=name, coefficients=coefs, target=target) - - -class TestInterfaceContract: - """Adapter matches the legacy `Calibrator.fit_transform` signature.""" - - def test_empty_constraints_returns_copy_unchanged(self) -> None: - data = _toy_data() - adapter = MicrocalibrateAdapter() - result = adapter.fit_transform(data, marginal_targets={}) - pd.testing.assert_frame_equal(result, data) - # Should not share storage with the input. - assert result is not data - - def test_weight_column_validation(self) -> None: - data = _toy_data().drop(columns=["weight"]) - adapter = MicrocalibrateAdapter() - with pytest.raises(ValueError, match="weight column 'weight' not found"): - adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=( - _age_band_constraint(_toy_data(), "age_18_30", 18, 30, 20.0), - ), - ) - - def test_constraint_shape_validation(self) -> None: - data = _toy_data() - adapter = MicrocalibrateAdapter() - bad_constraint = LinearConstraint( - name="wrong_shape", - coefficients=np.ones(len(data) + 5), - target=10.0, - ) - with pytest.raises(ValueError, match="constraint 'wrong_shape'"): - adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=(bad_constraint,), - ) - - def test_preserves_all_records(self) -> None: - data = _toy_data() - adapter = MicrocalibrateAdapter( - MicrocalibrateAdapterConfig(epochs=8, noise_level=0.0) - ) - constraint = _age_band_constraint(data, "age_18_40", 18, 40, target=30.0) - result = adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=(constraint,), - ) - # Identity preservation: every record survives. - assert len(result) == len(data) - pd.testing.assert_index_equal(result.index, data.index) - # No negative weights. 
- assert (result["weight"] >= 0).all() - - -class TestCalibrationMovesWeights: - """Adapter actually does the job — weights shift toward the targets.""" - - def test_single_constraint_converges(self) -> None: - """One age-band count constraint should be matched within tolerance.""" - data = _toy_data(n_records=200, seed=1) - # Current weighted count in [25, 45) band. - mask = (data["age"] >= 25) & (data["age"] < 45) - current_count = float(mask.sum()) - # Ask for 2x the current weighted count. - target = 2.0 * current_count - - constraint = _age_band_constraint(data, "age_25_45", 25, 45, target=target) - adapter = MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=400, - learning_rate=0.05, - noise_level=0.0, - ) - ) - result = adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=(constraint,), - ) - - validation = adapter.validate(result) - errors = validation["linear_errors"] - assert "age_25_45" in errors - # 5 % relative tolerance is generous for 400 epochs on 1 constraint. - assert errors["age_25_45"]["relative_error"] < 0.05 - # Weighted count actually moved. - weighted_count = float( - (result["age"] >= 25).values - * (result["age"] < 45).values - * result["weight"].to_numpy() - ).sum() if False else float(result.loc[mask, "weight"].sum()) - # Should be close to target; at least 1.5x original (we asked for 2x). - assert weighted_count > 1.5 * current_count - - def test_two_orthogonal_constraints_both_improve(self) -> None: - """Separate age-band and income-age-band constraints should both reduce.""" - data = _toy_data(n_records=300, seed=2) - - # Current sums. 
- band_mask = (data["age"] >= 30) & (data["age"] < 50) - current_count = float(band_mask.sum()) - current_income_sum = float(data.loc[band_mask, "income"].sum()) - - constraints = ( - _age_band_constraint( - data, "count_30_50", 30, 50, target=1.4 * current_count - ), - _income_age_band_constraint( - data, "income_30_50", 30, 50, target=1.4 * current_income_sum - ), - ) - - adapter = MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=400, - learning_rate=0.05, - noise_level=0.0, - ) - ) - result = adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=constraints, - ) - - validation = adapter.validate(result) - # Both constraints should get meaningfully closer to target. - # 10 % relative tolerance since there's inherent trade-off between - # count and income-sum constraints on the same band. - for name in ("count_30_50", "income_30_50"): - rel = validation["linear_errors"][name]["relative_error"] - assert rel < 0.10, f"constraint {name} still at rel_error={rel:.3f}" - - -class TestValidationShape: - """Validation output has the keys the downstream pipeline expects.""" - - def test_validation_keys(self) -> None: - data = _toy_data() - adapter = MicrocalibrateAdapter( - MicrocalibrateAdapterConfig(epochs=4, noise_level=0.0) - ) - constraint = _age_band_constraint(data, "a", 18, 40, target=30.0) - _ = adapter.fit_transform( - data, - marginal_targets={}, - linear_constraints=(constraint,), - ) - validation = adapter.validate() - - assert set(validation) == { - "converged", - "max_error", - "sparsity", - "linear_errors", - } - assert isinstance(validation["converged"], bool) - assert isinstance(validation["max_error"], float) - assert 0.0 <= validation["sparsity"] <= 1.0 - assert "a" in validation["linear_errors"] - - entry = validation["linear_errors"]["a"] - assert set(entry) == { - "target", - "estimate", - "relative_error", - "absolute_error", - } - - def test_validation_without_calibration_is_trivially_converged(self) -> None: - 
adapter = MicrocalibrateAdapter() - validation = adapter.validate() - assert validation["converged"] is True - assert validation["max_error"] == 0.0 - assert validation["sparsity"] == 0.0 - assert validation["linear_errors"] == {} diff --git a/tests/calibration/test_microcalibrate_adapter_memory.py b/tests/calibration/test_microcalibrate_adapter_memory.py deleted file mode 100644 index 408f308c..00000000 --- a/tests/calibration/test_microcalibrate_adapter_memory.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Adapter must not materialize the estimate matrix as float64 pandas. - -At v7 scale (1.5M households x ~500 constraints) the adapter's pre-fix -behavior builds a float64 DataFrame (6 GB) *and* microcalibrate keeps -it alive in memory alongside a float32 torch copy. The combined footprint -pushes the workstation past macOS jetsam kill threshold. - -These tests pin the adapter's memory contract: the estimate matrix passed -to microcalibrate.Calibration must be float32 from the start. Adapter -behavior on small inputs is unchanged; only the dtype is tightened. 
-""" - -from __future__ import annotations - -from typing import Any -from unittest.mock import patch - -import numpy as np -import pandas as pd -from microplex.calibration import LinearConstraint - -from microplex_us.calibration import MicrocalibrateAdapter - - -def _toy_data(n_records: int = 200, seed: int = 0) -> pd.DataFrame: - rng = np.random.default_rng(seed) - return pd.DataFrame( - { - "age": rng.integers(18, 70, size=n_records), - "income": rng.normal(40_000, 20_000, size=n_records).clip(0, None), - "weight": np.ones(n_records), - } - ) - - -def _age_band( - data: pd.DataFrame, name: str, low: int, high: int, target: float -) -> LinearConstraint: - mask = (data["age"] >= low) & (data["age"] < high) - return LinearConstraint( - name=name, - coefficients=mask.astype(float).to_numpy(), - target=target, - ) - - -class TestEstimateMatrixDtype: - """The adapter must not pass a float64 estimate matrix to Calibration.""" - - def test_estimate_matrix_passed_to_calibration_is_float32(self) -> None: - """Intercept Calibration.__init__ and inspect the estimate_matrix arg.""" - captured: dict[str, Any] = {} - - from microcalibrate import Calibration as _RealCalibration - - original_init = _RealCalibration.__init__ - - def spy_init(self: Any, *args: Any, **kwargs: Any) -> None: - captured["estimate_matrix"] = kwargs.get("estimate_matrix") - original_init(self, *args, **kwargs) - - data = _toy_data() - constraints = ( - _age_band(data, "age_18_30", 18, 30, 40.0), - _age_band(data, "age_30_45", 30, 45, 60.0), - _age_band(data, "age_45_70", 45, 70, 100.0), - ) - adapter = MicrocalibrateAdapter() - with patch.object(_RealCalibration, "__init__", spy_init): - adapter.fit_transform(data, linear_constraints=constraints) - - estimate_matrix = captured["estimate_matrix"] - assert estimate_matrix is not None, "Calibration was not constructed" - - if isinstance(estimate_matrix, pd.DataFrame): - for col, dtype in estimate_matrix.dtypes.items(): - assert dtype == np.float32, ( - 
f"estimate_matrix column {col!r} is {dtype}, expected float32 " - "(float64 doubles adapter peak memory at v7 scale)" - ) - else: - arr = np.asarray(estimate_matrix) - assert arr.dtype == np.float32, ( - f"estimate_matrix dtype is {arr.dtype}, expected float32" - ) - - def test_weights_still_converge_with_float32(self) -> None: - """Dtype tightening must not break the convergence behavior.""" - from microplex_us.calibration import MicrocalibrateAdapterConfig - - data = _toy_data(n_records=300) - constraints = ( - _age_band(data, "age_18_30", 18, 30, 60.0), - _age_band(data, "age_30_45", 30, 45, 90.0), - _age_band(data, "age_45_70", 45, 70, 150.0), - ) - adapter = MicrocalibrateAdapter( - MicrocalibrateAdapterConfig( - epochs=400, learning_rate=0.05, noise_level=0.0 - ) - ) - result = adapter.fit_transform(data, linear_constraints=constraints) - validation = adapter.validate(result) - # Same tolerance the existing smoke tests in this package use. - assert validation["max_error"] < 0.1, validation diff --git a/tests/calibration/test_us_pipeline_dispatch.py b/tests/calibration/test_us_pipeline_dispatch.py deleted file mode 100644 index 3e6da490..00000000 --- a/tests/calibration/test_us_pipeline_dispatch.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Pipeline-level test: `calibration_backend="microcalibrate"` dispatches to -`MicrocalibrateAdapter` and round-trips one calibration call inside the -USMicroplexPipeline context. - -This is the final link between the adapter and the production pipeline: -the backend string needs to be valid in `USMicroplexBuildConfig`, and -`_build_weight_calibrator` must return an adapter instance that -satisfies the same `fit_transform` / `validate` contract the rest of -`calibrate_policyengine_tables` expects. 
-""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest -from microplex.calibration import LinearConstraint - -from microplex_us.calibration import MicrocalibrateAdapter -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline - - -def _toy_households(n: int = 100, seed: int = 0) -> pd.DataFrame: - rng = np.random.default_rng(seed) - return pd.DataFrame( - { - "household_id": np.arange(n), - "household_weight": np.ones(n, dtype=float), - "income": rng.normal(80_000, 40_000, n).clip(0, None), - } - ) - - -def test_backend_string_resolves_to_adapter() -> None: - cfg = USMicroplexBuildConfig(calibration_backend="microcalibrate") - pipeline = USMicroplexPipeline(cfg) - calibrator = pipeline._build_weight_calibrator() - assert isinstance(calibrator, MicrocalibrateAdapter) - - -def test_backend_dispatch_fit_transform_end_to_end() -> None: - """Full path: pipeline config → dispatch → fit_transform → validate.""" - cfg = USMicroplexBuildConfig( - calibration_backend="microcalibrate", - calibration_max_iter=200, - ) - pipeline = USMicroplexPipeline(cfg) - calibrator = pipeline._build_weight_calibrator() - - data = _toy_households(n=200, seed=1) - # Constraint: weighted count of households with income > 80k should be 1.4x current. 
- mask = (data["income"] > 80_000).to_numpy(dtype=float) - target = 1.4 * float(mask.sum()) - constraint = LinearConstraint(name="above_80k", coefficients=mask, target=target) - - result = calibrator.fit_transform( - data, - marginal_targets={}, - weight_col="household_weight", - linear_constraints=(constraint,), - ) - - assert len(result) == len(data) - assert "household_weight" in result.columns - assert (result["household_weight"] >= 0).all() - - validation = calibrator.validate(result) - assert set(validation) == {"converged", "max_error", "sparsity", "linear_errors"} - assert "above_80k" in validation["linear_errors"] - - -def test_invalid_backend_still_raises() -> None: - """Regression test: unknown backend strings surface a clear error.""" - # The Literal type is only checked by static tools; runtime dispatch - # raises a ValueError, which we want to preserve. - # Construct the dataclass bypassing the Literal constraint. - bad_cfg = USMicroplexBuildConfig() - object.__setattr__(bad_cfg, "calibration_backend", "no_such_backend") - pipeline = USMicroplexPipeline(bad_cfg) - with pytest.raises(ValueError, match="Unsupported calibration backend"): - pipeline._build_weight_calibrator() - - -def test_pe_l0_deferred_stage_disables_sparsity_penalty() -> None: - """Stages ≥2 must refine weights without re-sparsifying. - - v10 ran three L0 stages with `lambda_l0=1e-4` each, warm-starting - stages 2/3 from stage 1's already-sparse weights. Loss compounded - pruning down to 1,511 active households — unusable. Stages 2+ now - drop the sparsity penalty so they only reduce residual error. 
- """ - cfg = USMicroplexBuildConfig(calibration_backend="pe_l0") - pipeline = USMicroplexPipeline(cfg) - - stage1 = pipeline._build_weight_calibrator(stage_index=1) - stage2 = pipeline._build_weight_calibrator(stage_index=2) - stage3 = pipeline._build_weight_calibrator(stage_index=3) - - assert stage1.lambda_l0 == pytest.approx(1e-4) - assert stage2.lambda_l0 == 0.0 - assert stage3.lambda_l0 == 0.0 - assert stage1.fit_l0_weights_fn is not None - assert stage2.fit_l0_weights_fn is not None - assert stage3.fit_l0_weights_fn is not None - - -def test_hardconcrete_deferred_stage_disables_sparsity_penalty() -> None: - cfg = USMicroplexBuildConfig(calibration_backend="hardconcrete") - pipeline = USMicroplexPipeline(cfg) - stage1 = pipeline._build_weight_calibrator(stage_index=1) - stage2 = pipeline._build_weight_calibrator(stage_index=2) - assert stage1.lambda_l0 == pytest.approx(1e-4) - assert isinstance(stage2, MicrocalibrateAdapter) - assert stage2.config.regularize_with_l0 is False diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 5d265f64..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Test path setup for the local microplex-us package.""" - -from __future__ import annotations - -import sys -from pathlib import Path - -SRC_DIR = Path(__file__).resolve().parents[1] / "src" -if str(SRC_DIR) not in sys.path: - sys.path.insert(0, str(SRC_DIR)) diff --git a/tests/data_sources/__init__.py b/tests/data_sources/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data_sources/test_cps_age.py b/tests/data_sources/test_cps_age.py deleted file mode 100644 index 6abbe726..00000000 --- a/tests/data_sources/test_cps_age.py +++ /dev/null @@ -1,55 +0,0 @@ -import zipfile - -import pandas as pd -import polars as pl - -from microplex_us.data_sources.cps import load_cps_asec -from microplex_us.data_sources.cps_age import randomize_cps_topcoded_age_80_84 -from microplex_us.data_sources.cps_mappings import 
map_age - - -def test_randomize_cps_topcoded_age_80_84_is_deterministic(): - frame = pl.DataFrame({"age": [79, *([80] * 20), 85]}) - - first = randomize_cps_topcoded_age_80_84(frame) - second = randomize_cps_topcoded_age_80_84(frame) - ages = first["age"].to_list() - - assert ages == second["age"].to_list() - assert ages[0] == 79 - assert ages[-1] == 85 - assert set(ages[1:-1]).issubset({80, 81, 82, 83, 84}) - assert len(set(ages[1:-1])) > 1 - - -def test_map_age_spreads_cps_80_to_80_84(): - frame = pl.DataFrame({"A_AGE": [79, *([80] * 20), 85]}) - - result = map_age(frame) - ages = result["age"].to_list() - - assert ages[0] == 79 - assert ages[-1] == 85 - assert set(ages[1:-1]).issubset({80, 81, 82, 83, 84}) - assert len(set(ages[1:-1])) > 1 - - -def test_load_cps_asec_spreads_topcoded_age_80(tmp_path): - person_rows = pd.DataFrame( - { - "PH_SEQ": [1] * 22, - "A_LINENO": list(range(1, 23)), - "A_AGE": [79, *([80] * 20), 85], - "A_FNLWGT": [100] * 22, - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", person_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - ages = dataset.persons.sort("person_number")["age"].to_list() - - assert ages[0] == 79 - assert ages[-1] == 85 - assert set(ages[1:-1]).issubset({80, 81, 82, 83, 84}) - assert len(set(ages[1:-1])) > 1 diff --git a/tests/data_sources/test_cps_derived_income_copies.py b/tests/data_sources/test_cps_derived_income_copies.py deleted file mode 100644 index 2a66e02b..00000000 --- a/tests/data_sources/test_cps_derived_income_copies.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Tests for the CPS-derived direct income copies (G7 export-parity gap). 
- -The Enhanced CPS exports three person-level income leaves as direct copies of -raw ASEC fields (``policyengine_us_data/datasets/cps/cps.py:1493-1495``): - -- ``survivor_benefits`` <- ``SRVS_VAL`` -- ``educational_assistance`` <- ``ED_VAL`` -- ``financial_assistance`` <- ``FIN_VAL`` - -Microplex produced none of them: the raw fields were not mapped in -``PERSON_VARIABLES`` and the leaves were absent from the export allowlist, so -they never reached the H5. These tests exercise the real ``_process_persons`` -(no stubbing) to prove the rename happens, plus assert allowlist membership and -that no alias remaps the leaves. -""" - -import polars as pl - -from microplex_us.data_sources.cps import PERSON_VARIABLES, _process_persons - -_COPIES = { - "SRVS_VAL": "survivor_benefits", - "ED_VAL": "educational_assistance", - "FIN_VAL": "financial_assistance", -} - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - """Raw CPS-style person frame carrying the income-copy fields. - - Census column names are used because ``_process_persons`` selects/renames - via ``PERSON_VARIABLES``. 
- """ - n = len(rows) - return pl.DataFrame( - { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [row.get("age", 40) for row in rows], - "SRVS_VAL": [row.get("srvs", 0.0) for row in rows], - "ED_VAL": [row.get("ed", 0.0) for row in rows], - "FIN_VAL": [row.get("fin", 0.0) for row in rows], - } - ) - - -def test_person_variables_maps_the_three_raw_fields(): - for census, leaf in _COPIES.items(): - assert PERSON_VARIABLES.get(census) == leaf - - -def test_process_persons_copies_raw_fields_to_leaves(): - """The raw ASEC values are copied verbatim onto the pe-us input leaves.""" - rows = [ - {"srvs": 12_000.0, "ed": 0.0, "fin": 0.0}, - {"srvs": 0.0, "ed": 5_000.0, "fin": 0.0}, - {"srvs": 0.0, "ed": 0.0, "fin": 3_200.0}, - {"srvs": 800.0, "ed": 1_100.0, "fin": 450.0}, - {"srvs": 0.0, "ed": 0.0, "fin": 0.0}, # non-recipient - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - for census, leaf in _COPIES.items(): - assert leaf in result.columns, f"{leaf} not produced" - got = result[leaf].to_list() - expected = [row.get(_FIELD_FOR[census], 0.0) for row in rows] - assert got == expected, f"{leaf}: {got} != {expected}" - - -_FIELD_FOR = {"SRVS_VAL": "srvs", "ED_VAL": "ed", "FIN_VAL": "fin"} - - -def test_copies_are_non_degenerate(): - """Each leaf carries distinct nonzero values, not a constant/zero fill.""" - rows = [ - {"srvs": 9_000.0, "ed": 2_000.0, "fin": 1_500.0}, - {"srvs": 21_000.0, "ed": 6_500.0, "fin": 4_000.0}, - {"srvs": 0.0, "ed": 0.0, "fin": 0.0}, - ] - result = _process_persons(_raw_person_frame(rows), 2023) - for leaf in _COPIES.values(): - values = [v for v in result[leaf].to_list() if v > 0] - assert len(values) >= 2, f"{leaf} should be positive for several records" - assert len(set(values)) >= 2, f"{leaf} should not be a single constant" - - -def test_copies_in_export_allowlist_and_not_aliased(): - from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_COLUMN_ALIASES, - 
SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - ) - - for leaf in _COPIES.values(): - assert leaf in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert POLICYENGINE_US_EXPORT_COLUMN_ALIASES.get(leaf) is None - - -if __name__ == "__main__": - import traceback - - funcs = [v for k, v in sorted(globals().items()) if k.startswith("test_")] - passed = failed = 0 - for fn in funcs: - try: - fn() - print(f"PASS {fn.__name__}") - passed += 1 - except Exception: # noqa: BLE001 - print(f"FAIL {fn.__name__}") - traceback.print_exc() - failed += 1 - print(f"SUMMARY passed={passed} failed={failed}") - raise SystemExit(1 if failed else 0) diff --git a/tests/data_sources/test_cps_difficulty_recodes.py b/tests/data_sources/test_cps_difficulty_recodes.py deleted file mode 100644 index 17119ea6..00000000 --- a/tests/data_sources/test_cps_difficulty_recodes.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Tests for the eCPS disability-difficulty recodes (Gate-1 export gap). - -The Enhanced CPS exports six person-level ``difficulty_*`` columns recoded from -the ASEC ``PEDIS*`` fields (``PEDIS{X} == 1`` -> ``True``), the same recode it -uses for ``is_blind`` from ``PEDISEYE``. They are eCPS final-H5 contract columns -with no PolicyEngine-US variable, so Microplex exports them as dataset columns -via the legacy-contract entity map. Mirrors policyengine-us-data -``datasets/cps/cps.py`` (unmerged branch ``claude/document-census-tax-id-replacement``). - -Microplex already ingested the six ``PEDIS*`` fields into ``_disability_*`` -staging columns (used to compute ``is_disabled``) but never produced the -``difficulty_*`` leaves, so they were absent from the export. These tests drive -the real ``_process_persons`` and assert the recode, the staging cleanup, and -the contract/export wiring. 
-""" - -import json -from pathlib import Path - -import polars as pl - -from microplex_us.data_sources.cps import ( - PERSON_CPS_DIFFICULTY_LEAVES, - PERSON_VARIABLES, - _process_persons, -) - -_PEDIS_TO_LEAF = { - "PEDISDRS": "difficulty_dressing_or_bathing", - "PEDISEAR": "difficulty_hearing", - "PEDISEYE": "difficulty_seeing", - "PEDISOUT": "difficulty_doing_errands", - "PEDISPHY": "difficulty_walking_or_climbing_stairs", - "PEDISREM": "difficulty_remembering_or_making_decisions", -} - -_CONTRACT_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "ecps_export_contract.json" -) - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - """Raw CPS-style person frame carrying the six PEDIS* disability fields. - - Census column names are used because ``_process_persons`` selects/renames - via ``PERSON_VARIABLES``. PEDIS* default to 2 ("no") when unspecified. - """ - n = len(rows) - base = { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [40] * n, - } - for pedis in _PEDIS_TO_LEAF: - base[pedis] = [row.get(pedis, 2) for row in rows] - return pl.DataFrame(base) - - -def test_person_variables_maps_the_six_pedis_fields(): - for pedis in _PEDIS_TO_LEAF: - assert pedis in PERSON_VARIABLES - assert PERSON_VARIABLES[pedis].startswith("_disability_") - - -def test_difficulty_leaf_map_covers_all_six_staging_columns(): - assert set(PERSON_CPS_DIFFICULTY_LEAVES.values()) == set(_PEDIS_TO_LEAF.values()) - - -def test_difficulty_leaves_recode_pedis_equals_one(): - """PEDIS{X} == 1 -> True; codes 2 ("no") and 0 ("not in universe") -> False.""" - rows = [ - {p: 1 for p in _PEDIS_TO_LEAF}, # all difficulties - {p: 2 for p in _PEDIS_TO_LEAF}, # explicit "no" - {p: 0 for p in _PEDIS_TO_LEAF}, # not in universe - {"PEDISEYE": 1}, # only vision difficulty - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - for leaf in _PEDIS_TO_LEAF.values(): - assert leaf in result.columns, 
f"{leaf} not produced" - assert result.schema[leaf] == pl.Boolean, f"{leaf} not boolean" - - # Row 0: every difficulty True; rows 1-2: every difficulty False. - for leaf in _PEDIS_TO_LEAF.values(): - values = result[leaf].to_list() - assert values[0] is True, f"{leaf} row0" - assert values[1] is False and values[2] is False, f"{leaf} rows1-2" - - # Row 3 isolates vision: difficulty_seeing True, the rest False. - assert result["difficulty_seeing"].to_list() == [True, False, False, True] - assert result["difficulty_hearing"].to_list() == [True, False, False, False] - - -def test_difficulty_seeing_tracks_pediseye_like_is_blind(): - # eCPS derives both difficulty_seeing and is_blind from PEDISEYE == 1, so the - # two must agree on every row. - rows = [{"PEDISEYE": 1}, {"PEDISEYE": 2}, {"PEDISEYE": 0}] - result = _process_persons(_raw_person_frame(rows), 2023) - assert result["difficulty_seeing"].to_list() == [True, False, False] - - -def test_staging_disability_columns_do_not_leak(): - result = _process_persons(_raw_person_frame([{"PEDISEYE": 1}]), 2023) - for staging in PERSON_CPS_DIFFICULTY_LEAVES: # keys are the _disability_* staging - assert staging not in result.columns - # is_disabled is still computed from the same staging signal. 
- assert "is_disabled" in result.columns - assert result["is_disabled"].to_list() == [True] - - -def test_difficulty_leaves_are_contract_required(): - required = set(json.loads(_CONTRACT_PATH.read_text())["required"]) - for leaf in _PEDIS_TO_LEAF.values(): - assert leaf in required, f"{leaf} not in contract required" - - -def test_difficulty_leaves_export_wiring(): - from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_DEFAULTS, - POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - ) - - for leaf in _PEDIS_TO_LEAF.values(): - assert leaf in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert POLICYENGINE_US_EXPORT_DEFAULTS[leaf] is False - # Not a pe-us variable -> routed as a person-level dataset column. - assert POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES[leaf] == "person" diff --git a/tests/data_sources/test_cps_employer_insurance_and_partner.py b/tests/data_sources/test_cps_employer_insurance_and_partner.py deleted file mode 100644 index cdbbee6b..00000000 --- a/tests/data_sources/test_cps_employer_insurance_and_partner.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Tests for the CPS-derived recodes closing the G6/G8/G9 eCPS export gaps. - -The Enhanced CPS exports four person-level leaves that Microplex did not: - -- ``is_unmarried_partner_of_household_head`` -- a recode of the ASEC - relationship-to-householder code ``PERRP``: codes 43/44/46/47 mark an - unmarried partner of the household head. Mirrors the eCPS - ``perrp.isin(PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES)`` recode - (policyengine-us-data, unmerged branch - ``claude/document-census-tax-id-replacement``). - -- ``reported_owns_employer_sponsored_health_insurance_at_interview`` -- the ESI - policyholder flag ``NOW_OWNGRP == 1``. Mirrors the eCPS ESI imputation - (policyengine-us-data, unmerged branch ``max/esi-premiums-cbo``). 
- -- ``employer_sponsored_insurance_premiums`` -- annual employer-paid ESI premium - imputed from ``NOW_OWNGRP``/``NOW_HIPAID``/``NOW_GRPFTYP``/``PHIP_VAL`` plus - the MEPS-IC 2024 plan-type priors. Reproduces the eCPS - ``impute_employer_sponsored_insurance_premiums`` function (same branch); the - expected values below are that reference function's own outputs. - -- ``sstb_self_employment_income_would_be_qualified`` -- the SSTB QBI - qualification flag. eCPS never recodes this flag, so its export carries the - pe-us default (``default_value=True``). Microplex exports ``False`` instead; - because Microplex carries no SSTB self-employment income - (``business_is_sstb=False`` and ``sstb_self_employment_income_before_lsr=0``), - the section 199A SSTB component is zero under either value, so the choice is - tax-inert and passes the name-only column-parity gate. It is therefore - exported as a constant default rather than a CPS recode. - -The recode tests exercise the real ``_process_persons`` (no stubbing). The -ESI-premium expectations are cross-checked against the eCPS reference -implementation in ``tests/.../test_employer_sponsored_insurance_premiums.py``. -""" - -import math - -import polars as pl - -from microplex_us.data_sources.cps import ( - ESI_PLAN_PRIORS_2024, - PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES, - PERSON_VARIABLES, - _process_persons, -) - -# Raw ASEC fields that seed the new leaves, mapped via PERSON_VARIABLES to the -# underscore-prefixed staging columns consumed inside _process_persons. -_RAW_STAGING_COLUMNS = { - "PERRP": "_person_relationship_to_householder", - "NOW_OWNGRP": "_now_owngrp", - "NOW_HIPAID": "_now_hipaid", - "NOW_GRPFTYP": "_now_grpftyp", -} - -# Leaves produced by the new recodes (G6/G8). G9 is a constant default, not a -# recode, so it is asserted via the export config, not here. 
-_RECODE_LEAVES = ( - "is_unmarried_partner_of_household_head", - "reported_owns_employer_sponsored_health_insurance_at_interview", - "employer_sponsored_insurance_premiums", -) - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - """Raw CPS-style person frame carrying the new recode source fields. - - Census column names are used because ``_process_persons`` selects/renames - via ``PERSON_VARIABLES``. - """ - n = len(rows) - return pl.DataFrame( - { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [row.get("age", 40) for row in rows], - "PERRP": [row.get("perrp", 40) for row in rows], - "NOW_OWNGRP": [row.get("owngrp", 0) for row in rows], - "NOW_HIPAID": [row.get("hipaid", 0) for row in rows], - "NOW_GRPFTYP": [row.get("grpftyp", 0) for row in rows], - "PHIP_VAL": [row.get("phip", 0.0) for row in rows], - } - ) - - -# --------------------------------------------------------------------------- -# G8: is_unmarried_partner_of_household_head (PERRP recode) -# --------------------------------------------------------------------------- - - -def test_person_variables_maps_the_new_raw_fields(): - for census, staging in _RAW_STAGING_COLUMNS.items(): - assert PERSON_VARIABLES.get(census) == staging - - -def test_unmarried_partner_codes_match_ecps(): - # eCPS PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = {43, 44, 46, 47}. 
- assert set(PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES) == {43, 44, 46, 47} - - -def test_unmarried_partner_flag_recode(): - rows = [ - {"perrp": 43}, # opposite-sex partner with relatives -> True - {"perrp": 44}, # opposite-sex partner without relatives -> True - {"perrp": 46}, # same-sex partner with relatives -> True - {"perrp": 47}, # same-sex partner without relatives -> True - {"perrp": 40}, # reference person -> False - {"perrp": 1}, # spouse -> False - {"perrp": 45}, # adjacent code, deliberately excluded -> False - ] - result = _process_persons(_raw_person_frame(rows), 2023) - assert "is_unmarried_partner_of_household_head" in result.columns - assert result["is_unmarried_partner_of_household_head"].to_list() == [ - True, - True, - True, - True, - False, - False, - False, - ] - # The raw staging column must not leak into the processed frame. - assert "_person_relationship_to_householder" not in result.columns - - -def test_unmarried_partner_flag_is_boolean_dtype(): - result = _process_persons(_raw_person_frame([{"perrp": 43}, {"perrp": 40}]), 2023) - assert result.schema["is_unmarried_partner_of_household_head"] == pl.Boolean - - -# --------------------------------------------------------------------------- -# G6a: reported_owns_employer_sponsored_health_insurance_at_interview -# --------------------------------------------------------------------------- - - -def test_esi_policyholder_flag_recode(): - rows = [ - {"owngrp": 1}, # holds ESI in own name -> True - {"owngrp": 0}, # does not -> False - {"owngrp": 2}, # any non-1 code -> False - ] - result = _process_persons(_raw_person_frame(rows), 2023) - col = "reported_owns_employer_sponsored_health_insurance_at_interview" - assert col in result.columns - assert result[col].to_list() == [True, False, False] - assert result.schema[col] == pl.Boolean - - -# --------------------------------------------------------------------------- -# G6b: employer_sponsored_insurance_premiums (MEPS-prior imputation) -# 
--------------------------------------------------------------------------- - - -def test_esi_premium_matches_ecps_reference_fixture(): - """Reproduce the eCPS unit-test fixture exactly. - - Fixture and expectations are lifted from policyengine-us-data - ``tests/unit/test_employer_sponsored_insurance_premiums.py``:: - - NOW_OWNGRP = [1, 1, 1, 0, 1] - NOW_HIPAID = [1, 2, 2, 1, 2] - NOW_GRPFTYP = [2, 2, 1, 2, 1] - PHIP_VAL = [0, 1_200, 0, 0, 50_000] - """ - rows = [ - {"owngrp": 1, "hipaid": 1, "grpftyp": 2, "phip": 0}, - {"owngrp": 1, "hipaid": 2, "grpftyp": 2, "phip": 1_200}, - {"owngrp": 1, "hipaid": 2, "grpftyp": 1, "phip": 0}, - {"owngrp": 0, "hipaid": 1, "grpftyp": 2, "phip": 0}, - {"owngrp": 1, "hipaid": 2, "grpftyp": 2, "phip": 50_000}, - ] - result = _process_persons(_raw_person_frame(rows), 2023) - premiums = result["employer_sponsored_insurance_premiums"].to_list() - - self_only_total = ESI_PLAN_PRIORS_2024["self_only"]["total_premium"] - family_total = ESI_PLAN_PRIORS_2024["family"]["total_premium"] - family_employee = ESI_PLAN_PRIORS_2024["family"]["employee_contribution"] - expected = [ - self_only_total, # employer pays all, self-only plan - self_only_total - 1_200, # employer pays some, self-only, $1.2k employee - family_total - family_employee, # employer pays some, family, avg employee - 0.0, # not an own-name ESI holder - 0.0, # employer pays some but employee paid exceeds total -> clip at 0 - ] - assert len(premiums) == len(expected) - for got, want in zip(premiums, expected): - assert math.isclose(got, want, rel_tol=1e-9, abs_tol=1e-6), (got, want) - - -def test_esi_premium_is_zero_for_non_owners(): - # Even with a paid-premium status and a real plan type, a non-owner - # (NOW_OWNGRP != 1) must get a zero employer premium. 
- rows = [{"owngrp": 0, "hipaid": 1, "grpftyp": 1, "phip": 0}] - result = _process_persons(_raw_person_frame(rows), 2023) - assert result["employer_sponsored_insurance_premiums"].to_list() == [0.0] - - -def test_esi_premium_zero_when_no_employer_contribution(): - # NOW_HIPAID code other than 1/2 (e.g. 3 = employee pays all) -> no - # employer-paid premium even for a valid owner with a plan. - rows = [{"owngrp": 1, "hipaid": 3, "grpftyp": 1, "phip": 5_000}] - result = _process_persons(_raw_person_frame(rows), 2023) - assert result["employer_sponsored_insurance_premiums"].to_list() == [0.0] - - -def test_esi_staging_columns_are_dropped(): - result = _process_persons(_raw_person_frame([{"owngrp": 1}]), 2023) - for staging in ("_now_owngrp", "_now_hipaid", "_now_grpftyp"): - assert staging not in result.columns - - -def test_esi_premium_priors_match_ecps_meps_constants(): - # MEPS-IC Table IV.A.1 (private sector, 2024) constants, copied from eCPS. - assert math.isclose( - ESI_PLAN_PRIORS_2024["family"]["total_premium"], 21_207.52589669509 - ) - assert math.isclose( - ESI_PLAN_PRIORS_2024["family"]["employee_contribution"], 6_490.205059544782 - ) - assert math.isclose( - ESI_PLAN_PRIORS_2024["self_only"]["total_premium"], 8_389.275834815255 - ) - assert math.isclose( - ESI_PLAN_PRIORS_2024["self_only"]["employee_contribution"], - 1_909.5781466113417, - ) - - -# --------------------------------------------------------------------------- -# Export-config wiring (all four leaves) -# --------------------------------------------------------------------------- - - -def test_recode_leaves_in_export_allowlist_and_not_aliased(): - from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_COLUMN_ALIASES, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - ) - - for leaf in _RECODE_LEAVES: - assert leaf in SAFE_POLICYENGINE_US_EXPORT_VARIABLES, leaf - assert POLICYENGINE_US_EXPORT_COLUMN_ALIASES.get(leaf) is None, leaf - - -def test_esi_policyholder_has_legacy_person_entity(): - 
# The policyholder flag is not (yet) a released pe-us input variable, so the - # export entity is pinned in the legacy-contract map (person), like its - # reported_has_* siblings, to keep it on the eCPS-parity surface. - from microplex_us.policyengine.us import ( - POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES, - ) - - assert ( - POLICYENGINE_US_LEGACY_CONTRACT_VARIABLE_ENTITIES.get( - "reported_owns_employer_sponsored_health_insurance_at_interview" - ) - == "person" - ) - - -def test_sstb_qbi_flag_exported_as_constant_false_default(): - # G9: eCPS's np.where(business_is_sstb, ..., False) collapses to False for - # every record given MP's business_is_sstb=False default. Export it as the - # same constant False, overriding the pe-us default_value=True. - from microplex_us.policyengine.us import POLICYENGINE_US_EXPORT_DEFAULTS - - assert ( - POLICYENGINE_US_EXPORT_DEFAULTS["sstb_self_employment_income_would_be_qualified"] - is False - ) - # It must be internally consistent with MP's existing SSTB treatment. - assert POLICYENGINE_US_EXPORT_DEFAULTS["business_is_sstb"] is False - - -def test_all_four_columns_are_required_by_the_ecps_contract(): - # Every column this change adds is a REQUIRED (not forbidden) column in the - # frozen eCPS export-parity contract. 
- import json - from importlib import resources - - contract = json.loads( - resources.files("microplex_us.pipelines") - .joinpath("ecps_export_contract.json") - .read_text() - ) - required = set(contract["required"]) - forbidden = set(contract["forbidden"]) - for col in ( - "is_unmarried_partner_of_household_head", - "reported_owns_employer_sponsored_health_insurance_at_interview", - "employer_sponsored_insurance_premiums", - "sstb_self_employment_income_would_be_qualified", - ): - assert col in required, col - assert col not in forbidden, col - - -if __name__ == "__main__": - import traceback - - funcs = [v for k, v in sorted(globals().items()) if k.startswith("test_")] - passed = failed = 0 - for fn in funcs: - try: - fn() - print(f"PASS {fn.__name__}") - passed += 1 - except Exception: # noqa: BLE001 - print(f"FAIL {fn.__name__}") - traceback.print_exc() - failed += 1 - print(f"SUMMARY passed={passed} failed={failed}") - raise SystemExit(1 if failed else 0) diff --git a/tests/data_sources/test_cps_export_support_fields.py b/tests/data_sources/test_cps_export_support_fields.py deleted file mode 100644 index 054cdf48..00000000 --- a/tests/data_sources/test_cps_export_support_fields.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Tests for CPS-backed eCPS export-support fields. - -These fields are required by the eCPS export contract and are populated in the -incumbent enhanced CPS. Microplex previously exported many of them only through -constant defaults, so the presence gate passed while the support gate failed. 
-""" - -import numpy as np -import polars as pl -import pytest - -from microplex_us.data_sources.cps import ( - CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP, - PERSON_VARIABLES, - TAXABLE_PENSION_FRACTION, - _attach_cps_ssn_card_type, - _derive_cps_immigration_status, - _process_persons, -) - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - n = len(rows) - return pl.DataFrame( - { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [row.get("age", 40) for row in rows], - "A_HSCOL": [row.get("school", 0) for row in rows], - "A_HRLYWK": [row.get("hourly_code", 0) for row in rows], - "A_HRSPAY": [row.get("hourly_cents", -1) for row in rows], - "A_UNMEM": [row.get("union", 0) for row in rows], - "POCCU2": [row.get("poccu2", 0) for row in rows], - "PEIOOCC": [row.get("peioocc", -1) for row in rows], - "PNSN_VAL": [row.get("pension", 0.0) for row in rows], - "ANN_VAL": [row.get("annuity", 0.0) for row in rows], - "LKWEEKS": [row.get("weeks_unemployed", -1) for row in rows], - "VET_VAL": [row.get("veterans_benefits", 0.0) for row in rows], - "WC_VAL": [row.get("workers_compensation", 0.0) for row in rows], - "DST_SC1": [row.get("dst_sc1", 0) for row in rows], - "DST_VAL1": [row.get("dst_val1", 0.0) for row in rows], - "DST_SC2": [row.get("dst_sc2", 0) for row in rows], - "DST_VAL2": [row.get("dst_val2", 0.0) for row in rows], - "DST_SC1_YNG": [row.get("dst_sc1_yng", 0) for row in rows], - "DST_VAL1_YNG": [row.get("dst_val1_yng", 0.0) for row in rows], - "DST_SC2_YNG": [row.get("dst_sc2_yng", 0) for row in rows], - "DST_VAL2_YNG": [row.get("dst_val2_yng", 0.0) for row in rows], - "NOW_DIR": [row.get("now_dir", 2) for row in rows], - "NOW_MRK": [row.get("now_mrk", 2) for row in rows], - "NOW_MRKS": [row.get("now_mrks", 2) for row in rows], - "NOW_MRKUN": [row.get("now_mrkun", 2) for row in rows], - "NOW_NONM": [row.get("now_nonm", 2) for row in rows], - "NOW_GRP": [row.get("now_grp", 2) for row in rows], - "NOW_MCARE": 
[row.get("now_mcare", 2) for row in rows], - "NOW_CAID": [row.get("now_caid", 2) for row in rows], - "NOW_MCAID": [row.get("now_mcaid", 2) for row in rows], - "NOW_PCHIP": [row.get("now_pchip", 2) for row in rows], - "NOW_OTHMT": [row.get("now_othmt", 2) for row in rows], - "NOW_MIL": [row.get("now_mil", 2) for row in rows], - "NOW_CHAMPVA": [row.get("now_champva", 2) for row in rows], - "NOW_VACARE": [row.get("now_vacare", 2) for row in rows], - "NOW_IHSFLG": [row.get("now_ihs", 2) for row in rows], - "NOW_PRIV": [row.get("now_priv", 2) for row in rows], - "NOW_PUB": [row.get("now_pub", 2) for row in rows], - "NOW_COV": [row.get("now_cov", 2) for row in rows], - } - ) - - -def test_person_variables_maps_current_health_coverage_sources(): - for leaf, census_column in CURRENT_HEALTH_COVERAGE_REPORTED_VAR_MAP.items(): - assert PERSON_VARIABLES.get(census_column) == f"_{leaf}" - - -def test_process_persons_populates_health_coverage_support_fields(): - out = _process_persons( - _raw_person_frame( - [ - {"now_mrk": 1, "now_grp": 1, "now_priv": 1, "now_cov": 1}, - {"now_caid": 1, "now_mcaid": 1, "now_pub": 1, "now_cov": 1}, - {"now_cov": 2}, - ] - ), - 2025, - ) - - assert out["reported_has_marketplace_health_coverage_at_interview"].to_list() == [ - True, - False, - False, - ] - assert out[ - "reported_has_employer_sponsored_health_coverage_at_interview" - ].to_list() == [ - True, - False, - False, - ] - assert out["reported_has_medicaid_health_coverage_at_interview"].to_list() == [ - False, - True, - False, - ] - assert out["reported_has_multiple_health_coverage_at_interview"].to_list() == [ - True, - False, - False, - ] - assert out["has_marketplace_health_coverage_at_interview"].to_list() == [ - True, - False, - False, - ] - assert out["has_marketplace_health_coverage"].to_list() == [True, False, False] - assert out["has_esi"].to_list() == [True, False, False] - assert out["reported_is_insured_at_interview"].to_list() == [True, True, False] - assert 
out["reported_is_uninsured_at_interview"].to_list() == [False, False, True] - - -def test_process_persons_populates_labor_occupation_and_tipped_fields(): - out = _process_persons( - _raw_person_frame( - [ - { - "school": 2, - "hourly_code": 1, - "hourly_cents": 2150, - "union": 1, - "poccu2": 8, - "peioocc": 4000, - "weeks_unemployed": 12, - "veterans_benefits": 700.0, - "workers_compensation": 300.0, - }, - {"hourly_code": 2, "hourly_cents": -1, "poccu2": 52, "peioocc": -1}, - {"poccu2": 53, "weeks_unemployed": -1}, - ] - ), - 2025, - ) - - assert out["is_full_time_college_student"].to_list() == [True, False, False] - assert out["is_paid_hourly"].to_list() == [True, False, False] - assert out["hourly_wage"].to_list() == [21.5, 0.0, 0.0] - assert out["is_union_member_or_covered"].to_list() == [True, False, False] - assert out["detailed_occupation_recode"].to_list() == [8, 52, 53] - assert out["is_computer_scientist"].to_list() == [True, False, False] - assert out["is_military"].to_list() == [False, True, False] - assert out["has_never_worked"].to_list() == [False, False, True] - assert out["treasury_tipped_occupation_code"].to_list() == [105, 0, 0] - assert out["is_tipped_occupation"].to_list() == [True, False, False] - assert out["weeks_unemployed"].to_list() == [12, 0, 0] - assert out["veterans_benefits"].to_list() == [700.0, 0.0, 0.0] - assert out["workers_compensation"].to_list() == [300.0, 0.0, 0.0] - - -def test_process_persons_populates_pension_and_retirement_distribution_leaves(): - out = _process_persons( - _raw_person_frame( - [ - { - "pension": 10_000.0, - "annuity": 2_000.0, - "dst_sc1": 1, - "dst_val1": 1_500.0, - "dst_sc2": 4, - "dst_val2": 2_500.0, - }, - { - "dst_sc1": 2, - "dst_val1": 600.0, - "dst_sc2": 3, - "dst_val2": 700.0, - "dst_sc1_yng": 6, - "dst_val1_yng": 800.0, - "dst_sc2_yng": 7, - "dst_val2_yng": 900.0, - }, - ] - ), - 2025, - ) - - total_pension = 12_000.0 - assert out["pension_income"].to_list() == [total_pension, 0.0] - assert 
out["taxable_private_pension_income"].to_list() == pytest.approx( - [total_pension * TAXABLE_PENSION_FRACTION, 0.0] - ) - assert out["tax_exempt_private_pension_income"].to_list() == pytest.approx( - [total_pension * (1 - TAXABLE_PENSION_FRACTION), 0.0] - ) - assert out["taxable_401k_distributions"].to_list() == [1_500.0, 0.0] - assert out["regular_ira_distributions"].to_list() == [2_500.0, 0.0] - assert out["taxable_ira_distributions"].to_list() == [2_500.0, 0.0] - assert out["taxable_403b_distributions"].to_list() == [0.0, 600.0] - assert out["roth_ira_distributions"].to_list() == [0.0, 700.0] - assert out["tax_exempt_ira_distributions"].to_list() == [0.0, 700.0] - assert out["taxable_sep_distributions"].to_list() == [0.0, 800.0] - assert out["other_type_retirement_account_distributions"].to_list() == [0.0, 900.0] - - -def test_derive_cps_immigration_status_varies_from_ssn_card_type(): - status = _derive_cps_immigration_status( - ssn_card_type=np.array([1, 0, 2, 3]), - birth_country=np.array([57, 57, 57, 332]), - peinusyr=np.array([0, 29, 28, 20]), - age=np.array([40, 30, 30, 40]), - year=2024, - ) - - assert status.tolist() == [ - "CITIZEN", - "UNDOCUMENTED", - "TPS", - "CUBAN_HAITIAN_ENTRANT", - ] - - -def test_attach_cps_ssn_card_type_persists_identification_exports(): - persons = pl.DataFrame( - { - "household_id": [1, 2], - "year": [2025, 2025], - "age": [40, 30], - } - ) - households = pl.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}) - raw = pl.DataFrame( - { - "PRCITSHP": [1, 5], - "PEINUSYR": [0, 29], - "PENATVTY": [57, 57], - "A_HSCOL": [0, 0], - "A_AGE": [40, 30], - "A_MARITL": [0, 0], - "A_SPOUSE": [0, 0], - "MCARE": [0, 0], - "CAID": [0, 0], - "PEN_SC1": [0, 0], - "PEN_SC2": [0, 0], - "RESNSS1": [0, 0], - "RESNSS2": [0, 0], - "IHSFLG": [0, 0], - "CHAMPVA": [0, 0], - "MIL": [0, 0], - "PEIO1COW": [0, 0], - "A_MJOCC": [0, 0], - "SS_YN": [0, 0], - "SPM_ID": [1, 2], - "SPM_CAPHOUSESUB": [0.0, 0.0], - "PEAFEVER": [0, 0], - "SSI_YN": 
[0, 0], - "WSAL_VAL": [0.0, 0.0], - "SEMP_VAL": [0.0, 0.0], - } - ) - - out = _attach_cps_ssn_card_type( - persons=persons, - households=households, - persons_raw=raw, - ) - - assert out["ssn_card_type"].to_list() == ["CITIZEN", "NONE"] - assert out["has_valid_ssn"].to_list() == [True, False] - assert out["taxpayer_id_type"].to_list() == ["VALID_SSN", "NONE"] - assert out["immigration_status_str"].to_list() == ["CITIZEN", "UNDOCUMENTED"] diff --git a/tests/data_sources/test_cps_retirement_contributions.py b/tests/data_sources/test_cps_retirement_contributions.py deleted file mode 100644 index 959ff586..00000000 --- a/tests/data_sources/test_cps_retirement_contributions.py +++ /dev/null @@ -1,316 +0,0 @@ -"""Tests for the CPS retirement-contribution desired-leaf split (G2 gap). - -The Enhanced CPS splits the single bundled CPS retirement-contribution total -(``RETCB_VAL``) into five account-type-specific *desired* (pre-statutory-limit) -contribution leaves, using a proportional split with IRS SOI / BEA-FRED / -Vanguard-PSCA shares (PolicyEngine/policyengine-us-data -``policyengine_us_data/datasets/cps/cps.py:1500-1552``; share values in -``policyengine_us_data/datasets/cps/imputation_parameters.yaml``): - -- ``self_employed_pension_contributions_desired`` (se share, gated on SE income) -- ``traditional_401k_contributions_desired`` (DC pool, gated on wages) -- ``roth_401k_contributions_desired`` (DC pool, gated on wages) -- ``traditional_ira_contributions_desired`` (IRA pool, any earned income) -- ``roth_ira_contributions_desired`` (IRA pool, any earned income) - -Microplex produced none of them: ``RETCB_VAL`` was not mapped in -``PERSON_VARIABLES`` and the leaves were absent from the export allowlist, so -they never reached the H5. 
These tests exercise the real ``_process_persons`` -(no stubbing) to prove the split happens with the exact eCPS gating and share -math, that the five leaves reconcile to ``RETCB_VAL`` for earned-income -records, that gating zeroes the right pools, and that the leaves are in the -PolicyEngine-US export allowlist and not aliased away. -""" - -import numpy as np -import polars as pl - -from microplex_us.data_sources.cps import ( - DC_SHARE_OF_RETIREMENT_CONTRIBUTIONS, - PERSON_VARIABLES, - RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR, - ROTH_SHARE_OF_DC_CONTRIBUTIONS, - SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS, - TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS, - _process_persons, -) - -_LEAVES = ( - "self_employed_pension_contributions_desired", - "traditional_401k_contributions_desired", - "roth_401k_contributions_desired", - "traditional_ira_contributions_desired", - "roth_ira_contributions_desired", -) -_CAPPED_LEAVES = ( - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", -) - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - """Raw CPS-style person frame carrying RETCB_VAL plus the earned-income - fields that gate the split. - - Census column names are used because ``_process_persons`` selects/renames - via ``PERSON_VARIABLES`` (RETCB_VAL -> _retirement_contributions -> - desired leaves). 
- """ - n = len(rows) - return pl.DataFrame( - { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [row.get("age", 40) for row in rows], - "WSAL_VAL": [float(row.get("wages", 0.0)) for row in rows], - "SEMP_VAL": [float(row.get("se", 0.0)) for row in rows], - "RETCB_VAL": [float(row.get("retcb", 0.0)) for row in rows], - } - ) - - -def _expected_split(retcb: float, wages: float, se: float) -> dict[str, float]: - """Recompute the eCPS split independently (cps.py:1514-1552).""" - has_wages = wages > 0 - has_se = se > 0 - has_earned = has_wages or has_se - se_pension = retcb * SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS if has_se else 0.0 - remaining = max(retcb - se_pension, 0.0) - dc_pool = remaining * DC_SHARE_OF_RETIREMENT_CONTRIBUTIONS if has_wages else 0.0 - ira_pool = (remaining - dc_pool) if has_earned else 0.0 - return { - "self_employed_pension_contributions_desired": se_pension, - "traditional_401k_contributions_desired": dc_pool - * (1 - ROTH_SHARE_OF_DC_CONTRIBUTIONS), - "roth_401k_contributions_desired": dc_pool * ROTH_SHARE_OF_DC_CONTRIBUTIONS, - "traditional_ira_contributions_desired": ira_pool - * TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS, - "roth_ira_contributions_desired": ira_pool - * (1 - TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS), - } - - -def _expected_capped_split( - retcb: float, - wages: float, - se: float, - age: int, - *, - year: int, -) -> dict[str, float]: - """Recompute the final account leaves by capping each desired pool.""" - limits = RETIREMENT_CONTRIBUTION_LIMITS_BY_YEAR[year] - catch_up = age >= 50 - limit_401k = limits["401k"] + catch_up * limits["401k_catch_up"] - limit_ira = limits["ira"] + catch_up * limits["ira_catch_up"] - desired = _expected_split(retcb, wages, se) - traditional_401k = min( - desired["traditional_401k_contributions_desired"], - limit_401k, - ) - roth_401k = min( - desired["roth_401k_contributions_desired"], - max(limit_401k - traditional_401k, 0.0), - ) - 
traditional_ira = min( - desired["traditional_ira_contributions_desired"], - limit_ira, - ) - roth_ira = min( - desired["roth_ira_contributions_desired"], - max(limit_ira - traditional_ira, 0.0), - ) - return { - "self_employed_pension_contributions": desired[ - "self_employed_pension_contributions_desired" - ], - "traditional_401k_contributions": traditional_401k, - "roth_401k_contributions": roth_401k, - "traditional_ira_contributions": traditional_ira, - "roth_ira_contributions": roth_ira, - } - - -def test_share_constants_trace_to_imputation_parameters_yaml(): - """The four scalars must equal the eCPS imputation_parameters.yaml values.""" - assert SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS == 0.046 # yaml line 30 - assert DC_SHARE_OF_RETIREMENT_CONTRIBUTIONS == 0.908 # yaml line 38 - assert ROTH_SHARE_OF_DC_CONTRIBUTIONS == 0.15 # yaml line 48 - assert TRADITIONAL_SHARE_OF_IRA_CONTRIBUTIONS == 0.392 # yaml line 55 - - -def test_person_variables_stages_retcb_val(): - """RETCB_VAL is mapped to the staging column consumed by the split.""" - assert PERSON_VARIABLES.get("RETCB_VAL") == "_retirement_contributions" - - -def test_process_persons_produces_retirement_leaves_and_drops_staging(): - rows = [ - {"wages": 50_000.0, "se": 0.0, "retcb": 10_000.0}, - {"wages": 0.0, "se": 80_000.0, "retcb": 5_000.0}, - {"wages": 30_000.0, "se": 10_000.0, "retcb": 2_000.0}, - ] - out = _process_persons(_raw_person_frame(rows), 2023) - for leaf in _LEAVES + _CAPPED_LEAVES: - assert leaf in out.columns, f"{leaf} not produced" - # Staging column must not leak into the processed frame. 
- assert "_retirement_contributions" not in out.columns - - -def test_split_matches_ecps_math_exactly(): - """Every leaf equals the independent eCPS recomputation, row by row.""" - rows = [ - {"wages": 50_000.0, "se": 0.0, "retcb": 10_000.0}, # wages only - {"wages": 0.0, "se": 80_000.0, "retcb": 5_000.0}, # SE only - {"wages": 30_000.0, "se": 10_000.0, "retcb": 2_000.0}, # both - {"wages": 90_000.0, "se": 0.0, "retcb": 23_000.0}, # high earner - ] - out = _process_persons(_raw_person_frame(rows), 2023) - for i, row in enumerate(rows): - expected = _expected_split(row["retcb"], row["wages"], row["se"]) - for leaf in _LEAVES: - got = out[leaf].to_list()[i] - assert got == expected[leaf], f"row {i} {leaf}: {got} != {expected[leaf]}" - - -def test_capped_split_matches_desired_account_pools_with_limits(): - rows = [ - {"wages": 50_000.0, "se": 0.0, "retcb": 10_000.0, "age": 40}, - {"wages": 0.0, "se": 80_000.0, "retcb": 5_000.0, "age": 40}, - {"wages": 90_000.0, "se": 0.0, "retcb": 80_000.0, "age": 52}, - ] - out = _process_persons(_raw_person_frame(rows), 2023) - for i, row in enumerate(rows): - expected = _expected_capped_split( - row["retcb"], - row["wages"], - row["se"], - row["age"], - year=2023, - ) - for leaf in _CAPPED_LEAVES: - got = out[leaf].to_list()[i] - assert got == expected[leaf], f"row {i} {leaf}: {got} != {expected[leaf]}" - - -def test_capped_split_preserves_ira_support_below_401k_limit(): - rows = [ - {"wages": 50_000.0, "se": 0.0, "retcb": 10_000.0, "age": 40}, - ] - out = _process_persons(_raw_person_frame(rows), 2024) - - assert out["traditional_401k_contributions"].to_list()[0] > 0 - assert out["roth_401k_contributions"].to_list()[0] > 0 - assert out["traditional_ira_contributions"].to_list()[0] > 0 - assert out["roth_ira_contributions"].to_list()[0] > 0 - - -def test_five_leaves_reconcile_to_retcb_for_earned_income_records(): - """For anyone with earned income the five leaves sum back to RETCB_VAL.""" - rows = [ - {"wages": 50_000.0, "se": 0.0, 
"retcb": 10_000.0}, - {"wages": 0.0, "se": 80_000.0, "retcb": 5_000.0}, - {"wages": 30_000.0, "se": 10_000.0, "retcb": 2_000.0}, - {"wages": 20_000.0, "se": 0.0, "retcb": 7_345.67}, - ] - out = _process_persons(_raw_person_frame(rows), 2023) - total = np.zeros(len(rows)) - for leaf in _LEAVES: - total += np.array(out[leaf].to_list(), dtype=float) - retcb = np.array([row["retcb"] for row in rows], dtype=float) - assert np.allclose(total, retcb, atol=1e-6), f"{total} != {retcb}" - - -def test_se_pension_gated_on_self_employment_income(): - """Self-employed pension is nonzero only when SE income is positive.""" - rows = [ - {"wages": 40_000.0, "se": 0.0, "retcb": 6_000.0}, # no SE -> 0 se pension - {"wages": 0.0, "se": 40_000.0, "retcb": 6_000.0}, # SE -> nonzero se pension - {"wages": 40_000.0, "se": 40_000.0, "retcb": 6_000.0}, # both -> nonzero - ] - out = _process_persons(_raw_person_frame(rows), 2023) - se = out["self_employed_pension_contributions_desired"].to_list() - assert se[0] == 0.0 - assert se[1] > 0.0 - assert se[2] > 0.0 - # Exact value when SE present: se_share * RETCB. 
- assert se[1] == 6_000.0 * SE_PENSION_SHARE_OF_RETIREMENT_CONTRIBUTIONS - - -def test_dc_401k_gated_on_wages(): - """401(k) pool is nonzero only when wages are positive.""" - rows = [ - {"wages": 0.0, "se": 50_000.0, "retcb": 8_000.0}, # SE only -> 401k == 0 - {"wages": 50_000.0, "se": 0.0, "retcb": 8_000.0}, # wages -> 401k > 0 - ] - out = _process_persons(_raw_person_frame(rows), 2023) - trad = out["traditional_401k_contributions_desired"].to_list() - roth = out["roth_401k_contributions_desired"].to_list() - assert trad[0] == 0.0 and roth[0] == 0.0, "no 401(k) without an employer/wages" - assert trad[1] > 0.0 and roth[1] > 0.0, "401(k) expected with wages" - - -def test_no_earned_income_yields_zero_everywhere(): - """A record with RETCB but no wages and no SE produces all-zero leaves.""" - rows = [{"wages": 0.0, "se": 0.0, "retcb": 9_000.0}] - out = _process_persons(_raw_person_frame(rows), 2023) - for leaf in _LEAVES: - assert out[leaf].to_list()[0] == 0.0, ( - f"{leaf} should be 0 without earned income" - ) - - -def test_zero_retcb_yields_zero_everywhere(): - """A worker who contributed nothing has zero across all five leaves.""" - rows = [{"wages": 60_000.0, "se": 20_000.0, "retcb": 0.0}] - out = _process_persons(_raw_person_frame(rows), 2023) - for leaf in _LEAVES: - assert out[leaf].to_list()[0] == 0.0, f"{leaf} should be 0 with zero RETCB" - - -def test_split_is_non_degenerate(): - """Across records each leaf takes several distinct positive values.""" - rows = [ - {"wages": 50_000.0, "se": 5_000.0, "retcb": 12_000.0}, - {"wages": 80_000.0, "se": 0.0, "retcb": 20_000.0}, - {"wages": 0.0, "se": 70_000.0, "retcb": 9_000.0}, - {"wages": 35_000.0, "se": 15_000.0, "retcb": 6_500.0}, - ] - out = _process_persons(_raw_person_frame(rows), 2023) - for leaf in _LEAVES: - positive = [v for v in out[leaf].to_list() if v > 0] - assert len(positive) >= 2, f"{leaf} should be positive for several records" - assert len(set(positive)) >= 2, f"{leaf} should not be a single 
constant" - - -def test_leaves_in_export_allowlist_and_not_aliased(): - from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_COLUMN_ALIASES, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - ) - - for leaf in _LEAVES + _CAPPED_LEAVES: - assert leaf in SAFE_POLICYENGINE_US_EXPORT_VARIABLES, f"{leaf} not exported" - assert POLICYENGINE_US_EXPORT_COLUMN_ALIASES.get(leaf) is None - - -if __name__ == "__main__": - import traceback - - funcs = [v for k, v in sorted(globals().items()) if k.startswith("test_")] - passed = failed = 0 - for fn in funcs: - try: - fn() - print(f"PASS {fn.__name__}") - passed += 1 - except Exception: # noqa: BLE001 - print(f"FAIL {fn.__name__}") - traceback.print_exc() - failed += 1 - print(f"SUMMARY passed={passed} failed={failed}") - raise SystemExit(1 if failed else 0) diff --git a/tests/data_sources/test_cps_social_security_retirement.py b/tests/data_sources/test_cps_social_security_retirement.py deleted file mode 100644 index c618963d..00000000 --- a/tests/data_sources/test_cps_social_security_retirement.py +++ /dev/null @@ -1,265 +0,0 @@ -"""Tests for the CPS Social Security reason-code split, focused on the -``social_security_retirement`` leaf that drives the ``national_ssa`` loss family. - -``social_security_retirement`` is constructed in ``_process_persons`` by -splitting the bundled CPS ``SS_VAL`` (mapped to ``social_security``) across the -four benefit reasons using the ASEC ``RESNSS1``/``RESNSS2`` reason codes, with an -age-62 fallback for records whose reason is unclassified. This mirrors eCPS -``policyengine_us_data/datasets/cps/cps.py`` (the SS reason-code split + age-62 -fallback). The leaf is produced today but, before this change, was missing from -the policyengine-us export allowlist, so it never reached the exported H5. 
- -These tests exercise the real split (they do NOT stub ``_process_persons``) and -assert reconciliation, dominance, fallback, the both-reasons-present drop -edge case, and export-allowlist -properties. They run on tiny synthetic frames, so no weighting is involved; -national-total accuracy against SSA/IRS targets is validated downstream by the -eCPS comparison harness, not here. -""" - -import polars as pl - -from microplex_us.data_sources.cps import ( - MINIMUM_RETIREMENT_AGE, - SOCIAL_SECURITY_DEPENDENT_REASON_CODES, - SOCIAL_SECURITY_DISABILITY_REASON_CODE, - SOCIAL_SECURITY_RETIREMENT_REASON_CODE, - SOCIAL_SECURITY_SURVIVOR_REASON_CODES, - _process_persons, -) - -# The four leaves the SS_VAL reason-code split produces. -_SS_COMPONENTS = ( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", -) - - -def _raw_person_frame(rows: list[dict]) -> pl.DataFrame: - """Build a raw CPS-style person frame with the columns the split consumes. - - Census column names are used because ``_process_persons`` selects/renames via - ``PERSON_VARIABLES`` before running the split. - """ - n = len(rows) - return pl.DataFrame( - { - "PH_SEQ": [1] * n, - "A_LINENO": list(range(1, n + 1)), - "A_FNLWGT": [100.0] * n, - "A_AGE": [row["age"] for row in rows], - "SS_VAL": [row["ss"] for row in rows], - "RESNSS1": [row.get("r1", 0) for row in rows], - "RESNSS2": [row.get("r2", 0) for row in rows], - } - ) - - -def test_reason_codes_match_ecps_constants(): - """The reason-code constants mirror eCPS cps.py classification.""" - assert SOCIAL_SECURITY_RETIREMENT_REASON_CODE == 1 - assert SOCIAL_SECURITY_DISABILITY_REASON_CODE == 2 - assert SOCIAL_SECURITY_SURVIVOR_REASON_CODES == (3, 5) - assert SOCIAL_SECURITY_DEPENDENT_REASON_CODES == (4, 6, 7) - assert MINIMUM_RETIREMENT_AGE == 62 - - -def test_split_components_sum_to_total_social_security(): - """The four reason-coded components reconstruct the bundled SS_VAL total. 
- - Every recipient carries a classified reason (or an age-based fallback), so - summing the four components must equal ``social_security`` person-by-person. - """ - rows = [ - {"age": 70, "ss": 20_000.0, "r1": 1}, # retirement - {"age": 50, "ss": 15_000.0, "r1": 2}, # disability - {"age": 40, "ss": 12_000.0, "r1": 3}, # survivor - {"age": 10, "ss": 8_000.0, "r1": 4}, # dependent - {"age": 67, "ss": 18_000.0, "r1": 0}, # unclassified -> age>=62 retirement - {"age": 45, "ss": 9_000.0, "r1": 0}, # unclassified -> age<62 disability - {"age": 30, "ss": 0.0, "r1": 0}, # non-recipient - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - for component in _SS_COMPONENTS: - assert component in result.columns, f"{component} not produced" - - component_sum = result.select( - sum(pl.col(component) for component in _SS_COMPONENTS).alias("total") - )["total"] - total_ss = result["social_security"] - for got, expected in zip(component_sum.to_list(), total_ss.to_list()): - assert abs(got - expected) < 1e-6 - - -def test_retirement_is_the_dominant_component(): - """On a retiree-heavy aged population, retirement dominates the SS split. - - SSA program data: OASI retirement benefits are by far the largest Social - Security component, so a population skewed to ages 62+ must produce a - retirement total larger than each of disability/survivors/dependents. 
- """ - rows = [{"age": age, "ss": 20_000.0, "r1": 1} for age in (66, 68, 70, 72, 75)] - rows += [{"age": age, "ss": 18_000.0, "r1": 0} for age in (63, 67, 71)] - rows += [ - {"age": 50, "ss": 14_000.0, "r1": 2}, # disability - {"age": 35, "ss": 10_000.0, "r1": 3}, # survivor - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - totals = { - component: float(result[component].sum()) for component in _SS_COMPONENTS - } - assert totals["social_security_retirement"] > totals["social_security_disability"] - assert totals["social_security_retirement"] > totals["social_security_survivors"] - assert totals["social_security_retirement"] > totals["social_security_dependents"] - - -def test_retirement_values_are_non_degenerate(): - """Retirement is neither all-zero nor a single constant across recipients.""" - rows = [ - {"age": 66, "ss": 12_000.0, "r1": 1}, - {"age": 70, "ss": 24_000.0, "r1": 1}, - {"age": 64, "ss": 18_000.0, "r1": 0}, # age>=62 fallback -> retirement - {"age": 50, "ss": 15_000.0, "r1": 2}, # disability, NOT retirement - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - retirement = result["social_security_retirement"] - positive = [value for value in retirement.to_list() if value > 0] - assert len(positive) >= 2, "retirement should be positive for several records" - assert len(set(positive)) >= 2, "retirement values should not be a single constant" - assert float(retirement.sum()) > 0.0 - - -def test_age_62_fallback_routes_unclassified_by_age(): - """Unclassified recipients route to retirement iff age >= 62, else disability.""" - rows = [ - {"age": 62, "ss": 10_000.0, "r1": 0}, # exactly 62 -> retirement - {"age": 61, "ss": 10_000.0, "r1": 0}, # 61 -> disability - ] - result = _process_persons(_raw_person_frame(rows), 2023) - - retirement = result["social_security_retirement"].to_list() - disability = result["social_security_disability"].to_list() - assert retirement[0] == 10_000.0 - assert disability[0] == 0.0 - assert 
retirement[1] == 0.0 - assert disability[1] == 10_000.0 - - -def test_explicit_reason_code_overrides_age_fallback(): - """A classified disability reason stays disability even at retirement age.""" - rows = [{"age": 70, "ss": 16_000.0, "r1": 2}] # disability code at age 70 - result = _process_persons(_raw_person_frame(rows), 2023) - - assert result["social_security_disability"].to_list()[0] == 16_000.0 - assert result["social_security_retirement"].to_list()[0] == 0.0 - - -def test_both_retirement_and_disability_reasons_is_a_drop_edge_case(): - """When BOTH retirement and disability reasons are present, the value drops. - - The split gates retirement on ``has_retirement & ~has_disability`` and - disability on ``has_disability & ~has_retirement`` (and the record is not - "unclassified" because it carries reasons), so a record coded for BOTH lands - in none of the four components: every component is 0 even though - ``social_security`` is positive. This is a rare degenerate ASEC coding - (simultaneous retirement+disability reason) and is documented here as the - leaf's deterministic behavior rather than silently assumed. It is the one - case where the four components do not reconstruct the total. Do not "fix" by - giving retirement priority without first confirming the matching eCPS - version's behavior. - """ - rows = [{"age": 68, "ss": 22_000.0, "r1": 1, "r2": 2}] - result = _process_persons(_raw_person_frame(rows), 2023) - - assert result["social_security_retirement"].to_list()[0] == 0.0 - assert result["social_security_disability"].to_list()[0] == 0.0 - assert result["social_security_survivors"].to_list()[0] == 0.0 - assert result["social_security_dependents"].to_list()[0] == 0.0 - # The total is still the bundled SS_VAL; only the component split drops it. - assert result["social_security"].to_list()[0] == 22_000.0 - - -def test_social_security_retirement_in_export_allowlist(): - """The leaf must be in the export allowlist or it never reaches the H5. 
- - The alias map must NOT remap it to a different (reported) companion leaf. - """ - from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_COLUMN_ALIASES, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - ) - - assert "social_security_retirement" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "social_security_disability" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "social_security_survivors" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "social_security_dependents" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert ( - POLICYENGINE_US_EXPORT_COLUMN_ALIASES.get("social_security_retirement") - is None - ) - - -def test_social_security_retirement_survives_computed_export_guard(): - """A future pe-us that re-adds a formula must not silently drop the leaf. - - The export path filters out PolicyEngine-computed variables unless they are - in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES. Allowlisting the leaf - alone is not enough: if a pe-us version re-introduces the historical - fallback formula, the computed-export guard would strip it before the - hard-raise validation runs, leaving the column silently missing. Pin the - insurance: the leaf is in the override set, so the guard always keeps it. - """ - from microplex_us.policyengine.us import ( - POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES, - POLICYENGINE_US_DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES, - detect_policyengine_computed_export_variables, - ) - - assert ( - "social_security_retirement" - in POLICYENGINE_US_DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES - ) - assert ( - "social_security_retirement" - in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES - ) - - # Simulate a pe-us where the leaf has a formula: the computed-export - # detector must NOT mark it for exclusion, because it is allow-listed. 
- class _FormulaVar: - formulas = {"2024": object()} - adds = None - subtracts = None - - class _SystemWithFormula: - variables = {"social_security_retirement": _FormulaVar()} - - excluded = detect_policyengine_computed_export_variables( - _SystemWithFormula(), ["social_security_retirement"] - ) - assert "social_security_retirement" not in excluded - - -# Standalone runner so the suite executes without pytest installed in the env. -if __name__ == "__main__": - import traceback - - funcs = [v for k, v in sorted(globals().items()) if k.startswith("test_")] - passed = 0 - failed = 0 - for fn in funcs: - try: - fn() - print(f"PASS {fn.__name__}") - passed += 1 - except Exception: # noqa: BLE001 - print(f"FAIL {fn.__name__}") - traceback.print_exc() - failed += 1 - print(f"SUMMARY passed={passed} failed={failed}") - raise SystemExit(1 if failed else 0) diff --git a/tests/data_sources/test_scf_net_worth_components.py b/tests/data_sources/test_scf_net_worth_components.py deleted file mode 100644 index 6149df42..00000000 --- a/tests/data_sources/test_scf_net_worth_components.py +++ /dev/null @@ -1,459 +0,0 @@ -"""SCF net-worth component leaves: manifest wiring, rebalance, and a donor fit. - -Covers the 19 SCF balance-sheet component columns added to the ``scf`` -source-impute block (G1). Mirrors policyengine-us-data eCPS: - - * map / signs / rebalance: utils/asset_imputation.py (upstream/main) - * per-leaf QRF + rebalance: calibration/source_impute.py _impute_scf - (upstream/main L1113-1344) - -What is covered here: - - The 19 leaves are present in the scf block target/person/loader columns - and the loader raw-column map equals SCF_NET_WORTH_COMPONENT_TARGETS. - - The ported rebalance reconciles signed components to a net_worth anchor - within the eCPS float32 tolerance (unit test on synthetic components). - - Debt leaves are stored as positive magnitudes and subtract in net worth. 
- - A small synthetic zero-inflated donor fit on the SCF predictors produces - non-degenerate, non-negative per-leaf predictions (smoke test, not a full - donor run). The runnable path uses ColumnwiseQRFDonorImputer; the - production RegimeAwareDonorImputer variant skips without microimpute. - - The real policyengine-us-data SCF_2022 loader (via the manifest-backed - dataset-loader subprocess) carries all 19 leaves with positive-magnitude - mass (env-gated; skips without a policyengine-us-data checkout). - -Not covered here (would require a full pipeline run): - - End-to-end SCF donor synthesis fit on real CPS receivers + rebalance. - - Final-H5 export of the leaves through the legacy-contract entity path - (verified separately against the export resolver, not in this file). -""" - -from __future__ import annotations - -import json -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.asset_reconciliation import ( - NET_WORTH_COMPONENT_SIGNS, - SCF_NET_WORTH_COMPONENT_TARGETS, - SCF_NET_WORTH_COMPONENT_VARIABLES, - compute_net_worth_from_components, - rebalance_scf_net_worth_components, -) - -SCF_COMPONENT_LEAVES = ( - "scf_certificates_of_deposit", - "scf_savings_bonds", - "scf_retirement_assets", - "scf_cash_value_life_insurance", - "scf_other_managed_assets", - "scf_other_financial_assets", - "scf_primary_residence_value", - "scf_other_residential_real_estate", - "scf_nonresidential_real_estate_equity", - "scf_business_equity", - "scf_other_nonfinancial_assets", - "scf_mortgage_debt", - "scf_other_residential_debt", - "scf_other_lines_of_credit", - "scf_credit_card_debt", - "scf_vehicle_installment_debt", - "scf_student_loan_debt", - "scf_other_installment_debt", - "scf_other_debt", -) - -# eCPS raw SCF summary-extract source column per leaf -# (asset_imputation.py SCF_NET_WORTH_COMPONENT_TARGETS). 
-EXPECTED_RAW_SOURCE_COLUMN = { - "scf_certificates_of_deposit": "cds", - "scf_savings_bonds": "savbnd", - "scf_retirement_assets": "retqliq", - "scf_cash_value_life_insurance": "cashli", - "scf_other_managed_assets": "othma", - "scf_other_financial_assets": "othfin", - "scf_primary_residence_value": "houses", - "scf_other_residential_real_estate": "oresre", - "scf_nonresidential_real_estate_equity": "nnresre", - "scf_business_equity": "bus", - "scf_other_nonfinancial_assets": "othnfin", - "scf_mortgage_debt": "mrthel", - "scf_other_residential_debt": "resdbt", - "scf_other_lines_of_credit": "othloc", - "scf_credit_card_debt": "ccbal", - "scf_vehicle_installment_debt": "veh_inst", - "scf_student_loan_debt": "edn_inst", - "scf_other_installment_debt": "oth_inst", - "scf_other_debt": "odebt", -} - -DEBT_LEAVES = frozenset( - leaf for leaf, sign in NET_WORTH_COMPONENT_SIGNS.items() if sign < 0 -) & frozenset(SCF_COMPONENT_LEAVES) - -_MANIFEST_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "manifests" - / "pe_source_impute_blocks.json" -) - - -def _scf_block() -> dict: - payload = json.loads(_MANIFEST_PATH.read_text()) - return payload["blocks"]["scf"] - - -class TestScfBlockManifest: - """The scf source-impute block must carry the 19 component leaves.""" - - def test_all_nineteen_in_target_variables(self) -> None: - target_variables = set(_scf_block()["target_variables"]) - missing = [ - leaf for leaf in SCF_COMPONENT_LEAVES if leaf not in target_variables - ] - assert not missing, f"missing from scf target_variables: {missing}" - - def test_all_nineteen_in_person_variables(self) -> None: - person_variables = set(_scf_block()["person_variables"]) - missing = [ - leaf for leaf in SCF_COMPONENT_LEAVES if leaf not in person_variables - ] - assert not missing, f"missing from scf person_variables: {missing}" - - def test_loader_raw_column_map_matches_ecps(self) -> None: - """Each leaf maps to the eCPS raw SCF column, no sign flip at 
load.""" - direct = _scf_block()["dataset_loader"]["direct_person_columns"] - for leaf, raw_column in EXPECTED_RAW_SOURCE_COLUMN.items(): - assert direct.get(leaf) == raw_column, ( - f"{leaf} should load from raw SCF column '{raw_column}', " - f"got {direct.get(leaf)!r}" - ) - - def test_target_map_matches_ecps_component_targets(self) -> None: - """The ported leaf->raw map equals eCPS SCF_NET_WORTH_COMPONENT_TARGETS.""" - for leaf, raw_column in EXPECTED_RAW_SOURCE_COLUMN.items(): - assert SCF_NET_WORTH_COMPONENT_TARGETS[leaf] == (raw_column,) - assert set(SCF_NET_WORTH_COMPONENT_VARIABLES) == set(SCF_COMPONENT_LEAVES) - - def test_resolved_spec_object_carries_leaves(self) -> None: - """The spec object the SCF provider uses carries the 19 leaves. - - Exercises the same get_pe_source_impute_block_spec('scf') the provider - resolves, so the dataset-loader column map and target list are checked - on the live spec, not just the raw JSON. - """ - from microplex_us.data_sources.donor_surveys import ( - get_pe_source_impute_block_spec, - ) - - spec = get_pe_source_impute_block_spec("scf") - direct = spec.dataset_loader.direct_person_columns - for leaf, raw_column in EXPECTED_RAW_SOURCE_COLUMN.items(): - assert direct.get(leaf) == raw_column - assert leaf in spec.target_variables - assert leaf in spec.person_variables - - -class TestNetWorthSigns: - """Debt leaves subtract; asset leaves add.""" - - def test_eight_scf_debt_leaves_carry_negative_sign(self) -> None: - expected_debt = { - "scf_mortgage_debt", - "scf_other_residential_debt", - "scf_other_lines_of_credit", - "scf_credit_card_debt", - "scf_vehicle_installment_debt", - "scf_student_loan_debt", - "scf_other_installment_debt", - "scf_other_debt", - } - assert DEBT_LEAVES == expected_debt - - def test_asset_leaves_have_no_negative_sign(self) -> None: - asset_leaves = set(SCF_COMPONENT_LEAVES) - DEBT_LEAVES - for leaf in asset_leaves: - assert NET_WORTH_COMPONENT_SIGNS.get(leaf, 1.0) >= 0 - - def 
test_debt_stored_positive_subtracts_in_net_worth(self) -> None: - """A positive-magnitude debt leaf reduces net worth.""" - components = { - "scf_primary_residence_value": np.array([100_000.0], dtype=np.float32), - "scf_mortgage_debt": np.array([40_000.0], dtype=np.float32), - } - net_worth = compute_net_worth_from_components(components=components) - assert net_worth[0] == pytest.approx(60_000.0) - - -def _synthetic_components( - n: int, - *, - seed: int, - include_protected: bool = True, -) -> tuple[dict[str, np.ndarray], np.ndarray]: - """Zero-inflated positive-magnitude component draws + a net-worth anchor. - - Emulates the per-leaf QRF output: each leaf is a non-negative magnitude - with a leaf-specific zero share, the way the real SCF leaves look. - """ - rng = np.random.default_rng(seed) - variables = list(SCF_NET_WORTH_COMPONENT_VARIABLES) - if include_protected: - variables += [ - "bank_account_assets", - "stock_assets", - "bond_assets", - "household_vehicles_value", - ] - components: dict[str, np.ndarray] = {} - for variable in variables: - scale = rng.uniform(1e3, 5e5) - values = rng.exponential(scale=scale, size=n).astype(np.float32) - zero_share = rng.uniform(0.0, 0.7) - values[rng.random(n) < zero_share] = 0.0 - components[variable] = values - target = rng.normal(2e5, 6e5, size=n).astype(np.float32) - return components, target - - -class TestRebalanceReconciliation: - """After rebalance, signed components must reconcile to the net-worth anchor.""" - - def test_reconciles_within_ecps_tolerance(self) -> None: - components, target = _synthetic_components(5000, seed=11) - adjusted = rebalance_scf_net_worth_components( - components={k: v.copy() for k, v in components.items()}, - target_net_worth=target.copy(), - ) - net_worth = compute_net_worth_from_components(components=adjusted) - abs_diff = np.abs(net_worth.astype(np.float64) - target.astype(np.float64)) - rel_diff = abs_diff / np.maximum(np.abs(target.astype(np.float64)), 1.0) - # eCPS reconciliation 
contract: rtol=1e-6, atol=1.0 - # (check_household_net_worth_reconciliation). float32 accumulation - # leaves a sub-dollar residual on million-dollar balance sheets. - reconciled = (abs_diff <= 1.5) | (rel_diff <= 1e-4) - assert reconciled.all(), ( - f"{(~reconciled).sum()} of {len(target)} rows failed to reconcile; " - f"max abs diff {abs_diff.max():.4g}" - ) - - def test_rebalance_preserves_protected_blended_leaves(self) -> None: - """SIPP/SCF-blended leaves are not rescaled by the rebalance.""" - components, target = _synthetic_components(2000, seed=3) - before = { - k: components[k].copy() - for k in ("bank_account_assets", "stock_assets", "bond_assets") - } - adjusted = rebalance_scf_net_worth_components( - components={k: v.copy() for k, v in components.items()}, - target_net_worth=target.copy(), - ) - for variable, original in before.items(): - np.testing.assert_array_equal(adjusted[variable], original) - - def test_adjusted_leaves_stay_nonnegative(self) -> None: - """Proportional scaling never drives a stored leaf magnitude negative.""" - components, target = _synthetic_components(3000, seed=5) - adjusted = rebalance_scf_net_worth_components( - components={k: v.copy() for k, v in components.items()}, - target_net_worth=target.copy(), - ) - for leaf in SCF_NET_WORTH_COMPONENT_VARIABLES: - assert (adjusted[leaf] >= -1e-3).all(), f"{leaf} went negative" - - -def _donor_training_frame(n: int = 1200, seed: int = 0) -> pd.DataFrame: - """SCF-like donor frame: 8 predictors + zero-inflated positive leaves. - - Leaf support is driven by age/income so a gate classifier can learn it. 
- """ - rng = np.random.default_rng(seed) - age = rng.integers(18, 90, size=n).astype(float) - is_female = rng.integers(0, 2, size=n).astype(float) - cps_race = rng.integers(1, 8, size=n).astype(float) - is_married = rng.integers(0, 2, size=n).astype(float) - own_children = rng.integers(0, 4, size=n).astype(float) - employment_income = np.maximum(rng.normal(45_000, 35_000, size=n), 0.0).astype( - float - ) - interest_dividend_income = np.maximum(rng.normal(2_000, 6_000, size=n), 0.0).astype( - float - ) - social_security_pension_income = np.where( - age >= 65, - np.maximum(rng.normal(18_000, 9_000, size=n), 0.0), - 0.0, - ).astype(float) - - frame = pd.DataFrame( - { - "age": age, - "is_female": is_female, - "cps_race": cps_race, - "is_married": is_married, - "own_children_in_household": own_children, - "employment_income": employment_income, - "interest_dividend_income": interest_dividend_income, - "social_security_pension_income": social_security_pension_income, - } - ) - - # Retirement assets: held mostly by older / higher-income; ~55% have it. - holds_ret = rng.random(n) < (0.2 + 0.5 * (age >= 50) + 1e-6 * employment_income) - holds_ret = holds_ret & (rng.random(n) < 0.9) - retirement = np.where( - holds_ret, - np.maximum(rng.lognormal(11.5, 1.0, size=n), 0.0), - 0.0, - ).astype(float) - frame["scf_retirement_assets"] = retirement - - # Credit card debt: ~38% carry a balance, broadly across ages. 
- holds_cc = rng.random(n) < 0.38 - ccbal = np.where( - holds_cc, - np.maximum(rng.lognormal(8.0, 0.8, size=n), 0.0), - 0.0, - ).astype(float) - frame["scf_credit_card_debt"] = ccbal - - return frame - - -_DONOR_CONDITION_VARS = [ - "age", - "is_female", - "cps_race", - "is_married", - "own_children_in_household", - "employment_income", - "interest_dividend_income", - "social_security_pension_income", -] -_DONOR_TARGET_VARS = ("scf_retirement_assets", "scf_credit_card_debt") - - -def _assert_leaf_predictions_nondegenerate(out: pd.DataFrame) -> None: - for leaf in _DONOR_TARGET_VARS: - values = out[leaf].to_numpy(dtype=float) - assert np.isfinite(values).all(), f"{leaf} has non-finite predictions" - # Non-negative (positive-magnitude leaf, clamped at zero). - assert (values >= 0).all(), f"{leaf} produced negative magnitude" - # Non-degenerate: a meaningful share is nonzero (the leaf has mass), - # and not everyone is nonzero (zero-inflation preserved). - nonzero_share = float((values > 0).mean()) - assert 0.05 < nonzero_share < 0.98, ( - f"{leaf} nonzero share {nonzero_share:.3f} looks degenerate" - ) - nonzero = values[values > 0] - assert nonzero.size == 0 or nonzero.std() > 0, f"{leaf} has no spread" - - -class TestDonorFitOnScfLeaves: - """A small zero-inflated donor fit yields non-degenerate, non-negative leaves. - - The component leaves are strongly zero-inflated, so production wiring uses - ``RegimeAwareDonorImputer`` (``donor_imputer_backend='regime_aware'``). - These leaves are positive-magnitude single-sign, so the simpler - ``ColumnwiseQRFDonorImputer`` zero-inflated gate reproduces the same - behavior and is what runs here when ``microimpute`` is not installed. 
- """ - - def test_columnwise_qrf_zero_inflated_fit(self) -> None: - """Runnable proof: zero-inflated QRF gate on two SCF leaves.""" - pytest.importorskip("quantile_forest") - from microplex_us.pipelines.donor_imputers import ( - ColumnwiseQRFDonorImputer, - ) - - train = _donor_training_frame(n=1200, seed=0) - imputer = ColumnwiseQRFDonorImputer( - condition_vars=_DONOR_CONDITION_VARS, - target_vars=list(_DONOR_TARGET_VARS), - n_estimators=60, - zero_inflated_vars=set(_DONOR_TARGET_VARS), - nonnegative_vars=set(_DONOR_TARGET_VARS), - ) - imputer.fit(train) - receivers = _donor_training_frame(n=600, seed=7)[_DONOR_CONDITION_VARS] - out = imputer.generate(receivers, seed=42) - _assert_leaf_predictions_nondegenerate(out) - - def test_regime_aware_fit(self) -> None: - """Production backend: regime-aware imputer (skips without microimpute).""" - pytest.importorskip("quantile_forest") - pytest.importorskip("microimpute") - from microplex_us.pipelines.donor_imputers import RegimeAwareDonorImputer - - train = _donor_training_frame(n=1200, seed=0) - imputer = RegimeAwareDonorImputer( - condition_vars=_DONOR_CONDITION_VARS, - target_vars=list(_DONOR_TARGET_VARS), - n_estimators=60, - seed=42, - ) - imputer.fit(train) - receivers = _donor_training_frame(n=600, seed=7)[_DONOR_CONDITION_VARS] - out = imputer.generate(receivers, seed=42) - _assert_leaf_predictions_nondegenerate(out) - - -def _resolve_pe_us_data_repo() -> Path | None: - """Resolve a usable policyengine-us-data checkout, or None to skip.""" - try: - from microplex_us.pipelines.pe_native_scores import ( - resolve_policyengine_us_data_python, - resolve_policyengine_us_data_repo_root, - ) - - repo = resolve_policyengine_us_data_repo_root() - resolve_policyengine_us_data_python(repo_root=repo) - except Exception: - return None - return repo - - -class TestScfDonorFrameCarriesLeaves: - """Integration: the real SCF dataset loader carries the 19 leaves. 
- - Env-gated: runs the actual policyengine-us-data SCF_2022 loader through the - manifest-backed dataset-loader subprocess (the same path the pipeline uses) - and confirms the donor frame carries all 19 component columns with mass. - Skips when policyengine-us-data (and its SCF cache) is not available. - """ - - def test_real_scf_loader_yields_nineteen_leaves(self) -> None: - if _resolve_pe_us_data_repo() is None: - pytest.skip("policyengine-us-data checkout/python not available") - - from microplex.core import EntityType, SourceQuery - - from microplex_us.data_sources.donor_surveys import SCFSourceProvider - - provider = SCFSourceProvider() - try: - frame = provider.load_frame( - SourceQuery(provider_filters={"sample_n": 600, "random_seed": 0}) - ) - except Exception as error: # SCF download / loader unavailable - pytest.skip(f"SCF loader could not run: {error}") - - persons = frame.tables[EntityType.PERSON] - present = [leaf for leaf in SCF_COMPONENT_LEAVES if leaf in persons.columns] - assert len(present) == len(SCF_COMPONENT_LEAVES), ( - f"donor frame missing leaves: {set(SCF_COMPONENT_LEAVES) - set(present)}" - ) - - any_has_mass = False - for leaf in SCF_COMPONENT_LEAVES: - values = pd.to_numeric(persons[leaf], errors="coerce").fillna(0.0) - # Stored as positive magnitudes (no sign flip at load). 
- assert (values >= 0).all(), f"{leaf} has negative stored magnitude" - if float((values != 0).mean()) > 0: - any_has_mass = True - assert any_has_mass, "no SCF component leaf carried any mass" diff --git a/tests/pipelines/__init__.py b/tests/pipelines/__init__.py deleted file mode 100644 index c9780bbf..00000000 --- a/tests/pipelines/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Pipeline tests for microplex-us.""" diff --git a/tests/pipelines/fixtures/ecps_clean_columns.json b/tests/pipelines/fixtures/ecps_clean_columns.json deleted file mode 100644 index db1605b4..00000000 --- a/tests/pipelines/fixtures/ecps_clean_columns.json +++ /dev/null @@ -1,260 +0,0 @@ -[ - "age", - "alimony_expense", - "alimony_income", - "american_opportunity_credit_claimed_prior_years", - "attends_eligible_educational_institution_for_american_opportunity_credit", - "auto_loan_balance", - "auto_loan_interest", - "bank_account_assets", - "block_geoid", - "bond_assets", - "business_is_sstb", - "casualty_loss", - "charitable_cash_donations", - "charitable_non_cash_donations", - "child_support_expense", - "child_support_received", - "congressional_district_geoid", - "count_under_18", - "count_under_6", - "county_fips", - "cps_race", - "detailed_occupation_recode", - "difficulty_doing_errands", - "difficulty_dressing_or_bathing", - "difficulty_hearing", - "difficulty_remembering_or_making_decisions", - "difficulty_seeing", - "difficulty_walking_or_climbing_stairs", - "disability_benefits", - "domestic_production_ald", - "educational_assistance", - "educator_expense", - "employer_sponsored_insurance_premiums", - "employment_income_before_lsr", - "estate_income", - "estate_income_would_be_qualified", - "family_id", - "family_is_puf_clone", - "farm_income", - "farm_operations_income", - "farm_operations_income_would_be_qualified", - "farm_rent_income", - "farm_rent_income_would_be_qualified", - "financial_assistance", - "first_home_mortgage_balance", - "first_home_mortgage_interest", - 
"first_home_mortgage_origination_year", - "fsla_overtime_premium", - "has_american_opportunity_credit_1098_t_or_exception", - "has_american_opportunity_credit_institution_ein", - "has_champva_health_coverage_at_interview", - "has_completed_first_four_years_of_postsecondary_education", - "has_esi", - "has_felony_drug_conviction", - "has_indian_health_service_coverage_at_interview", - "has_itin", - "has_marketplace_health_coverage", - "has_marketplace_health_coverage_at_interview", - "has_medicaid_health_coverage_at_interview", - "has_never_worked", - "has_non_marketplace_direct_purchase_health_coverage_at_interview", - "has_other_means_tested_health_coverage_at_interview", - "has_tin", - "has_tricare_health_coverage_at_interview", - "has_va_health_coverage_at_interview", - "has_valid_ssn", - "health_insurance_premiums_without_medicare_part_b", - "health_savings_account_ald", - "home_mortgage_interest", - "hourly_wage", - "hours_worked_last_week", - "household_id", - "household_is_puf_clone", - "household_vehicles_owned", - "household_vehicles_value", - "household_weight", - "immigration_status_str", - "in_nyc", - "investment_income_elected_form_4952", - "investment_interest_expense", - "is_blind", - "is_computer_scientist", - "is_disabled", - "is_enrolled_at_least_half_time_for_american_opportunity_credit", - "is_executive_administrative_professional", - "is_farmer_fisher", - "is_female", - "is_full_time_college_student", - "is_hispanic", - "is_household_head", - "is_military", - "is_paid_hourly", - "is_pregnant", - "is_pursuing_credential_for_american_opportunity_credit", - "is_separated", - "is_surviving_spouse", - "is_tipped_occupation", - "is_union_member_or_covered", - "is_unmarried_partner_of_household_head", - "is_wic_at_nutritional_risk", - "keogh_distributions", - "long_term_capital_gains_before_response", - "long_term_capital_gains_on_collectibles", - "marital_unit_id", - "meets_ssi_disability_criteria", - "miscellaneous_income", - "net_worth", - 
"non_qualified_dividend_income", - "non_sch_d_capital_gains", - "other_health_insurance_premiums", - "other_medical_expenses", - "other_type_retirement_account_distributions", - "over_the_counter_health_expenses", - "own_children_in_household", - "partnership_s_corp_income", - "partnership_s_corp_income_would_be_qualified", - "partnership_se_income", - "person_family_id", - "person_household_id", - "person_id", - "person_is_puf_clone", - "person_marital_unit_id", - "person_spm_unit_id", - "person_tax_unit_id", - "pre_subsidy_rent", - "previous_year_income_available", - "qualified_bdc_income", - "qualified_dividend_income", - "qualified_reit_and_ptp_income", - "qualified_tuition_expenses", - "real_estate_taxes", - "receives_housing_assistance", - "receives_wic", - "regular_ira_distributions", - "rental_income", - "rental_income_would_be_qualified", - "reported_has_champva_health_coverage_at_interview", - "reported_has_chip_health_coverage_at_interview", - "reported_has_direct_purchase_health_coverage_at_interview", - "reported_has_employer_sponsored_health_coverage_at_interview", - "reported_has_indian_health_service_coverage_at_interview", - "reported_has_marketplace_health_coverage_at_interview", - "reported_has_means_tested_health_coverage_at_interview", - "reported_has_medicaid_health_coverage_at_interview", - "reported_has_medicare_health_coverage_at_interview", - "reported_has_multiple_health_coverage_at_interview", - "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview", - "reported_has_other_means_tested_health_coverage_at_interview", - "reported_has_private_health_coverage_at_interview", - "reported_has_public_health_coverage_at_interview", - "reported_has_subsidized_marketplace_health_coverage_at_interview", - "reported_has_tricare_health_coverage_at_interview", - "reported_has_unsubsidized_marketplace_health_coverage_at_interview", - "reported_has_va_health_coverage_at_interview", - "reported_is_insured_at_interview", - 
"reported_is_uninsured_at_interview", - "reported_owns_employer_sponsored_health_insurance_at_interview", - "roth_401k_contributions", - "roth_401k_contributions_desired", - "roth_ira_contributions", - "roth_ira_contributions_desired", - "roth_ira_distributions", - "salt_refund_income", - "scf_business_equity", - "scf_cash_value_life_insurance", - "scf_certificates_of_deposit", - "scf_credit_card_debt", - "scf_mortgage_debt", - "scf_nonresidential_real_estate_equity", - "scf_other_debt", - "scf_other_financial_assets", - "scf_other_installment_debt", - "scf_other_lines_of_credit", - "scf_other_managed_assets", - "scf_other_nonfinancial_assets", - "scf_other_residential_debt", - "scf_other_residential_real_estate", - "scf_primary_residence_value", - "scf_retirement_assets", - "scf_savings_bonds", - "scf_student_loan_debt", - "scf_vehicle_installment_debt", - "second_home_mortgage_balance", - "second_home_mortgage_interest", - "second_home_mortgage_origination_year", - "selected_marketplace_plan_benchmark_ratio", - "self_employed_pension_contributions", - "self_employed_pension_contributions_desired", - "self_employment_income_before_lsr", - "self_employment_income_last_year", - "self_employment_income_would_be_qualified", - "short_term_capital_gains", - "social_security_dependents", - "social_security_disability", - "social_security_retirement", - "social_security_survivors", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_energy_subsidy", - "spm_unit_id", - "spm_unit_is_puf_clone", - "spm_unit_pre_subsidy_childcare_expenses", - "spm_unit_tenure_type", - "ssn_card_type", - "sstb_self_employment_income_before_lsr", - "sstb_self_employment_income_would_be_qualified", - "sstb_unadjusted_basis_qualified_property", - "sstb_w2_wages_from_qualified_business", - "state_fips", - "stock_assets", - "strike_benefits", - "student_loan_interest", - "survivor_benefits", - "takes_up_aca_if_eligible", - "takes_up_dc_ptc", - "takes_up_early_head_start_if_eligible", - 
"takes_up_eitc", - "takes_up_head_start_if_eligible", - "takes_up_housing_assistance_if_eligible", - "takes_up_medicaid_if_eligible", - "takes_up_medicare_if_eligible", - "takes_up_snap_if_eligible", - "takes_up_ssi_if_eligible", - "takes_up_tanf_if_eligible", - "tax_exempt_401k_distributions", - "tax_exempt_403b_distributions", - "tax_exempt_interest_income", - "tax_exempt_ira_distributions", - "tax_exempt_private_pension_income", - "tax_exempt_sep_distributions", - "tax_unit_id", - "tax_unit_is_puf_clone", - "taxable_401k_distributions", - "taxable_403b_distributions", - "taxable_interest_income", - "taxable_ira_distributions", - "taxable_private_pension_income", - "taxable_sep_distributions", - "taxpayer_id_type", - "tenure_type", - "tip_income", - "tract_geoid", - "traditional_401k_contributions", - "traditional_401k_contributions_desired", - "traditional_ira_contributions", - "traditional_ira_contributions_desired", - "treasury_tipped_occupation_code", - "unadjusted_basis_qualified_property", - "unemployment_compensation", - "unrecaptured_section_1250_gain", - "unreimbursed_business_employee_expenses", - "veterans_benefits", - "w2_wages_from_qualified_business", - "weekly_hours_worked_before_lsr", - "weeks_unemployed", - "weeks_worked", - "workers_compensation", - "would_claim_wic", - "would_file_taxes_voluntarily" -] diff --git a/tests/pipelines/test_artifacts.py b/tests/pipelines/test_artifacts.py deleted file mode 100644 index ac79675e..00000000 --- a/tests/pipelines/test_artifacts.py +++ /dev/null @@ -1,1501 +0,0 @@ -"""Tests for pipeline artifact persistence.""" - -import json -import sqlite3 -from pathlib import Path -from types import SimpleNamespace - -import h5py -import pandas as pd -from microplex.core import EntityType -from microplex.targets import StaticTargetProvider, TargetQuery, TargetSet, TargetSpec - -from microplex_us.pipelines.artifacts import ( - build_and_save_versioned_us_microplex_from_source_providers, - 
replay_us_microplex_policyengine_stage_from_artifact, - save_us_microplex_artifacts, -) -from microplex_us.pipelines.registry import load_us_microplex_run_registry -from microplex_us.pipelines.stage_policyengine_artifacts import ( - load_us_policyengine_entity_stage_artifact, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexTargets, -) -from microplex_us.policyengine import ( - PolicyEngineUSEntityTableBundle, - PolicyEngineUSHarnessSlice, - build_policyengine_us_time_period_arrays, - compute_policyengine_us_definition_hash, - write_policyengine_us_time_period_dataset, -) - - -def test_source_provider_versioned_build_initializes_live_stage_writer( - tmp_path, - monkeypatch, -) -> None: - captured: dict[str, object] = {} - - class FakePipeline: - def __init__(self, config, *, stage_runtime_writer=None): - captured["config"] = config - captured["pipeline_stage_runtime_writer"] = stage_runtime_writer - - def build_from_source_providers(self, providers, queries=None): - captured["providers"] = providers - captured["queries"] = queries - return "build-result" - - def fake_finalize(build_result, **kwargs): - captured["build_result"] = build_result - captured["finalize_stage_runtime_writer"] = kwargs.get("stage_runtime_writer") - captured.update(kwargs) - return "finalized" - - monkeypatch.setattr( - "microplex_us.pipelines.artifacts.USMicroplexPipeline", - FakePipeline, - ) - monkeypatch.setattr( - "microplex_us.pipelines.artifacts._finalize_versioned_build_artifacts", - fake_finalize, - ) - provider = SimpleNamespace(descriptor=SimpleNamespace(name="unit_source")) - - result = build_and_save_versioned_us_microplex_from_source_providers( - [provider], - tmp_path, - config=USMicroplexBuildConfig(calibration_backend="none"), - version_id="runtime-test", - ) - - output_dir = tmp_path / "runtime-test" - stage1_manifest = json.loads( - ( - output_dir / "stage_artifacts" / "manifests" / "01_run_profile.json" - 
).read_text() - ) - - assert result == "finalized" - assert captured["build_result"] == "build-result" - assert captured["preallocated_output_dir"] == output_dir - assert ( - captured["pipeline_stage_runtime_writer"] - is captured["finalize_stage_runtime_writer"] - ) - assert stage1_manifest["lifecycleStatus"] == "complete" - assert stage1_manifest["outputs"]["provider_query_plan"]["provider_names"] == [ - "unit_source" - ] - - -def test_replay_policyengine_stage_from_artifact_uses_saved_synthetic( - tmp_path, - monkeypatch, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - config = USMicroplexBuildConfig( - policyengine_targets_db=str(tmp_path / "policy_data.db"), - calibration_backend="entropy", - ) - seed_data = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "weight": [10.0], - } - ) - scaffold_seed_data = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "weight": [5.0], - } - ) - synthetic_data = pd.DataFrame( - { - "person_id": [2], - "household_id": [20], - "weight": [20.0], - } - ) - stale_calibrated_data = pd.DataFrame( - { - "person_id": [3], - "household_id": [30], - "weight": [999.0], - } - ) - scaffold_seed_path = ( - artifact_dir - / "stage_artifacts" - / "04_seed_scaffold" - / "scaffold_seed_data.parquet" - ) - scaffold_seed_path.parent.mkdir(parents=True) - scaffold_seed_data.to_parquet(scaffold_seed_path, index=False) - seed_data.to_parquet(artifact_dir / "seed_data.parquet", index=False) - synthetic_data.to_parquet(artifact_dir / "synthetic_data.parquet", index=False) - stale_calibrated_data.to_parquet( - artifact_dir / "calibrated_data.parquet", - index=False, - ) - (artifact_dir / "targets.json").write_text( - json.dumps({"marginal": {}, "continuous": {}}) - ) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "config": config.to_dict(), - "artifacts": { - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - "seed_data": 
"seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - }, - "synthesis": {"source_names": ["test_source"]}, - } - ) - ) - - captured: dict[str, object] = {} - - class FakePipeline: - def __init__(self, config): - captured["config"] = config - - def build_policyengine_entity_tables(self, frame): - captured["table_input"] = frame.copy() - return "synthetic_tables" - - def calibrate_policyengine_tables(self, tables): - captured["tables"] = tables - calibrated = captured["table_input"].copy() - calibrated["weight"] = calibrated["weight"] * 2.0 - return ( - "policyengine_tables", - calibrated, - {"backend": "policyengine_db_none"}, - ) - - monkeypatch.setattr( - "microplex_us.pipelines.artifacts.USMicroplexPipeline", - FakePipeline, - ) - - result = replay_us_microplex_policyengine_stage_from_artifact( - artifact_dir, - config_overrides={"calibration_backend": "none"}, - ) - - assert captured["config"].calibration_backend == "none" - assert captured["tables"] == "synthetic_tables" - pd.testing.assert_frame_equal(captured["table_input"], synthetic_data) - pd.testing.assert_frame_equal(result.seed_data, seed_data) - pd.testing.assert_frame_equal(result.synthetic_data, synthetic_data) - assert result.calibrated_data["person_id"].tolist() == [2] - assert result.calibrated_data["weight"].tolist() == [40.0] - assert result.policyengine_tables == "policyengine_tables" - pd.testing.assert_frame_equal(result.scaffold_seed_data, scaffold_seed_data) - assert result.calibration_summary == {"backend": "policyengine_db_none"} - assert result.synthesis_metadata["policyengine_stage_replay"][ - "config_override_keys" - ] == ["calibration_backend"] - - -def test_replay_policyengine_stage_refreshes_baseline_total_weight_targets( - tmp_path, - monkeypatch, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - baseline_dataset = tmp_path / "baseline.h5" - with 
h5py.File(baseline_dataset, "w") as handle: - household_weight = handle.create_group("household_weight") - household_weight.create_dataset("2024", data=[2.5, 3.5]) - - stale_total_weight = 153_768_768.0 - config = USMicroplexBuildConfig( - policyengine_targets_db=str(tmp_path / "policy_data.db"), - policyengine_baseline_dataset="/tmp/stale-baseline.h5", - policyengine_dataset_year=2024, - policyengine_target_period=2024, - calibration_backend="entropy", - policyengine_selection_target_total_weight=stale_total_weight, - policyengine_calibration_target_total_weight=stale_total_weight, - policyengine_calibration_rescale_to_target_total_weight=True, - ) - seed_data = pd.DataFrame( - {"person_id": [1], "household_id": [10], "weight": [10.0]} - ) - synthetic_data = pd.DataFrame( - {"person_id": [2], "household_id": [20], "weight": [20.0]} - ) - seed_data.to_parquet(artifact_dir / "seed_data.parquet", index=False) - synthetic_data.to_parquet(artifact_dir / "synthetic_data.parquet", index=False) - synthetic_data.to_parquet( - artifact_dir / "calibrated_data.parquet", - index=False, - ) - (artifact_dir / "targets.json").write_text( - json.dumps({"marginal": {}, "continuous": {}}) - ) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "config": config.to_dict(), - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - }, - "synthesis": {"source_names": ["test_source"]}, - } - ) - ) - - captured: dict[str, object] = {} - - class FakePipeline: - def __init__(self, config): - captured["config"] = config - - def build_policyengine_entity_tables(self, frame): - return frame.copy() - - def calibrate_policyengine_tables(self, tables): - return tables, synthetic_data.copy(), {"backend": "policyengine_db_entropy"} - - monkeypatch.setattr( - "microplex_us.pipelines.artifacts.USMicroplexPipeline", - FakePipeline, - ) - - 
replay_us_microplex_policyengine_stage_from_artifact( - artifact_dir, - policyengine_baseline_dataset=baseline_dataset, - ) - - replay_config = captured["config"] - assert replay_config.policyengine_baseline_dataset == str(baseline_dataset) - assert replay_config.policyengine_selection_target_total_weight == 6.0 - assert replay_config.policyengine_calibration_target_total_weight == 6.0 - assert replay_config.policyengine_calibration_rescale_to_target_total_weight is True - - -def _write_baseline_dataset( - path: Path, - tables: PolicyEngineUSEntityTableBundle, -) -> Path: - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_fips": "state_fips", "snap": "snap"}, - person_variable_map={"age": "age", "income": "employment_income"}, - tax_unit_variable_map={"filing_status": "filing_status"}, - ) - write_policyengine_us_time_period_dataset(arrays, path) - return path - - -def _create_policyengine_targets_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'state' AS geo_level, - '06' AS geographic_id, - 'household_count' AS domain_variable - FROM targets AS t; - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, NULL) - """, - (1, 
compute_policyengine_us_definition_hash(())), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - (1, "household_count", 2024, 1, 0, 3.0, 1, None, "test", "count"), - ], - ) - conn.commit() - conn.close() - - -class TestSaveUSMicroplexArtifacts: - """Test saving pipeline artifacts.""" - - def test_writes_expected_files(self, tmp_path): - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - scaffold_seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets( - marginal={"state": {"CA": 2.0}}, - continuous={"income": 30.0}, - ), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - pre_calibration_policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - {"household_id": [1, 2], "household_weight": [1.0, 1.0]} - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["SINGLE", "JOINT"], - } - ), - ), - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - {"household_id": [1, 2], "household_weight": [0.5, 1.5]} - ), - persons=pd.DataFrame( - { - "person_id": [10, 11], - "household_id": [1, 2], - "tax_unit_id": [101, 102], - "spm_unit_id": [201, 202], - "family_id": [301, 302], - "marital_unit_id": [401, 402], - "age": [35, 62], - "taxable_interest_income": [125.0, 250.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 
2], - "filing_status": ["SINGLE", "JOINT"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - paths = save_us_microplex_artifacts(result, tmp_path) - - assert paths.output_dir == tmp_path - assert paths.scaffold_seed_data is not None - assert paths.scaffold_seed_data.exists() - assert paths.seed_data.exists() - assert paths.synthetic_data.exists() - assert paths.calibrated_data.exists() - assert paths.targets.exists() - assert paths.manifest.exists() - assert paths.synthesizer is None - assert paths.policyengine_dataset is not None - assert paths.policyengine_dataset.exists() - assert paths.stage_manifest is not None - assert paths.stage_manifest.exists() - assert paths.artifact_inventory is not None - assert paths.artifact_inventory.exists() - assert paths.conditional_readiness is not None - assert paths.conditional_readiness.exists() - assert paths.source_plan is not None - assert paths.source_plan.exists() - assert paths.pre_calibration_policyengine_entity_tables is not None - assert paths.pre_calibration_policyengine_entity_tables.exists() - assert paths.policyengine_entity_tables is not None - assert paths.policyengine_entity_tables.exists() - assert paths.calibration_summary is not None - assert paths.calibration_summary.exists() - assert paths.validation_evidence is not None - assert paths.validation_evidence.exists() - assert paths.source_weight_diagnostics is not None - assert paths.source_weight_diagnostics.exists() - - manifest = json.loads(paths.manifest.read_text()) - assert manifest["rows"]["synthetic"] == 2 - assert manifest["weights"]["nonzero"] == 2 - assert manifest["config"]["synthesis_backend"] == "bootstrap" - assert ( - manifest["artifacts"]["scaffold_seed_data"] - == 
"stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ) - assert manifest["artifacts"]["policyengine_dataset"] == "policyengine_us.h5" - assert manifest["artifacts"]["stage_manifest"] == "stage_manifest.json" - assert ( - manifest["artifacts"]["artifact_inventory"] - == "stage_artifacts/artifact_inventory.json" - ) - assert ( - manifest["artifacts"]["conditional_readiness"] - == "stage_artifacts/conditional_readiness.json" - ) - assert ( - manifest["artifacts"]["policyengine_entity_tables"] - == "stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json" - ) - assert ( - manifest["artifacts"]["pre_calibration_policyengine_entity_tables"] - == "stage_artifacts/06_policyengine_entities/metadata.json" - ) - assert ( - manifest["artifacts"]["source_weight_diagnostics"] - == "source_weight_diagnostics.json" - ) - source_diagnostics = json.loads(paths.source_weight_diagnostics.read_text()) - artifact_inventory = json.loads(paths.artifact_inventory.read_text()) - conditional_readiness = json.loads(paths.conditional_readiness.read_text()) - inventory_records = { - (record["stageId"], record["key"]): record - for record in artifact_inventory["artifacts"] - } - assert inventory_records[("01_run_profile", "manifest")]["exists"] is True - assert ( - inventory_records[("08_dataset_assembly", "policyengine_dataset")][ - "classification" - ] - == "post_artifact_evidence" - ) - readiness = { - stage["stageId"]: stage for stage in conditional_readiness["stages"] - } - assert readiness["09_validation_benchmarking"]["readiness"] == ( - "post_artifact_evidence" - ) - assert ( - source_diagnostics["summary"]["diagnostic_scope"] - == "saved_artifact_entity_weight_by_source_rows" - ) - assert source_diagnostics["summary"]["support_household_weight_share"] == 0.0 - assert ( - source_diagnostics["summary"]["puf_support_household_weight_share"] == 0.0 - ) - assert source_diagnostics["summary"]["total_household_weight"] == 2.0 - assert 
source_diagnostics["summary"]["total_person_weight"] == 2.0 - assert source_diagnostics["summary"]["total_tax_unit_weight"] == 2.0 - assert source_diagnostics["sources"][0]["source_class"] == ( - "synthetic_population" - ) - assert source_diagnostics["sources"][0]["person_count"] == 2 - assert source_diagnostics["sources"][0]["person_weight_share"] == 1.0 - assert source_diagnostics["sources"][0]["tax_unit_count"] == 2 - assert source_diagnostics["sources"][0]["tax_unit_weight_share"] == 1.0 - pre_calibration_tables, _ = load_us_policyengine_entity_stage_artifact( - paths.pre_calibration_policyengine_entity_tables - ) - assert pre_calibration_tables.tax_units is not None - assert "filing_status" in pre_calibration_tables.tax_units - - with h5py.File(paths.policyengine_dataset, "r") as handle: - assert "household_id" in handle - assert "person_household_id" in handle - assert "tax_unit_id" in handle - assert "taxable_interest_income" in handle - assert "filing_status" not in handle - assert "source_weight_diagnostics" not in handle - - def test_leaves_pre_calibration_entity_artifact_blank_when_tables_are_absent( - self, - tmp_path, - ): - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=1, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - scaffold_seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - calibrated_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={"max_error": 0.0}, - synthesis_metadata={ - "backend": "bootstrap", - "source_names": ["source"], - "scaffold_source": "source", - }, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - {"household_id": [1], "household_weight": [1.0]} - ), - persons=pd.DataFrame( - { - "person_id": [10], - 
"household_id": [1], - "tax_unit_id": [101], - "spm_unit_id": [201], - "family_id": [301], - "marital_unit_id": [401], - "age": [35], - } - ), - tax_units=pd.DataFrame({"tax_unit_id": [101], "household_id": [1]}), - spm_units=pd.DataFrame({"spm_unit_id": [201], "household_id": [1]}), - families=pd.DataFrame({"family_id": [301], "household_id": [1]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401], "household_id": [1]} - ), - ), - ) - - paths = save_us_microplex_artifacts(result, tmp_path) - - manifest = json.loads(paths.manifest.read_text()) - assert paths.pre_calibration_policyengine_entity_tables is None - assert ( - manifest["artifacts"]["pre_calibration_policyengine_entity_tables"] is None - ) - assert not (tmp_path / "stage_artifacts" / "06_policyengine_entities").exists() - stage7_manifest = json.loads( - ( - tmp_path / "stage_artifacts" / "manifests" / "07_calibration.json" - ).read_text() - ) - assert stage7_manifest["complete"] is False - assert ( - stage7_manifest["outputs"]["policyengine_entity_tables"]["exists"] is True - ) - - def test_writes_model_when_present(self, tmp_path): - class FakeSynthesizer: - def __init__(self): - self.saved_path = None - - def save(self, path): - self.saved_path = path - path.write_text("model") - - fake = FakeSynthesizer() - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=1, - synthesis_backend="synthesizer", - calibration_backend="entropy", - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - calibrated_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - targets=USMicroplexTargets(marginal={}, continuous={"income": 10.0}), - calibration_summary={"max_error": 0.0, "mean_error": 0.0}, - synthesis_metadata={"backend": "synthesizer"}, - synthesizer=fake, - policyengine_tables=None, - ) - - paths = save_us_microplex_artifacts(result, tmp_path) - - assert paths.synthesizer is not None - 
assert paths.synthesizer.exists() - assert fake.saved_path == paths.synthesizer - - def test_writes_data_flow_snapshot_before_manifest_validation(self, tmp_path): - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=1, - synthesis_backend="seed", - calibration_backend="entropy", - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - calibrated_data=pd.DataFrame({"income": [10.0], "weight": [1.0]}), - targets=USMicroplexTargets(marginal={}, continuous={"income": 10.0}), - calibration_summary={"max_error": 0.0, "mean_error": 0.0}, - synthesis_metadata={ - "backend": "seed", - "source_names": ["cps_asec_parquet"], - "scaffold_source": "cps_asec_parquet", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": { - "available": [], - "missing": [], - }, - }, - synthesizer=None, - policyengine_tables=None, - ) - - paths = save_us_microplex_artifacts(result, tmp_path) - - assert paths.data_flow_snapshot is not None - assert paths.data_flow_snapshot.exists() - assert paths.stage_manifest is not None - assert paths.stage_manifest.exists() - manifest = json.loads(paths.manifest.read_text()) - assert manifest["artifacts"]["data_flow_snapshot"] == "data_flow_snapshot.json" - assert manifest["artifacts"]["stage_manifest"] == "stage_manifest.json" - snapshot = json.loads(paths.data_flow_snapshot.read_text()) - assert snapshot["runtime"]["scaffoldSource"] == "cps_asec_parquet" - assert len(snapshot["stages"]) == 9 - - def test_writes_child_tax_unit_agi_drift_summary(self, tmp_path): - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="seed", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ), - seed_data=pd.DataFrame( - { - "person_id": [1, 2], - "tax_unit_id": [10, 10], - "age": [40.0, 10.0], - "is_tax_unit_dependent": [0, 1], - 
"employment_income": [30_000.0, 0.0], - "wage_income": [28_000.0, 0.0], - "taxable_interest_income": [100.0, 0.0], - } - ), - synthetic_data=pd.DataFrame( - { - "person_id": [1, 2], - "tax_unit_id": [10, 10], - "age": [40.0, 10.0], - "is_tax_unit_dependent": [0, 1], - "employment_income": [30_000.0, 0.0], - "wage_income": [28_000.0, 0.0], - "taxable_interest_income": [100.0, 0.0], - "weight": [1.0, 1.0], - } - ), - calibrated_data=pd.DataFrame( - { - "person_id": [1, 2], - "tax_unit_id": [10, 10], - "age": [40.0, 10.0], - "is_tax_unit_dependent": [0, 1], - "employment_income": [30_000.0, 0.0], - "wage_income": [28_000.0, 0.0], - "taxable_interest_income": [100.0, 0.0], - "weight": [1.0, 1.0], - } - ), - targets=USMicroplexTargets(marginal={}, continuous={"income": 10.0}), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "seed"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1], - "household_weight": [2.0], - "state_fips": [6], - "snap": [100.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [1, 1], - "tax_unit_id": [10, 10], - "spm_unit_id": [20, 20], - "family_id": [30, 30], - "marital_unit_id": [40, 40], - "age": [40.0, 10.0], - "income": [30_000.0, 0.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [10], - "household_id": [1], - "filing_status": ["JOINT"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [20], "household_id": [1]}), - families=pd.DataFrame({"family_id": [30], "household_id": [1]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [40], "household_id": [1]} - ), - ), - ) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - result.policyengine_tables, - ) - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=100.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - 
) - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle", - policyengine_target_provider=provider, - policyengine_baseline_dataset=baseline_dataset, - policyengine_harness_slices=( - PolicyEngineUSHarnessSlice( - name="snap", - description="SNAP parity", - query=TargetQuery(period=2024, names=("snap_total",)), - ), - ), - policyengine_harness_metadata={"baseline_dataset": baseline_dataset.name}, - enable_child_tax_unit_agi_drift=True, - ) - - assert paths.child_tax_unit_agi_drift is not None - assert paths.child_tax_unit_agi_drift.exists() - manifest = json.loads(paths.manifest.read_text()) - assert ( - manifest["artifacts"]["child_tax_unit_agi_drift"] - == "child_tax_unit_agi_drift.json" - ) - assert "child_tax_unit_agi_drift" in manifest.get("diagnostics", {}) - registry_entries = load_us_microplex_run_registry( - paths.run_registry or tmp_path / "run_registry.jsonl" - ) - assert registry_entries[-1].metadata.get("child_tax_unit_agi_drift") is not None - - def test_writes_policyengine_harness_when_baseline_and_targets_are_provided( - self, tmp_path - ): - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets( - marginal={"state": {"CA": 2.0}}, - continuous={"income": 30.0}, - ), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": [100.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 
20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="household_count", - entity=EntityType.HOUSEHOLD, - value=3.0, - period=2024, - aggregation="count", - ), - ] - ) - ) - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle", - policyengine_target_provider=provider, - policyengine_baseline_dataset=baseline_dataset, - 
policyengine_harness_slices=( - PolicyEngineUSHarnessSlice( - name="household_count", - description="Household count parity", - query=TargetQuery(period=2024, names=("household_count",)), - ), - ), - policyengine_harness_metadata={"baseline_dataset": baseline_dataset.name}, - ) - - assert paths.policyengine_harness is not None - assert paths.policyengine_harness.exists() - - manifest = json.loads(paths.manifest.read_text()) - assert ( - manifest["artifacts"]["policyengine_harness"] == "policyengine_harness.json" - ) - assert manifest["policyengine_harness"]["slice_win_rate"] == 1.0 - assert manifest["policyengine_harness"]["target_win_rate"] == 1.0 - assert ( - manifest["policyengine_harness"]["candidate_composite_parity_loss"] - is not None - ) - - harness_payload = json.loads(paths.policyengine_harness.read_text()) - assert harness_payload["metadata"]["baseline_dataset"] == "baseline.h5" - assert ( - harness_payload["metadata"]["policyengine_us_runtime_version"] is not None - ) - assert harness_payload["summary"]["slice_win_rate"] == 1.0 - assert harness_payload["summary"]["candidate_composite_parity_loss"] is not None - - def test_can_defer_policyengine_harness_generation(self, monkeypatch, tmp_path): - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - 
), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets( - marginal={"state": {"CA": 2.0}}, - continuous={"income": 30.0}, - ), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": [100.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - ) - - monkeypatch.setattr( - 
"microplex_us.pipelines.artifacts.evaluate_policyengine_us_harness", - lambda *_args, **_kwargs: (_ for _ in ()).throw( - AssertionError("harness evaluation should be deferred") - ), - ) - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle", - policyengine_target_provider=provider, - policyengine_baseline_dataset=baseline_dataset, - policyengine_harness_slices=( - PolicyEngineUSHarnessSlice( - name="snap", - description="SNAP parity", - query=TargetQuery(period=2024, names=("snap_total",)), - ), - ), - defer_policyengine_harness=True, - defer_policyengine_native_score=True, - ) - - manifest = json.loads(paths.manifest.read_text()) - assert paths.policyengine_harness is None - assert manifest["artifacts"]["policyengine_harness"] is None - assert "policyengine_harness" not in manifest - - def test_writes_policyengine_harness_from_build_config_defaults(self, tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - result = USMicroplexBuildResult( - 
config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("household_count",), - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets( - marginal={"state": {"CA": 2.0}}, - continuous={"income": 30.0}, - ), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": [100.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle", - defer_policyengine_native_score=True, - ) - - assert paths.policyengine_harness is not None - assert paths.policyengine_harness.exists() - assert paths.run_registry is not None - assert paths.run_registry.exists() - - manifest = 
json.loads(paths.manifest.read_text()) - assert manifest["policyengine_harness"]["slice_win_rate"] == 1.0 - assert manifest["policyengine_harness"]["target_win_rate"] == 1.0 - assert ( - manifest["policyengine_harness"]["candidate_composite_parity_loss"] - is not None - ) - assert ( - manifest["policyengine_harness"]["parity_scorecard"]["overall"][ - "candidate_beats_baseline" - ] - is True - ) - assert manifest["run_registry"]["artifact_id"] == "bundle" - assert manifest["run_registry"]["improved_candidate_frontier"] is True - assert manifest["run_registry"]["improved_composite_frontier"] is True - assert ( - manifest["run_registry"]["default_frontier_metric"] - == "candidate_composite_parity_loss" - ) - - harness_payload = json.loads(paths.policyengine_harness.read_text()) - assert harness_payload["metadata"]["baseline_dataset"] == "baseline.h5" - assert harness_payload["metadata"]["targets_db"] == "policyengine_targets.db" - assert ( - harness_payload["metadata"]["harness_suite"] - == "policyengine_us_all_targets" - ) - assert harness_payload["metadata"]["harness_slice_names"] == ["all_targets"] - assert harness_payload["metadata"]["target_variables"] == ["household_count"] - assert ( - harness_payload["metadata"]["policyengine_us_runtime_version"] is not None - ) - assert [ - slice_payload["name"] for slice_payload in harness_payload["slices"] - ] == [ - "all_targets", - ] - registry_entries = load_us_microplex_run_registry(paths.run_registry) - assert len(registry_entries) == 1 - assert registry_entries[0].artifact_id == "bundle" - assert registry_entries[0].policyengine_us_runtime_version is not None - assert registry_entries[0].supported_target_rate == 1.0 - assert registry_entries[0].candidate_composite_parity_loss is not None - assert ( - registry_entries[0].tag_summaries["all_targets"]["target_win_rate"] == 1.0 - ) - - def test_writes_policyengine_native_scores_when_available( - self, monkeypatch, tmp_path - ) -> None: - monkeypatch.setattr( - 
"microplex_us.pipelines.artifacts.compute_us_pe_native_scores", - lambda **_kwargs: { - "metric": "enhanced_cps_native_loss", - "summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": -0.25, - "candidate_beats_baseline": True, - "candidate_unweighted_msre": 0.3, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": -0.3, - "n_targets_total": 2863, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 2000, - "n_state_targets": 853, - }, - }, - ) - - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets(marginal={}, continuous={"income": 30.0}), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [0.5, 1.5], - "state_fips": [6, 48], - "snap": [0.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11], - "household_id": [1, 2], - "tax_unit_id": [101, 102], - "spm_unit_id": [201, 202], - "family_id": [301, 302], - "marital_unit_id": [401, 402], - "age": [35.0, 62.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["SINGLE", "JOINT"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - 
{"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - baseline_dataset = tmp_path / "baseline.h5" - baseline_dataset.write_text("baseline") - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle-native", - policyengine_baseline_dataset=baseline_dataset, - ) - - assert paths.policyengine_native_scores is not None - assert paths.policyengine_native_scores.exists() - - manifest = json.loads(paths.manifest.read_text()) - assert ( - manifest["artifacts"]["policyengine_native_scores"] - == "policyengine_native_scores.json" - ) - assert paths.run_registry is not None - assert paths.run_registry.exists() - assert ( - manifest["policyengine_native_scores"]["candidate_enhanced_cps_native_loss"] - == 0.25 - ) - assert ( - manifest["policyengine_native_scores"]["candidate_beats_baseline"] is True - ) - assert ( - manifest["run_registry"]["default_frontier_metric"] - == "enhanced_cps_native_loss_delta" - ) - - registry_entries = load_us_microplex_run_registry(paths.run_registry) - assert len(registry_entries) == 1 - assert registry_entries[0].candidate_beats_baseline_native_loss is True - - def test_uses_precomputed_policyengine_native_scores_without_recomputing( - self, monkeypatch, tmp_path - ) -> None: - def _boom(**_kwargs): - raise AssertionError("native scorer should not be called") - - monkeypatch.setattr( - "microplex_us.pipelines.artifacts.compute_us_pe_native_scores", - _boom, - ) - - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame( - {"income": [10.0, 20.0], "weight": [0.5, 1.5]} - ), - targets=USMicroplexTargets(marginal={}, continuous={"income": 30.0}), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - 
synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [0.5, 1.5], - "state_fips": [6, 48], - "snap": [0.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11], - "household_id": [1, 2], - "tax_unit_id": [101, 102], - "spm_unit_id": [201, 202], - "family_id": [301, 302], - "marital_unit_id": [401, 402], - "age": [35.0, 62.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["SINGLE", "JOINT"], - } - ), - spm_units=pd.DataFrame( - {"spm_unit_id": [201, 202], "household_id": [1, 2]} - ), - families=pd.DataFrame( - {"family_id": [301, 302], "household_id": [1, 2]} - ), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - payload = { - "metric": "enhanced_cps_native_loss", - "summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": -0.25, - "candidate_beats_baseline": True, - "candidate_unweighted_msre": 0.3, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": -0.3, - "n_targets_total": 2863, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 2000, - "n_state_targets": 853, - }, - } - - paths = save_us_microplex_artifacts( - result, - tmp_path / "bundle-native-precomputed", - precomputed_policyengine_native_scores=payload, - ) - - assert paths.policyengine_native_scores is not None - assert json.loads(paths.policyengine_native_scores.read_text()) == payload diff --git a/tests/pipelines/test_backfill_pe_native_audit.py b/tests/pipelines/test_backfill_pe_native_audit.py deleted file mode 100644 index 84308ed9..00000000 --- a/tests/pipelines/test_backfill_pe_native_audit.py +++ /dev/null @@ -1,475 +0,0 @@ -"""Tests for historical PE rebuild 
native-audit backfill.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from microplex_us.pipelines.backfill_pe_native_audit import ( - backfill_us_pe_native_audit_bundle, - backfill_us_pe_native_audit_bundles, - backfill_us_pe_native_audit_root, -) - - -def test_backfill_us_pe_native_audit_root_updates_manifest_and_snapshot( - monkeypatch, - tmp_path, -) -> None: - artifact_root = tmp_path / "live_runs" - bundle_dir = artifact_root / "run-1" - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text("candidate") - (bundle_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss", - "summary": { - "enhanced_cps_native_loss_delta": 0.25, - }, - } - ) - ) - (bundle_dir / "data_flow_snapshot.json").write_text( - json.dumps( - { - "schemaVersion": 1, - "stages": [ - { - "id": "benchmark", - "outputs": ["policyengine_native_scores.json"], - "metrics": [], - "status": "ready", - } - ], - } - ) - ) - - manifest = { - "created_at": "2026-03-29T12:00:00+00:00", - "config": { - "policyengine_baseline_dataset": str((tmp_path / "baseline.h5").resolve()), - "policyengine_dataset_year": 2024, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": {"converged": True}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - }, - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": 0.25, - }, - } - manifest_path = bundle_dir / "manifest.json" - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_target_deltas", - lambda **_kwargs: [ - { - "metric": "enhanced_cps_native_loss_target_delta", - "to_dataset": str((bundle_dir / 
"policyengine_us.h5").resolve()), - "top_regressions": [{"target_name": "nation/irs/example"}], - "top_improvements": [], - } - ], - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_support_audits", - lambda **_kwargs: [ - { - "metric": "enhanced_cps_support_audit", - "candidate_dataset": str((bundle_dir / "policyengine_us.h5").resolve()), - "comparisons": { - "critical_input_support": [ - { - "variable": "has_esi", - "candidate_stored": False, - "baseline_stored": True, - "weighted_nonzero_delta": -10.0, - } - ] - }, - } - ], - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.build_policyengine_us_data_rebuild_native_audit", - lambda *args, **kwargs: { - "artifactId": "run-1", - "period": 2024, - "targetDelta": { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": str((tmp_path / "baseline.h5").resolve()), - "to_dataset": str((bundle_dir / "policyengine_us.h5").resolve()), - "summary": {"n_targets": 1, "to_win_rate": 1.0}, - "family_summaries": [{"target_family": "national_irs_other"}], - "scope_summaries": [{"target_scope": "national"}], - "targets": [ - { - "target_name": "nation/irs/example", - "target_family": "national_irs_other", - "target_scope": "national", - "winner": "to", - "weighted_term_delta": -1.0, - "from_weighted_term": 2.0, - "to_weighted_term": 1.0, - "target_value": 100.0, - "from_estimate": 90.0, - "to_estimate": 95.0, - "from_rel_error": 0.2, - "to_rel_error": 0.1, - } - ], - "top_regressions": [], - "top_improvements": [], - }, - "verdictHints": { - "largestRegressingFamily": "national_irs_other", - "productionImputationVariant": "structured_pe_conditioning", - "productionImputationVariantIsMaeWinner": False, - "productionImputationVariantIsSupportWinner": True, - }, - }, - ) - - manifest_paths = backfill_us_pe_native_audit_root(artifact_root) - - assert manifest_paths == [manifest_path] - updated_manifest = 
json.loads(manifest_path.read_text()) - assert ( - updated_manifest["artifacts"]["policyengine_native_audit"] - == "pe_us_data_rebuild_native_audit.json" - ) - assert ( - updated_manifest["artifacts"]["policyengine_native_target_diagnostics"] - == "pe_native_target_diagnostics.json" - ) - target_diagnostics = json.loads( - (bundle_dir / "pe_native_target_diagnostics.json").read_text() - ) - assert target_diagnostics["artifact_id"] == "run-1" - assert target_diagnostics["run_id"] == "run-1" - assert target_diagnostics["targets"][0]["artifact_id"] == "run-1" - assert target_diagnostics["targets"][0]["delta_absolute_error"] == -5.0 - assert ( - updated_manifest["policyengine_native_audit"][ - "productionImputationVariantIsSupportWinner" - ] - is True - ) - snapshot = json.loads((bundle_dir / "data_flow_snapshot.json").read_text()) - benchmark = next( - stage - for stage in snapshot["stages"] - if stage["id"] == "09_validation_benchmarking" - ) - assert benchmark["outputs"] == [ - "policyengine_native_scores.json", - "pe_us_data_rebuild_native_audit.json", - "pe_native_target_diagnostics.json", - ] - - -def test_backfill_us_pe_native_audit_bundle_reuses_existing_sidecar_without_recomputing( - monkeypatch, - tmp_path, -) -> None: - bundle_dir = tmp_path / "run-1" - bundle_dir.mkdir() - (bundle_dir / "policyengine_us.h5").write_text("candidate") - (bundle_dir / "policyengine_native_scores.json").write_text( - json.dumps({"metric": "enhanced_cps_native_loss", "summary": {}}) - ) - (bundle_dir / "pe_us_data_rebuild_native_audit.json").write_text( - json.dumps( - { - "artifactId": "run-1", - "verdictHints": { - "largestRegressingFamily": "national_irs_other", - "productionImputationVariantIsMaeWinner": False, - }, - } - ) - ) - manifest = { - "created_at": "2026-03-29T12:00:00+00:00", - "config": { - "policyengine_baseline_dataset": str((tmp_path / "baseline.h5").resolve()), - "policyengine_dataset_year": 2024, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, 
- "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": {"converged": True}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - }, - "policyengine_native_scores": {}, - } - manifest_path = bundle_dir / "manifest.json" - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.build_policyengine_us_data_rebuild_native_audit", - lambda *args, **kwargs: (_ for _ in ()).throw( - AssertionError("should not recompute when audit sidecar already exists") - ), - ) - - returned_manifest_path = backfill_us_pe_native_audit_bundle(bundle_dir) - - assert returned_manifest_path == manifest_path - updated_manifest = json.loads(manifest_path.read_text()) - assert ( - updated_manifest["artifacts"]["policyengine_native_audit"] - == "pe_us_data_rebuild_native_audit.json" - ) - assert ( - updated_manifest["policyengine_native_audit"]["largestRegressingFamily"] - == "national_irs_other" - ) - - -def test_backfill_us_pe_native_audit_bundles_uses_grouped_batch_helpers( - monkeypatch, - tmp_path, -) -> None: - artifact_root = tmp_path / "live_runs" - bundle_dirs = [artifact_root / "run-1", artifact_root / "run-2"] - baseline_path = tmp_path / "baseline.h5" - baseline_path.write_text("baseline") - - for index, bundle_dir in enumerate(bundle_dirs, start=1): - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text(f"candidate-{index}") - (bundle_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "enhanced_cps_native_loss_delta": float(index), - }, - } - ) - ) - manifest = { - "created_at": f"2026-03-29T12:00:0{index}+00:00", - "config": { - "policyengine_baseline_dataset": str(baseline_path.resolve()), - "policyengine_dataset_year": 2024, - }, - 
"rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": {"converged": True}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - }, - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": float(index), - }, - } - (bundle_dir / "manifest.json").write_text( - json.dumps(manifest, indent=2, sort_keys=True) - ) - - captured: dict[str, object] = {} - - def fake_batch_target_deltas(**kwargs): - captured["target_kwargs"] = kwargs - return [ - { - "metric": "enhanced_cps_native_loss_target_delta", - "to_dataset": str((bundle_dirs[0] / "policyengine_us.h5").resolve()), - "top_regressions": [{"target_name": "target-a"}], - "top_improvements": [], - }, - { - "metric": "enhanced_cps_native_loss_target_delta", - "to_dataset": str((bundle_dirs[1] / "policyengine_us.h5").resolve()), - "top_regressions": [{"target_name": "target-b"}], - "top_improvements": [], - }, - ] - - def fake_batch_support_audits(**kwargs): - captured["support_kwargs"] = kwargs - return [ - { - "metric": "enhanced_cps_support_audit", - "candidate_dataset": str((bundle_dirs[0] / "policyengine_us.h5").resolve()), - "comparisons": { - "critical_input_support": [ - { - "variable": "has_esi", - "candidate_stored": False, - "baseline_stored": True, - "weighted_nonzero_delta": -10.0, - } - ] - }, - }, - { - "metric": "enhanced_cps_support_audit", - "candidate_dataset": str((bundle_dirs[1] / "policyengine_us.h5").resolve()), - "comparisons": { - "critical_input_support": [ - { - "variable": "rental_income", - "candidate_stored": True, - "baseline_stored": True, - "weighted_nonzero_delta": -2.0, - } - ] - }, - }, - ] - - def fake_build_audit( - artifact_dir, - *, - manifest_payload, - native_scores_payload, - target_delta_payload, - support_audit_payload, - **_kwargs, - ): - return { - "artifactId": 
Path(artifact_dir).name, - "nativeBroadLossSummary": dict(native_scores_payload.get("summary", {})), - "topTargetRegressions": list(target_delta_payload.get("top_regressions", ())), - "supportAuditSummary": { - "missingStoredCriticalInputs": [ - row["variable"] - for row in support_audit_payload["comparisons"]["critical_input_support"] - if row.get("baseline_stored") and not row.get("candidate_stored") - ] - }, - "verdictHints": { - "largestRegressingTarget": target_delta_payload["top_regressions"][0]["target_name"], - "missingStoredCriticalInputs": [ - row["variable"] - for row in support_audit_payload["comparisons"]["critical_input_support"] - if row.get("baseline_stored") and not row.get("candidate_stored") - ], - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_target_deltas", - fake_batch_target_deltas, - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_support_audits", - fake_batch_support_audits, - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.build_policyengine_us_data_rebuild_native_audit", - fake_build_audit, - ) - - manifest_paths = backfill_us_pe_native_audit_bundles(bundle_dirs) - - assert len(manifest_paths) == 2 - assert captured["target_kwargs"]["baseline_dataset_path"] == baseline_path.resolve() - assert captured["support_kwargs"]["baseline_dataset_path"] == baseline_path.resolve() - assert captured["target_kwargs"]["candidate_dataset_paths"] == [ - bundle_dirs[0] / "policyengine_us.h5", - bundle_dirs[1] / "policyengine_us.h5", - ] - updated_manifest = json.loads((bundle_dirs[0] / "manifest.json").read_text()) - assert updated_manifest["policyengine_native_audit"]["largestRegressingTarget"] == "target-a" - assert updated_manifest["policyengine_native_audit"]["missingStoredCriticalInputs"] == [ - "has_esi" - ] - - -def test_backfill_us_pe_native_audit_bundles_skips_missing_native_scores( - monkeypatch, - 
tmp_path, -) -> None: - artifact_root = tmp_path / "live_runs" - skipped_bundle = artifact_root / "run-missing-scores" - ready_bundle = artifact_root / "run-ready" - baseline_path = tmp_path / "baseline.h5" - baseline_path.write_text("baseline") - - for bundle_dir in (skipped_bundle, ready_bundle): - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text(bundle_dir.name) - manifest = { - "created_at": "2026-03-29T12:00:00+00:00", - "config": { - "policyengine_baseline_dataset": str(baseline_path.resolve()), - "policyengine_dataset_year": 2024, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": {"converged": True}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - }, - } - (bundle_dir / "manifest.json").write_text( - json.dumps(manifest, indent=2, sort_keys=True) - ) - - (ready_bundle / "policyengine_native_scores.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": {"enhanced_cps_native_loss_delta": 0.5}, - } - ) - ) - ready_manifest = json.loads((ready_bundle / "manifest.json").read_text()) - ready_manifest["artifacts"]["policyengine_native_scores"] = "policyengine_native_scores.json" - (ready_bundle / "manifest.json").write_text( - json.dumps(ready_manifest, indent=2, sort_keys=True) - ) - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_target_deltas", - lambda **_kwargs: [ - { - "metric": "enhanced_cps_native_loss_target_delta", - "to_dataset": str((ready_bundle / "policyengine_us.h5").resolve()), - "top_regressions": [{"target_name": "target-ready"}], - "top_improvements": [], - } - ], - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.compute_batch_us_pe_native_support_audits", - lambda **_kwargs: [ - { - "metric": "enhanced_cps_support_audit", - 
"candidate_dataset": str((ready_bundle / "policyengine_us.h5").resolve()), - "comparisons": {"critical_input_support": []}, - } - ], - ) - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_audit.build_policyengine_us_data_rebuild_native_audit", - lambda artifact_dir, **_kwargs: { - "artifactId": Path(artifact_dir).name, - "verdictHints": {"largestRegressingTarget": "target-ready"}, - }, - ) - - manifest_paths = backfill_us_pe_native_audit_bundles([skipped_bundle, ready_bundle]) - - assert manifest_paths == [ready_bundle / "manifest.json"] - assert not (skipped_bundle / "pe_us_data_rebuild_native_audit.json").exists() - assert (ready_bundle / "pe_us_data_rebuild_native_audit.json").exists() diff --git a/tests/pipelines/test_backfill_pe_native_scores.py b/tests/pipelines/test_backfill_pe_native_scores.py deleted file mode 100644 index a81c0a32..00000000 --- a/tests/pipelines/test_backfill_pe_native_scores.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Tests for historical PE-native score backfill.""" - -from __future__ import annotations - -import json - -from microplex_us.pipelines.backfill_pe_native_scores import ( - backfill_us_pe_native_scores_bundles, - backfill_us_pe_native_scores_root, -) -from microplex_us.pipelines.registry import load_us_microplex_run_registry - - -def test_backfill_us_pe_native_scores_root_updates_manifest_and_registry( - monkeypatch, - tmp_path, -) -> None: - artifact_root = tmp_path / "live_runs" - bundle_dir = artifact_root / "run-1" - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text("candidate") - (tmp_path / "baseline.h5").write_text("baseline") - - manifest = { - "created_at": "2026-03-29T12:00:00+00:00", - "config": { - "synthesis_backend": "bootstrap", - "calibration_backend": "entropy", - "policyengine_baseline_dataset": str((tmp_path / "baseline.h5").resolve()), - "policyengine_dataset_year": 2024, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, 
"total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": { - "converged": True, - "weight_collapse_suspected": False, - }, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - }, - } - manifest_path = bundle_dir / "manifest.json" - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_scores.compute_us_pe_native_scores", - lambda **_kwargs: { - "metric": "enhanced_cps_native_loss", - "summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": -0.25, - "candidate_beats_baseline": True, - "candidate_unweighted_msre": 0.3, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": -0.3, - "n_targets_total": 2865, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 677, - "n_state_targets": 2176, - }, - "broad_loss": { - "metric": "enhanced_cps_native_loss", - }, - }, - ) - - manifest_paths = backfill_us_pe_native_scores_root(artifact_root) - - assert manifest_paths == [manifest_path] - sidecar_path = bundle_dir / "policyengine_native_scores.json" - assert sidecar_path.exists() - - updated_manifest = json.loads(manifest_path.read_text()) - stage_manifest = json.loads((bundle_dir / "stage_manifest.json").read_text()) - validation_evidence = json.loads( - ( - bundle_dir - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ).read_text() - ) - assert ( - updated_manifest["artifacts"]["policyengine_native_scores"] - == "policyengine_native_scores.json" - ) - assert updated_manifest["artifacts"]["stage_manifest"] == "stage_manifest.json" - assert ( - updated_manifest["artifacts"]["validation_evidence"] - == "stage_artifacts/09_validation_benchmarking/evidence_manifest.json" - ) - assert updated_manifest["policyengine_native_scores"]["candidate_beats_baseline"] is True - 
assert ( - updated_manifest["run_registry"]["default_frontier_metric"] - == "enhanced_cps_native_loss_delta" - ) - stage9 = next( - stage - for stage in stage_manifest["stages"] - if stage["id"] == "09_validation_benchmarking" - ) - assert stage9["status"] == "ready" - assert validation_evidence["evidence"][0]["key"] == "policyengine_native_scores" - assert validation_evidence["evidence"][0]["exists"] is True - - registry_path = artifact_root / "run_registry.jsonl" - assert registry_path.exists() - registry_entries = load_us_microplex_run_registry(registry_path) - assert len(registry_entries) == 1 - assert registry_entries[0].candidate_beats_baseline_native_loss is True - - -def test_backfill_us_pe_native_scores_bundles_uses_batch_scorer( - monkeypatch, - tmp_path, -) -> None: - artifact_root = tmp_path / "live_runs" - bundle_dirs = [artifact_root / "run-1", artifact_root / "run-2"] - baseline_path = tmp_path / "baseline.h5" - baseline_path.write_text("baseline") - - for index, bundle_dir in enumerate(bundle_dirs, start=1): - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text(f"candidate-{index}") - manifest = { - "created_at": f"2026-03-29T12:00:0{index}+00:00", - "config": { - "synthesis_backend": "bootstrap", - "calibration_backend": "entropy", - "policyengine_baseline_dataset": str(baseline_path.resolve()), - "policyengine_dataset_year": 2024, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": {"source_names": ["cps", "puf"]}, - "calibration": { - "converged": True, - "weight_collapse_suspected": False, - }, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - }, - } - (bundle_dir / "manifest.json").write_text( - json.dumps(manifest, indent=2, sort_keys=True) - ) - - monkeypatch.setattr( - "microplex_us.pipelines.backfill_pe_native_scores.compute_batch_us_pe_native_scores", - lambda **_kwargs: [ - { - "metric": "enhanced_cps_native_loss", - 
"summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": -0.25, - "candidate_beats_baseline": True, - "candidate_unweighted_msre": 0.3, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": -0.3, - "n_targets_total": 2865, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 677, - "n_state_targets": 2176, - }, - "broad_loss": {"metric": "enhanced_cps_native_loss"}, - }, - { - "metric": "enhanced_cps_native_loss", - "summary": { - "candidate_enhanced_cps_native_loss": 0.75, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": 0.25, - "candidate_beats_baseline": False, - "candidate_unweighted_msre": 0.8, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": 0.2, - "n_targets_total": 2865, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 677, - "n_state_targets": 2176, - }, - "broad_loss": {"metric": "enhanced_cps_native_loss"}, - }, - ], - ) - - manifest_paths = backfill_us_pe_native_scores_bundles(bundle_dirs) - - assert len(manifest_paths) == 2 - registry_entries = load_us_microplex_run_registry(artifact_root / "run_registry.jsonl") - assert len(registry_entries) == 2 - assert registry_entries[0].candidate_beats_baseline_native_loss is True - assert registry_entries[1].candidate_beats_baseline_native_loss is False diff --git a/tests/pipelines/test_calibration_stage_parity.py b/tests/pipelines/test_calibration_stage_parity.py deleted file mode 100644 index 2abf9edd..00000000 --- a/tests/pipelines/test_calibration_stage_parity.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Tests for calibration-stage parity auditing.""" - -from __future__ import annotations - -import h5py -import numpy as np -import pandas as pd -import pytest - -from microplex_us.pipelines.calibration_stage_parity import ( - 
build_us_calibration_stage_parity_audit, -) -from microplex_us.pipelines.pre_sim_parity import PreSimParityVariableSpec - - -def _write_period_dataset(path, data: dict[str, np.ndarray], *, period: int = 2024) -> None: - with h5py.File(path, "w") as handle: - for variable, values in data.items(): - group = handle.create_group(variable) - group.create_dataset(str(period), data=values) - - -def test_build_us_calibration_stage_parity_audit_reports_weight_lift_and_reference_support( - tmp_path, -) -> None: - synthetic_path = tmp_path / "synthetic.parquet" - calibrated_path = tmp_path / "calibrated.parquet" - reference_path = tmp_path / "reference.h5" - - pd.DataFrame( - { - "household_id": [1, 1, 2, 2], - "weight": [1.0, 1.0, 1.0, 1.0], - "health_savings_account_ald": [100.0, 0.0, 0.0, 0.0], - "has_esi": [1.0, 0.0, 0.0, 1.0], - } - ).to_parquet(synthetic_path, index=False) - pd.DataFrame( - { - "household_id": [1, 1, 2, 2], - "weight": [5.0, 5.0, 1.0, 1.0], - "health_savings_account_ald": [100.0, 0.0, 0.0, 0.0], - "has_esi": [True, False, False, True], - } - ).to_parquet(calibrated_path, index=False) - - _write_period_dataset( - reference_path, - { - "household_id": np.array([1, 2], dtype=int), - "household_weight": np.array([2.0, 1.0], dtype=float), - "person_id": np.array([10, 11, 20, 21], dtype=int), - "person_household_id": np.array([1, 1, 2, 2], dtype=int), - "tax_unit_id": np.array([100, 200], dtype=int), - "person_tax_unit_id": np.array([100, 100, 200, 200], dtype=int), - "health_savings_account_ald": np.array([100.0, 0.0], dtype=float), - "has_esi": np.array([True, False, False, True], dtype=bool), - }, - ) - - audit = build_us_calibration_stage_parity_audit( - synthetic_path, - calibrated_path, - reference_dataset=reference_path, - focus_variables=( - PreSimParityVariableSpec( - "health_savings_account_ald", - "health_savings_account_ald", - value_kind="numeric", - ), - PreSimParityVariableSpec("has_esi", "has_esi", value_kind="categorical"), - ), - ) - - 
synthetic_weights = audit["weightDiagnostics"]["synthetic"] - calibrated_weights = audit["weightDiagnostics"]["calibrated"] - assert synthetic_weights["total_weight"] == pytest.approx(2.0) - assert calibrated_weights["total_weight"] == pytest.approx(6.0) - assert calibrated_weights["effective_sample_size"] < synthetic_weights["effective_sample_size"] - - hsa = audit["focusVariables"]["health_savings_account_ald"] - assert hsa["calibrated_vs_synthetic"]["type"] == "numeric" - assert hsa["calibrated_vs_synthetic"]["weighted_sum_ratio"] == pytest.approx(5.0) - assert hsa["calibrated_vs_reference"]["weighted_sum_ratio"] == pytest.approx(2.5) - - has_esi = audit["focusVariables"]["has_esi"] - assert has_esi["calibrated_vs_synthetic"]["type"] == "categorical" - assert has_esi["calibrated_vs_synthetic"]["support_recall"] == 1.0 - assert has_esi["calibrated_vs_reference"]["support_precision"] == 1.0 diff --git a/tests/pipelines/test_cd_age_reweighting.py b/tests/pipelines/test_cd_age_reweighting.py deleted file mode 100644 index 2531feca..00000000 --- a/tests/pipelines/test_cd_age_reweighting.py +++ /dev/null @@ -1,139 +0,0 @@ -from __future__ import annotations - -import sqlite3 - -import h5py -import numpy as np - -from microplex_us.pipelines.cd_age_reweighting import ( - normalize_at_large_cd_geoids, - reweight_h5_to_cd_age_targets, -) - - -def test_normalize_at_large_cd_geoids_maps_statewide_zero_to_one() -> None: - values = np.asarray([200, 201, 1000, 3601, 0], dtype=np.int64) - - normalized = normalize_at_large_cd_geoids(values) - - np.testing.assert_array_equal( - normalized, - np.asarray([201, 201, 1001, 3601, 0], dtype=np.int64), - ) - - -def test_reweight_h5_to_cd_age_targets_matches_simple_at_large_targets(tmp_path) -> None: - dataset = tmp_path / "input.h5" - output = tmp_path / "output.h5" - db = tmp_path / "policy_data.db" - _write_minimal_h5(dataset) - _write_cd_age_target_db(db) - - summary = reweight_h5_to_cd_age_targets( - input_dataset=dataset, - 
target_db=db, - output_dataset=output, - period=2024, - max_iter=100, - preserve_district_weight_sum=False, - ) - - assert summary["n_targets"] == 2 - assert summary["max_abs_relative_error_after"] < 1e-5 - with h5py.File(output, "r") as handle: - np.testing.assert_allclose( - handle["household_weight"]["2024"][:], - np.asarray([10.0, 20.0], dtype=np.float32), - rtol=1e-5, - ) - np.testing.assert_array_equal( - handle["congressional_district_geoid"]["2024"][:], - np.asarray([201, 201]), - ) - - -def _write_minimal_h5(path): - with h5py.File(path, "w") as handle: - _write_period(handle, "household_id", [1, 2]) - _write_period(handle, "household_weight", [1.0, 1.0]) - _write_period(handle, "congressional_district_geoid", [200, 200]) - _write_period(handle, "person_household_id", [1, 2]) - _write_period(handle, "age", [4, 40]) - - -def _write_period(handle, variable, values): - group = handle.create_group(variable) - group.create_dataset("2024", data=np.asarray(values)) - - -def _write_cd_age_target_db(path): - conn = sqlite3.connect(path) - try: - conn.executescript( - """ - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT, - period INTEGER, - stratum_id INTEGER, - reform_id INTEGER DEFAULT 0, - value REAL, - active INTEGER DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - CREATE TABLE stratum_constraints ( - stratum_id INTEGER, - constraint_variable TEXT, - operation TEXT, - value TEXT - ); - CREATE VIEW target_overview AS - SELECT - target_id, - stratum_id, - variable, - value, - period, - active, - 'district' AS geo_level, - '201' AS geographic_id, - 'age' AS domain_variable - FROM targets; - """ - ) - _insert_target(conn, 1, 101, 10.0, [("age", "<", "18"), ("age", ">", "-1")]) - _insert_target(conn, 2, 102, 20.0, [("age", ">=", "18")]) - conn.commit() - finally: - conn.close() - - -def _insert_target(conn, 
target_id, stratum_id, value, constraints): - conn.execute( - """ - INSERT INTO targets - (target_id, variable, period, stratum_id, reform_id, value, active) - VALUES (?, 'person_count', 2024, ?, 0, ?, 1) - """, - (target_id, stratum_id, value), - ) - conn.execute("INSERT INTO strata (stratum_id) VALUES (?)", (stratum_id,)) - for constraint in [ - ("congressional_district_geoid", "==", "201"), - *constraints, - ]: - conn.execute( - """ - INSERT INTO stratum_constraints - (stratum_id, constraint_variable, operation, value) - VALUES (?, ?, ?, ?) - """, - (stratum_id, *constraint), - ) diff --git a/tests/pipelines/test_check_export_columns.py b/tests/pipelines/test_check_export_columns.py deleted file mode 100644 index 94b14738..00000000 --- a/tests/pipelines/test_check_export_columns.py +++ /dev/null @@ -1,716 +0,0 @@ -"""Tests for the fast eCPS column-parity check CLI. - -The module under test is loaded directly from its file path (not via -``import microplex_us...``) so these tests run with only ``pytest`` / -``h5py`` / ``numpy`` installed -- importing the ``microplex_us`` package -would pull ``microplex`` and torch. This mirrors the loader pattern in -``test_mp300k_artifact_gates.py``. -""" - -from __future__ import annotations - -import importlib.util -import json -import sys -from pathlib import Path - -import pytest - -_MODULE_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "check_export_columns.py" -) -_spec = importlib.util.spec_from_file_location("check_export_columns", _MODULE_PATH) -cec = importlib.util.module_from_spec(_spec) -# Register before exec so the module's @dataclass can resolve its module. 
-sys.modules["check_export_columns"] = cec -_spec.loader.exec_module(cec) - -DEFAULT_CONTRACT_PATH = cec.DEFAULT_CONTRACT_PATH -DEFAULT_SPEC_PATH = cec.DEFAULT_SPEC_PATH -compute_column_diff = cec.compute_column_diff -compute_spec_variable_manifest_diff = cec.compute_spec_variable_manifest_diff -load_contract = cec.load_contract -main = cec.main - -# A tiny self-contained contract so most tests do not depend on the -# (large) committed contract. -TINY_CONTRACT = { - "required": ["age", "snap", "employment_income"], - "ecps_internal_optional": ["person_is_puf_clone"], - "forbidden": ["snap_reported", "ssi_reported"], -} - - -def _write_json(path: Path, obj) -> Path: - path.write_text(json.dumps(obj)) - return path - - -@pytest.fixture -def contract_path(tmp_path: Path) -> Path: - return _write_json(tmp_path / "contract.json", TINY_CONTRACT) - - -def _run_columns( - tmp_path: Path, - contract_path: Path, - columns: list[str], -) -> int: - cols_path = _write_json(tmp_path / "cols.json", columns) - return main( - [ - "--columns-json", - str(cols_path), - "--contract", - str(contract_path), - ] - ) - - -def _write_period_h5(path: Path, columns: dict[str, list[object]]) -> Path: - h5py = pytest.importorskip("h5py") - import numpy as np - - with h5py.File(path, "w") as f: - for column, values in columns.items(): - f.create_dataset(f"{column}/2024", data=np.asarray(values)) - return path - - -def test_main_clean_list_returns_zero(tmp_path, contract_path): - # required + optional, no forbidden -> pass. - cols = ["age", "snap", "employment_income", "person_is_puf_clone"] - assert _run_columns(tmp_path, contract_path, cols) == 0 - - -def test_main_missing_required_returns_one(tmp_path, contract_path): - # Drop a required column. - cols = ["age", "snap"] # missing employment_income - assert _run_columns(tmp_path, contract_path, cols) == 1 - - -def test_main_forbidden_present_returns_one(tmp_path, contract_path): - # All required present, but a forbidden column is exported. 
- cols = ["age", "snap", "employment_income", "snap_reported"] - assert _run_columns(tmp_path, contract_path, cols) == 1 - - -def test_columns_json_path_collapses_period_suffix(tmp_path, contract_path): - # "name/period" entries collapse to the base name and still pass. - cols = ["age/2024", "snap/2024", "employment_income/2024"] - assert _run_columns(tmp_path, contract_path, cols) == 0 - - -def test_optional_column_is_neither_required_nor_forbidden(tmp_path, contract_path): - # Omitting an optional column does not fail; it is not "missing". - cols = ["age", "snap", "employment_income"] - assert _run_columns(tmp_path, contract_path, cols) == 0 - - -def test_main_h5_path_returns_zero_when_clean(tmp_path, contract_path): - h5py = pytest.importorskip("h5py") - import numpy as np - - # Mirror the eCPS export layout: each column is a group /. - h5_path = tmp_path / "export.h5" - with h5py.File(h5_path, "w") as f: - for col in ["age", "snap", "employment_income"]: - f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) - rc = main([str(h5_path), "--contract", str(contract_path)]) - assert rc == 0 - - -def test_main_h5_path_flags_missing_required(tmp_path, contract_path): - h5py = pytest.importorskip("h5py") - import numpy as np - - h5_path = tmp_path / "export.h5" - with h5py.File(h5_path, "w") as f: - # missing employment_income - for col in ["age", "snap"]: - f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) - rc = main([str(h5_path), "--contract", str(contract_path)]) - assert rc == 1 - - -def test_main_h5_path_flags_forbidden_present(tmp_path, contract_path): - h5py = pytest.importorskip("h5py") - import numpy as np - - h5_path = tmp_path / "export.h5" - with h5py.File(h5_path, "w") as f: - for col in ["age", "snap", "employment_income", "snap_reported"]: - f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) - rc = main([str(h5_path), "--contract", str(contract_path)]) - assert rc == 1 - - -def test_main_h5_path_accepts_flat_datasets(tmp_path, 
contract_path): - # A flat dataset layout (no period sub-group) is also accepted. - h5py = pytest.importorskip("h5py") - import numpy as np - - h5_path = tmp_path / "export.h5" - with h5py.File(h5_path, "w") as f: - for col in ["age", "snap", "employment_income"]: - f.create_dataset(col, data=np.array([1, 2, 3])) - rc = main([str(h5_path), "--contract", str(contract_path)]) - assert rc == 0 - - -def test_support_baseline_rejects_numeric_column_eCPS_populates( - tmp_path, - contract_path, -): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 0.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - ] - ) - - assert rc == 1 - - -def test_support_baseline_rejects_missing_numeric_sign_support( - tmp_path, -): - contract_path = _write_json( - tmp_path / "contract.json", - { - "required": ["age", "snap", "rental_income"], - "ecps_internal_optional": [], - "forbidden": [], - }, - ) - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "rental_income": [0.0, 12_000.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "rental_income": [-200.0, 12_000.0, 0.0], - }, - ) - diagnostics = tmp_path / "support.json" - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - "--support-diagnostics-json", - str(diagnostics), - ] - ) - - assert rc == 1 - payload = json.loads(diagnostics.read_text()) - assert payload["issues"][0]["column"] == "rental_income" - assert payload["issues"][0]["requirement"] == "numeric_signed" - assert 
payload["issues"][0]["baseline"]["negative_count"] == 1 - assert payload["issues"][0]["candidate"]["negative_count"] == 0 - - -def test_support_baseline_accepts_negative_noise_for_unsigned_numeric( - tmp_path, - contract_path, -): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [-200.0, 12_000.0, 0.0], - }, - ) - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - ] - ) - - assert rc == 0 - - -def test_support_baseline_rejects_categorical_column_eCPS_varies( - tmp_path, - contract_path, -): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, False, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - ] - ) - - assert rc == 1 - - -def test_support_baseline_ignores_ecps_filler_columns(tmp_path, contract_path): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 0.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 0.0, 0.0], - }, - ) - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - ] - ) - - assert rc == 0 - - -def test_support_baseline_accepts_candidate_categorical_support_for_numeric_ecps( - tmp_path, - contract_path, 
-): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [b"34", b"42", b"50"], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - ] - ) - - assert rc == 0 - - -def test_support_baseline_writes_diagnostics_and_honors_explicit_exemption( - tmp_path, - contract_path, -): - candidate = _write_period_h5( - tmp_path / "candidate.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 0.0, 0.0], - }, - ) - baseline = _write_period_h5( - tmp_path / "baseline.h5", - { - "age": [34, 42, 50], - "snap": [False, True, False], - "employment_income": [0.0, 12_000.0, 0.0], - }, - ) - diagnostics = tmp_path / "support.json" - - rc = main( - [ - str(candidate), - "--contract", - str(contract_path), - "--support-baseline", - str(baseline), - "--support-exempt-column", - "employment_income", - "--support-diagnostics-json", - str(diagnostics), - ] - ) - - assert rc == 0 - payload = json.loads(diagnostics.read_text()) - assert payload["issues"] == [] - assert payload["exempt_columns"] == ["employment_income"] - - -def test_main_entity_tables_path_uses_schema_columns( - tmp_path, contract_path, monkeypatch -): - checkpoint_dir = tmp_path / "post-imputation" - - def fake_columns(path, *, direct_override_variables): - assert path == checkpoint_dir - assert direct_override_variables == ("non_sch_d_capital_gains",) - return {"age", "snap", "employment_income"} - - monkeypatch.setattr(cec, "_columns_from_entity_tables", fake_columns) - - rc = main( - [ - "--entity-tables", - str(checkpoint_dir), - "--direct-override-variable", - "non_sch_d_capital_gains", - "--contract", - str(contract_path), - ] - ) - - assert rc 
== 0 - - -def test_main_explicit_spec_variable_manifest_failure_returns_one( - tmp_path, - contract_path, -): - spec_path = tmp_path / "spec.yaml" - spec_path.write_text( - """ -meta: {country: us, model_year: 2024} -imputation: - - onto: synthetic_puf - from: puf - vars: [employment_income] -variables: - age: - mp_spec: {method: passthrough} - snap: - mp_spec: {method: passthrough} -""", - encoding="utf-8", - ) - - cols_path = _write_json( - tmp_path / "cols.json", - ["age", "snap", "employment_income"], - ) - rc = main( - [ - "--columns-json", - str(cols_path), - "--contract", - str(contract_path), - "--spec", - str(spec_path), - ] - ) - - assert rc == 1 - - -def test_main_requires_exactly_one_input(tmp_path, contract_path): - # Neither input -> argparse error (SystemExit code 2). - with pytest.raises(SystemExit) as exc: - main(["--contract", str(contract_path)]) - assert exc.value.code == 2 - - # Both inputs -> argparse error. - cols_path = _write_json(tmp_path / "c.json", ["age"]) - with pytest.raises(SystemExit) as exc: - main( - [ - str(tmp_path / "x.h5"), - "--columns-json", - str(cols_path), - "--contract", - str(contract_path), - ] - ) - assert exc.value.code == 2 - - with pytest.raises(SystemExit) as exc: - main( - [ - "--columns-json", - str(cols_path), - "--support-baseline", - str(tmp_path / "baseline.h5"), - "--contract", - str(contract_path), - ] - ) - assert exc.value.code == 2 - - -def test_compute_column_diff_categories(): - diff = compute_column_diff( - {"age", "snap", "snap_reported", "mystery"}, - required={"age", "snap", "wages"}, - forbidden={"snap_reported"}, - optional={"person_is_puf_clone"}, - ) - assert diff.missing_required == ["wages"] - assert diff.forbidden_present == ["snap_reported"] - assert diff.extra_unknown == ["mystery"] - assert diff.ok is False - - -def test_load_contract_rejects_missing_keys(tmp_path): - bad = _write_json(tmp_path / "bad.json", {"required": ["age"]}) - with pytest.raises(ValueError, match="forbidden"): - 
load_contract(bad) - - -def test_spec_variable_manifest_diff_covers_committed_spec(): - diff = compute_spec_variable_manifest_diff( - contract=load_contract(DEFAULT_CONTRACT_PATH), - spec_path=DEFAULT_SPEC_PATH, - ) - - assert diff.ok - assert diff.required_contract_count == 252 - assert diff.declared_imputation_count == 76 - assert diff.variable_manifest_count == 278 - assert diff.missing_required == [] - assert diff.missing_declared_imputation == [] - assert diff.extra_variables == [] - - -def test_spec_variable_manifest_diff_flags_missing_required_and_imputation(tmp_path): - contract = { - "required": ["age", "snap"], - "forbidden": [], - "ecps_internal_optional": [], - } - spec_path = tmp_path / "spec.yaml" - spec_path.write_text( - """ -meta: {country: us, model_year: 2024} -imputation: - - onto: synthetic_puf - from: puf - vars: [employment_income] -variables: - age: - mp_spec: {method: passthrough} -""", - encoding="utf-8", - ) - - diff = compute_spec_variable_manifest_diff( - contract=contract, - spec_path=spec_path, - ) - - assert diff.ok is False - assert diff.missing_required == ["snap"] - assert diff.missing_declared_imputation == ["employment_income"] - assert diff.extra_variables == [] - - -def test_spec_variable_manifest_diff_counts_quoted_and_commented_imputation_vars( - tmp_path, -): - contract = { - "required": ["age"], - "forbidden": [], - "ecps_internal_optional": [], - } - spec_path = tmp_path / "spec.yaml" - spec_path.write_text( - """ -meta: {country: us, model_year: 2024} -imputation: - - onto: synthetic_puf - from: puf - vars: - - "employment_income" # PUF override - - 'rental_income' - - onto: cps_keep - from: puf - vars: [self_employment_income, "social_security"] # inline form -variables: - age: - mp_spec: {method: passthrough} -""", - encoding="utf-8", - ) - - diff = compute_spec_variable_manifest_diff( - contract=contract, - spec_path=spec_path, - ) - - assert diff.ok is False - assert diff.declared_imputation_count == 4 - assert 
diff.missing_declared_imputation == [ - "employment_income", - "rental_income", - "self_employment_income", - "social_security", - ] - - -def test_committed_contract_parses_with_expected_categories(): - contract = load_contract(DEFAULT_CONTRACT_PATH) - for key in ( - "required", - "ecps_internal_optional", - "forbidden", - "formula_owned_excluded", - ): - assert key in contract, f"contract missing '{key}'" - assert isinstance(contract[key], list) - # Category sizes of the eCPS contract, aligned to the clone-correct baseline - # H5 (postfix_clonecorrect plus target-source gap fixes): required exports - # both the 5 capped retirement account inputs and the 5 *_desired - # retirement inputs, forbids the - # PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES tax-credit outputs, and - # excludes only weeks_worked (the lone pe-us formula var the baseline does - # not persist). Structural/overridable computed fields - # (has_tin/has_itin/in_nyc/fsla_overtime_premium/meets_ssi_disability_criteria) - # are REQUIRED, matching the in-tree _column_contract_gate. - # Sizes sum to the 258-column source-backed baseline: 252 + 5 + 1. - assert len(contract["required"]) == 252 - assert len(contract["ecps_internal_optional"]) == 5 - assert len(contract["forbidden"]) == 22 - assert len(contract["formula_owned_excluded"]) == 1 - # Categories must be disjoint. - req = set(contract["required"]) - opt = set(contract["ecps_internal_optional"]) - forb = set(contract["forbidden"]) - excl = set(contract["formula_owned_excluded"]) - assert req.isdisjoint(opt) - assert req.isdisjoint(forb) - assert opt.isdisjoint(forb) - assert excl.isdisjoint(req) - assert excl.isdisjoint(forb) - # The clone-bookkeeping flags are optional, not required. - assert "person_is_puf_clone" in opt - assert "person_is_puf_clone" not in req - # Structural/overridable computed fields are REQUIRED (in-tree gate parity), - # NOT excluded; only weeks_worked is excluded. 
- for structural in ( - "has_tin", - "has_itin", - "in_nyc", - "fsla_overtime_premium", - "meets_ssi_disability_criteria", - "difficulty_hearing", - ): - assert structural in req - assert excl == {"weeks_worked"} - - -def test_committed_clean_fixture_passes_committed_contract(capsys): - # The CI fixture must be a clean, passing set against the real - # contract so the green CI path proves the gate passes on good data. - fixture = Path(__file__).parent / "fixtures" / "ecps_clean_columns.json" - rc = main(["--columns-json", str(fixture)]) - report = capsys.readouterr().out - assert rc == 0 - assert "spec variable manifest" in report - - -def test_committed_contract_covers_every_baseline_column(): - # Completeness invariant: every column the clean baseline fixture exports - # must be accounted for by some contract category, so a baseline-shaped - # export produces no extra_unknown columns. This pins the contract to the - # real baseline and catches silent under-specification of `required`. - contract = load_contract(DEFAULT_CONTRACT_PATH) - fixture = Path(__file__).parent / "fixtures" / "ecps_clean_columns.json" - present = set(json.loads(fixture.read_text())) - diff = compute_column_diff( - present, - required=set(contract["required"]), - forbidden=set(contract["forbidden"]), - optional=set(contract["ecps_internal_optional"]), - excluded=set(contract["formula_owned_excluded"]), - ) - assert diff.extra_unknown == [] - assert diff.missing_required == [] - assert diff.forbidden_present == [] - - -def test_default_contract_path_is_packaged(): - # The contract ships next to the module so the default path resolves. 
- assert DEFAULT_CONTRACT_PATH.name == "ecps_export_contract.json" - assert DEFAULT_CONTRACT_PATH.exists() - assert callable(cec.main) diff --git a/tests/pipelines/test_check_site_snapshot.py b/tests/pipelines/test_check_site_snapshot.py deleted file mode 100644 index 36fc15a2..00000000 --- a/tests/pipelines/test_check_site_snapshot.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Tests for the US site snapshot consistency checker.""" - -from __future__ import annotations - -import json - -import pytest - -from microplex_us.pipelines.check_site_snapshot import ( - check_us_microplex_site_snapshot, -) -from microplex_us.pipelines.site_snapshot import write_us_microplex_site_snapshot - - -def test_check_us_microplex_site_snapshot_accepts_matching_snapshot(tmp_path) -> None: - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - _write_us_artifact_bundle(artifact_dir) - snapshot_path = tmp_path / "snapshots" / "site_snapshot_us.json" - write_us_microplex_site_snapshot(artifact_dir, snapshot_path) - - snapshot = json.loads(snapshot_path.read_text()) - assert snapshot["sourceArtifact"]["artifactPath"] == "../run-1" - - assert check_us_microplex_site_snapshot(snapshot_path) == snapshot_path - - -def test_check_us_microplex_site_snapshot_rejects_stale_snapshot(tmp_path) -> None: - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - _write_us_artifact_bundle(artifact_dir) - snapshot_path = tmp_path / "site_snapshot_us.json" - write_us_microplex_site_snapshot(artifact_dir, snapshot_path) - - snapshot = json.loads(snapshot_path.read_text()) - snapshot["currentRun"]["candidateMeanAbsRelativeError"] = 9.9 - snapshot_path.write_text(json.dumps(snapshot, indent=2, sort_keys=True)) - - with pytest.raises(SystemExit, match="stale or inconsistent"): - check_us_microplex_site_snapshot(snapshot_path) - - -def test_check_us_microplex_site_snapshot_rejects_stale_data_flow_sidecar( - tmp_path, -) -> None: - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - 
_write_us_artifact_bundle(artifact_dir) - snapshot_path = tmp_path / "site_snapshot_us.json" - write_us_microplex_site_snapshot(artifact_dir, snapshot_path) - - data_flow_path = artifact_dir / "data_flow_snapshot.json" - data_flow = json.loads(data_flow_path.read_text()) - data_flow["runtime"]["scaffoldSource"] = "stale_source" - data_flow_path.write_text(json.dumps(data_flow, indent=2, sort_keys=True)) - - with pytest.raises(SystemExit, match="data-flow snapshot is stale or inconsistent"): - check_us_microplex_site_snapshot(snapshot_path) - - -def _write_us_artifact_bundle(artifact_dir) -> None: - (artifact_dir / "seed_data.parquet").write_text("") - (artifact_dir / "synthetic_data.parquet").write_text("") - (artifact_dir / "calibrated_data.parquet").write_text("") - (artifact_dir / "targets.json").write_text("{}") - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-03-29T00:00:00+00:00", - "config": {"n_synthetic": 2000}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_harness": "policyengine_harness.json", - }, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "state_program_support_proxies": { - "available": ["ssi"], - "missing": ["snap"], - }, - }, - "calibration": { - "n_loaded_targets": 100, - "n_supported_targets": 90, - "converged": False, - "weight_collapse_suspected": False, - "household_weight_diagnostics": { - "effective_sample_size": 40.0, - "tiny_share": 0.01, - }, - "person_weight_diagnostics": { - "effective_sample_size": 80.0, - "tiny_share": 0.02, - }, - }, - "policyengine_harness": { - "candidate_mean_abs_relative_error": 0.9, - "baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - }, - } - ) - ) - (artifact_dir / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.9, - 
"baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - "candidate_composite_parity_loss": 0.8, - "baseline_composite_parity_loss": 1.2, - "target_win_rate": 0.2, - "slice_win_rate": 0.5, - "supported_target_rate": 0.9, - "tag_summaries": { - "state": { - "candidate_mean_abs_relative_error": 0.7, - "baseline_mean_abs_relative_error": 0.8, - "mean_abs_relative_error_delta": -0.1, - "candidate_composite_parity_loss": 0.6, - "baseline_composite_parity_loss": 0.9, - "target_win_rate": 0.3, - "slice_win_rate": 1.0, - "supported_target_rate": 0.85, - } - }, - "parity_scorecard": {"overall": {"candidate_beats_baseline": True}}, - "attribute_cell_summaries": { - "geo=state|feature=snap": {"candidate_target_count": 10} - }, - } - } - ) - ) diff --git a/tests/pipelines/test_compact_policyengine_dataset.py b/tests/pipelines/test_compact_policyengine_dataset.py deleted file mode 100644 index 4dad745d..00000000 --- a/tests/pipelines/test_compact_policyengine_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Tests for compact PolicyEngine H5 export.""" - -from __future__ import annotations - -import json -from pathlib import Path - -import h5py -import numpy as np -import pytest - -from microplex_us.pipelines.compact_policyengine_dataset import ( - compact_policyengine_dataset_by_household_weight, - main, -) -from microplex_us.policyengine.us import write_policyengine_us_time_period_dataset - - -def _write_dataset(path: Path) -> Path: - arrays = { - "household_id": {"2024": np.asarray([10, 20, 30, 40])}, - "household_weight": {"2024": np.asarray([1.0, 100.0, 5.0, 50.0])}, - "person_id": {"2024": np.asarray([1, 2, 3, 4, 5])}, - "person_household_id": {"2024": np.asarray([10, 20, 20, 30, 40])}, - "person_tax_unit_id": {"2024": np.asarray([100, 200, 200, 100, 400])}, - "tax_unit_id": {"2024": np.asarray([100, 200, 400])}, - "employment_income": {"2024": np.asarray([10.0, 20.0, 30.0, 40.0, 50.0])}, - "tax_unit_dependents": {"2024": np.asarray([0, 1, 1])}, - 
"household_net_worth": {"2024": np.asarray([1.0, 2.0, 3.0, 4.0])}, - } - return write_policyengine_us_time_period_dataset(arrays, path) - - -def test_compact_policyengine_dataset_keeps_linked_entities_and_rescales(tmp_path): - source = _write_dataset(tmp_path / "source.h5") - output = tmp_path / "compact.h5" - - summary = compact_policyengine_dataset_by_household_weight( - input_dataset_path=source, - output_dataset_path=output, - households=2, - period=2024, - ) - - with h5py.File(output, "r") as handle: - assert handle["household_id"]["2024"][:].tolist() == [20, 40] - assert handle["household_weight"]["2024"][:].tolist() == pytest.approx( - [104.0, 52.0] - ) - assert handle["person_id"]["2024"][:].tolist() == [2, 3, 5] - assert handle["person_household_id"]["2024"][:].tolist() == [20, 20, 40] - assert handle["tax_unit_id"]["2024"][:].tolist() == [200, 400] - assert handle["employment_income"]["2024"][:].tolist() == [ - 20.0, - 30.0, - 50.0, - ] - assert handle["tax_unit_dependents"]["2024"][:].tolist() == [1, 1] - assert handle["household_net_worth"]["2024"][:].tolist() == [2.0, 4.0] - - assert summary["source_households"] == 4 - assert summary["selected_households"] == 2 - assert summary["source_weight_sum"] == pytest.approx(156.0) - assert summary["selected_weight_sum_before_rescale"] == pytest.approx(150.0) - assert summary["output_weight_sum"] == pytest.approx(156.0) - assert summary["entity_counts"] == { - "household": 2, - "person": 3, - "tax_unit": 2, - } - - -def test_compact_policyengine_dataset_can_select_from_external_weights(tmp_path): - source = _write_dataset(tmp_path / "source.h5") - selection_weights = np.asarray([100.0, 1.0, 50.0, 2.0]) - weights_path = tmp_path / "weights.npy" - np.save(weights_path, selection_weights) - - summary = compact_policyengine_dataset_by_household_weight( - input_dataset_path=source, - output_dataset_path=tmp_path / "compact.h5", - households=2, - period=2024, - weights_path=weights_path, - rescale_to_total=False, - ) 
- - with h5py.File(tmp_path / "compact.h5", "r") as handle: - assert handle["household_id"]["2024"][:].tolist() == [10, 30] - assert handle["household_weight"]["2024"][:].tolist() == [1.0, 5.0] - - assert summary["output_weight_sum"] == pytest.approx(6.0) - assert summary["target_total_weight"] is None - assert summary["rescale_to_total"] is False - - -def test_compact_policyengine_dataset_cli_writes_summary(tmp_path): - source = _write_dataset(tmp_path / "source.h5") - output = tmp_path / "compact.h5" - summary_path = tmp_path / "summary.json" - - exit_code = main( - [ - "--input-dataset", - str(source), - "--output-dataset", - str(output), - "--households", - "2", - "--summary-json", - str(summary_path), - ] - ) - - assert exit_code == 0 - assert output.exists() - payload = json.loads(summary_path.read_text()) - assert payload["selected_households"] == 2 - assert payload["output_dataset"] == str(output.resolve()) - - -def test_compact_policyengine_dataset_rejects_mismatched_weights(tmp_path): - source = _write_dataset(tmp_path / "source.h5") - weights_path = tmp_path / "weights.npy" - np.save(weights_path, np.asarray([1.0, 2.0])) - - with pytest.raises(ValueError, match="selection weights length"): - compact_policyengine_dataset_by_household_weight( - input_dataset_path=source, - output_dataset_path=tmp_path / "compact.h5", - households=2, - weights_path=weights_path, - ) diff --git a/tests/pipelines/test_constraint_metadata_lookup.py b/tests/pipelines/test_constraint_metadata_lookup.py deleted file mode 100644 index 11a4bcae..00000000 --- a/tests/pipelines/test_constraint_metadata_lookup.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Constraint-metadata precompute + lookup path. - -The calibration stage previously scanned each constraint's dense -1.5M-length coefficient array three separate times during ledger + -deferred-stage-selection. 
That accounted for ~30 GB of transient -``np.abs(...)`` allocations at v7/v8 scale on top of the ~48 GB -baseline — a contributor to the 172 GB-compressed v7 / 197 GB v8 -jetsam kills. - -Fix: precompute ``active_households`` and ``coefficient_mass`` once -per constraint, then thread a ``metadata_lookup`` dict through -``_build_policyengine_constraint_records`` and -``_constraint_active_household_count`` so the dense arrays aren't -rescanned. These tests pin that contract. -""" - -from __future__ import annotations - -import numpy as np -import pytest -from microplex.calibration import LinearConstraint - -from microplex_us.pipelines.us import ( - _build_policyengine_constraint_records, - _constraint_active_household_count, - _precompute_constraint_metadata, - _strip_constraint_coefficients, -) - - -def _toy_constraints(n_hh: int = 1000) -> tuple[LinearConstraint, ...]: - """Three constraints over ``n_hh`` households with known active counts. - - - ``all_nonzero``: every household has nonzero coefficient (count n_hh) - - ``half``: half the households have nonzero coefficient (count n_hh/2) - - ``rare``: only 10 households have nonzero coefficient - """ - rng = np.random.default_rng(0) - all_nonzero = np.ones(n_hh, dtype=float) - half = np.where(rng.random(n_hh) > 0.5, 1.0, 0.0) - rare = np.zeros(n_hh, dtype=float) - rare[:10] = 1.0 - return ( - LinearConstraint(name="all_nonzero", coefficients=all_nonzero, target=100.0), - LinearConstraint(name="half", coefficients=half, target=200.0), - LinearConstraint(name="rare", coefficients=rare, target=10.0), - ) - - -class TestPrecomputeMetadata: - def test_precomputed_scalars_match_direct_computation(self) -> None: - constraints = _toy_constraints(n_hh=1000) - metadata = _precompute_constraint_metadata(constraints) - for c in constraints: - expected_count = int(np.count_nonzero(np.abs(c.coefficients) > 1e-12)) - expected_mass = float(np.abs(c.coefficients).sum()) - assert metadata[c.name]["active_households"] == 
expected_count - assert metadata[c.name]["coefficient_mass"] == pytest.approx( - expected_mass, rel=1e-12 - ) - - def test_empty_constraints_produce_empty_metadata(self) -> None: - assert _precompute_constraint_metadata(()) == {} - - -class TestMetadataLookupBypassesCoefficients: - def test_active_household_count_uses_lookup(self) -> None: - constraints = _toy_constraints(n_hh=1000) - metadata = _precompute_constraint_metadata(constraints) - stripped = _strip_constraint_coefficients(constraints) - # Sanity: stripped tuple has no coefficient data to scan. - for c in stripped: - assert c.coefficients.size == 0 - # Without metadata_lookup, active-count on a stripped constraint is 0. - assert _constraint_active_household_count(stripped[0]) == 0 - # With metadata_lookup, the precomputed count is returned. - assert ( - _constraint_active_household_count( - stripped[0], metadata_lookup=metadata - ) - == 1000 - ) - - def test_build_records_uses_lookup_when_coefficients_stripped(self) -> None: - """Integration: records built from stripped constraints + lookup - match records built from the full (unstripped) constraints.""" - - class FakeTarget: - def __init__(self, name: str, geo_level: str = "national"): - self.name = name - self.aggregation = "SUM" - self.metadata = {"geo_level": geo_level} - self.required_features = () - - constraints = _toy_constraints(n_hh=1000) - targets = [ - FakeTarget(name="all_nonzero"), - FakeTarget(name="half"), - FakeTarget(name="rare"), - ] - expected = _build_policyengine_constraint_records(targets, constraints) - - metadata = _precompute_constraint_metadata(constraints) - stripped = _strip_constraint_coefficients(constraints) - actual = _build_policyengine_constraint_records( - targets, stripped, metadata_lookup=metadata - ) - - for exp, act in zip(expected, actual, strict=True): - assert exp["active_households"] == act["active_households"] - assert exp["coefficient_mass"] == pytest.approx( - act["coefficient_mass"], rel=1e-12 - ) - - -class 
TestBackwardCompatibility: - def test_records_without_lookup_still_work(self) -> None: - """Legacy callers that don't pass metadata_lookup should still get - correct results by scanning the coefficient arrays.""" - - class FakeTarget: - def __init__(self, name: str): - self.name = name - self.aggregation = "SUM" - self.metadata = {"geo_level": "national"} - self.required_features = () - - constraints = _toy_constraints(n_hh=500) - targets = [FakeTarget(name=c.name) for c in constraints] - records = _build_policyengine_constraint_records(targets, constraints) - assert records[0]["active_households"] == 500 - assert records[1]["active_households"] > 200 # ~half - assert records[1]["active_households"] < 300 - assert records[2]["active_households"] == 10 diff --git a/tests/pipelines/test_dashboard.py b/tests/pipelines/test_dashboard.py deleted file mode 100644 index 5218101b..00000000 --- a/tests/pipelines/test_dashboard.py +++ /dev/null @@ -1,620 +0,0 @@ -import json - -import numpy as np -import pytest - -from microplex_us.pipelines.dashboard import build_dashboard_payload - - -def test_dashboard_payload_marks_missing_pe_l0_comparators(tmp_path): - artifacts = tmp_path / "artifacts" - run_dir = artifacts / "latest" - run_dir.mkdir(parents=True) - (run_dir / "scores.json").write_text( - json.dumps( - [ - { - "metric": "pe_native_broad_loss", - "period": 2024, - "summary": { - "baseline_enhanced_cps_native_loss": 0.1664, - "candidate_beats_baseline": True, - "candidate_enhanced_cps_native_loss": 0.0252, - "enhanced_cps_native_loss_delta": -0.0725, - "n_targets_kept": 2818, - "n_targets_total": 2830, - }, - "broad_loss": { - "baseline_dataset": "enhanced_cps_2024.h5", - "candidate_dataset": "pe_l0_candidate.h5", - "baseline_weight_sum": 153.8, - "candidate_weight_sum": 153.7, - }, - } - ] - ) - ) - screen_dir = artifacts / "local_screen" - screen_dir.mkdir() - (screen_dir / "split_loss_summary.json").write_text( - json.dumps( - { - "candidate": "cd_age_w8", - 
"broad_objective_on_latest_pe_matrix_rows": 0.0262, - "latest_pe_baseline_broad_loss": 0.1664, - "cd_age_mean_abs_relative_error": 0.0155, - } - ) - ) - (screen_dir / "scores.json").write_text( - json.dumps( - [ - { - "summary": { - "baseline_enhanced_cps_native_loss": 0.1664, - "candidate_beats_baseline": True, - "candidate_enhanced_cps_native_loss": 0.0263, - "enhanced_cps_native_loss_delta": -0.0714, - } - } - ] - ) - ) - local_l0_dir = artifacts / "pe_local_area_l0_compare" - local_l0_dir.mkdir() - (local_l0_dir / "pe_local_area_l0_state_stack_vs_legacy_ecps.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss_target_delta", - "from_dataset": "legacy-pe-ecps", - "to_dataset": "pe-local-area-l0-state-stack", - "state_score_count": 51, - "state_weight_sum": 121.0, - "summary": { - "n_targets": 2814, - "from_loss": 0.1747, - "to_loss": 3.0, - "loss_delta": 2.8253, - }, - } - ) - ) - microplex_l0_dir = artifacts / "microplex_actual_l0" - microplex_l0_dir.mkdir() - (microplex_l0_dir / "unified_diagnostics.csv").write_text( - "\n".join( - [ - "target,true_value,estimate,rel_error,abs_rel_error,achievable", - "a,100,90,-0.10,0.10,True", - "b,100,100,0.00,0.00,True", - ] - ) - ) - (microplex_l0_dir / "unified_run_config.json").write_text( - json.dumps({"n_clones": 10, "epochs": 300}) - ) - np.save(microplex_l0_dir / "calibration_weights.npy", np.array([1.0, 0.0, 200.0])) - target_diagnostics = artifacts / "pe_native_target_diagnostics_current.json" - target_diagnostics.write_text( - json.dumps( - { - "dataset_labels": {"from": "PE", "to": "Microplex"}, - "summary": {"n_targets": 0}, - "targets": [], - } - ) - ) - pe_repo = tmp_path / "policyengine-us-data" - for dirname, epochs, mean_error in [ - ("local_net_worth_100", 100, 5.5), - ("local_net_worth_100_e300", 300, 2.5), - ]: - model_dir = ( - pe_repo / "policyengine_us_data" / "storage" / "calibration" / dirname - ) - model_dir.mkdir(parents=True) - (model_dir / 
"unified_run_config.json").write_text( - json.dumps( - { - "dataset": "source_imputed_stratified_extended_cps_2024.h5", - "db_path": "policy_data.db", - "n_clones": 430, - "epochs": epochs, - "n_targets": 2, - "n_records": 3_000_000, - "weight_sum": 153.0, - "weight_nonzero": 1000, - "mean_error_pct": mean_error, - } - ) - ) - (model_dir / "unified_diagnostics.csv").write_text( - "\n".join( - [ - "target,true_value,estimate,rel_error,abs_rel_error,achievable", - "a,100,95,-0.05,0.05,True", - "b,100,80,-0.20,0.20,True", - ] - ) - ) - - payload = build_dashboard_payload( - artifact_root=artifacts, - target_diagnostics_path=target_diagnostics, - policyengine_us_data_repo=pe_repo, - include_tmux=False, - ) - - assertions = payload["run_board"]["assertions"] - assert assertions["microplex_beats_legacy_ecps_latest_pe_broad"] is True - assert assertions["policyengine_small_l0_weight_package_available"] is True - assert assertions["policyengine_big_l0_weight_package_available"] is True - assert assertions["microplex_vs_small_l0_complete"] is False - assert assertions["microplex_vs_big_l0_complete"] is False - assert assertions["microplex_vs_all_three_pe_models_on_both_metrics"] is False - assert assertions["policyengine_materialized_l0_same_harness_available"] is True - assert assertions["apples_to_apples_groups_available"] is True - assert payload["run_board"]["score_runs"][0]["candidate_loss"] == 0.0252 - assert payload["run_board"]["local_target_screens"][0]["label"] == "cd_age_w8" - assert payload["run_board"]["local_target_screens"][0]["status"] == ( - "screen_scored_latest_pe" - ) - assert ( - payload["run_board"]["local_target_screens"][0]["pe_native_broad_loss"] - == 0.0263 - ) - assert ( - payload["run_board"]["materialized_policyengine_l0_scores"][0]["candidate_loss"] - == 3.0 - ) - actual_l0_runs = payload["run_board"]["actual_l0_objective_runs"] - assert actual_l0_runs[0]["model_id"] == "microplex_actual_l0" - assert actual_l0_runs[0]["actual_l0_data_loss"] == 
pytest.approx(100 / (101**2)) - assert actual_l0_runs[0]["weights"]["nonzero"] == 2 - groups = {row["id"]: row for row in payload["run_board"]["apples_to_apples"]} - assert groups["latest_pe_broad"]["rows"][0]["score"] == 0.1664 - assert groups["legacy_broad"]["rows"][2]["model_id"] == ( - "policyengine_local_area_l0_state_stack" - ) - models = {row["id"]: row for row in payload["run_board"]["policyengine_l0_models"]} - assert models["policyengine_small_l0"]["epochs"] == 100 - assert ( - models["policyengine_big_l0"]["diagnostics"]["mean_abs_relative_error_pct"] - == 12.5 - ) - assert ( - models["policyengine_big_l0"]["diagnostics"]["actual_l0_objective"] - == "sum(((estimate - target) / (target + 1)) ** 2)" - ) - assert models["policyengine_big_l0"]["diagnostics"][ - "actual_l0_data_loss" - ] == pytest.approx(425 / (101**2)) - - -def test_dashboard_payload_reads_release_smoke_for_record_tiers(tmp_path): - artifacts = tmp_path / "artifacts" - run_dir = artifacts / "mp120k_latest_us_data_refit" - run_dir.mkdir(parents=True) - (run_dir / "scores.json").write_text( - json.dumps( - [ - { - "metric": "pe_native_broad_loss", - "period": 2024, - "summary": { - "baseline_enhanced_cps_native_loss": 0.1664, - "candidate_enhanced_cps_native_loss": 0.0936, - "enhanced_cps_native_loss_delta": -0.0728, - "n_targets_kept": 2818, - "n_targets_total": 2830, - }, - "broad_loss": { - "baseline_dataset": "enhanced_cps_2024.h5", - "candidate_dataset": str(run_dir / "pe_l0_candidate.h5"), - "baseline_weight_sum": 153_768_768, - "candidate_weight_sum": 153_492_896, - }, - } - ] - ) - ) - (run_dir / "runtime_smoke_loader.json").write_text( - json.dumps( - { - "benchmark": "policyengine_us_loader_household_weight_smoke_repeated", - "file_size_ratio": 1.36, - "household_ratio": 2.9, - "median_runtime_ratio": 1.19, - "candidate": { - "file_size_bytes": 150_658_539, - "households": 120_000, - "median_elapsed_seconds": 0.137, - "raw_household_weight_sum": 153_492_896, - }, - "baseline": { - 
"file_size_bytes": 110_717_166, - "households": 41_314, - "median_elapsed_seconds": 0.115, - "raw_household_weight_sum": 153_768_768, - }, - } - ) - ) - - payload = build_dashboard_payload( - artifact_root=artifacts, - policyengine_us_data_repo=None, - include_tmux=False, - ) - - score_run = payload["run_board"]["score_runs"][0] - assert score_run["record_count_tier"] == "mp-120k" - assert score_run["release_smoke"]["file_size_ratio"] == 1.36 - assert score_run["release_smoke"]["median_runtime_ratio"] == 1.19 - assert score_run["release_smoke"]["passes_file_size_ratio_2x"] is True - assert score_run["release_smoke"]["passes_runtime_ratio_1_25x"] is True - assert score_run["candidate_beats_baseline"] is True - current_best = next( - row - for row in payload["run_board"]["comparison_matrix"] - if row["id"] == "microplex_current_best" - ) - assert current_best["record_count_tier"] == "mp-120k" - assert current_best["release_smoke"]["candidate_households"] == 120_000 - assertions = payload["run_board"]["assertions"] - assert assertions["microplex_current_best_has_release_smoke"] is True - assert assertions["microplex_current_best_release_smoke_passes"] is True - readiness = payload["run_board"]["release_readiness"] - assert len(readiness) == 1 - assert readiness[0]["product"] == "mp-120k" - assert readiness[0]["metric_runtime"] == "latest_policyengine_us" - assert readiness[0]["status"] == "incomplete" - assert readiness[0]["best_passing_artifact"] is None - assert readiness[0]["release_blockers"] == ["full_gate_report"] - assert readiness[0]["best_fit_artifact"]["artifact_id"] == ( - "mp120k_latest_us_data_refit" - ) - assert readiness[0]["best_fit_artifact"]["compatibility_status"] == ( - "smoke_only" - ) - assert readiness[0]["best_fit_artifact"]["candidate_households"] == 120_000 - assert readiness[0]["best_fit_release_blockers"] == ["full_gate_report"] - - -def test_dashboard_payload_wires_materialized_pe_l0_score_jsons(tmp_path): - artifacts = tmp_path / 
"artifacts" - artifacts.mkdir() - latest_dir = artifacts / "latest_us_data_microplex" - legacy_dir = artifacts / "legacy_targets_microplex" - latest_dir.mkdir() - legacy_dir.mkdir() - (latest_dir / "scores.json").write_text( - json.dumps( - [ - { - "metric": "pe_native_broad_loss", - "summary": { - "baseline_enhanced_cps_native_loss": 0.16, - "candidate_beats_baseline": True, - "candidate_enhanced_cps_native_loss": 0.03, - "n_targets_kept": 2818, - }, - "broad_loss": { - "candidate_dataset": "microplex_latest.h5", - "baseline_dataset": "enhanced_cps_2024.h5", - }, - } - ] - ) - ) - (legacy_dir / "scores.json").write_text( - json.dumps( - [ - { - "metric": "pe_native_broad_loss", - "summary": { - "baseline_enhanced_cps_native_loss": 0.17, - "candidate_beats_baseline": True, - "candidate_enhanced_cps_native_loss": 0.06, - "n_targets_kept": 2814, - }, - "broad_loss": { - "candidate_dataset": "microplex_legacy.h5", - "baseline_dataset": "enhanced_cps_2024.h5", - }, - } - ] - ) - ) - score_dir = artifacts / "pe_l0_clone_apples_to_apples" - score_dir.mkdir() - for metric, targets, small_loss, big_loss in [ - ("legacy_targets", 2814, 0.15, 0.12), - ("new_targets", 2818, 0.09, 0.08), - ]: - for label, loss in [ - ("pe_small_l0", small_loss), - ("pe_big_l0", big_loss), - ]: - (score_dir / f"{metric}_{label}_score.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss", - "candidate_dataset": f"/tmp/{label}.h5", - "baseline_dataset": "/tmp/enhanced_cps_2024.h5", - "baseline_enhanced_cps_native_loss": ( - 0.16 if metric == "new_targets" else 0.17 - ), - "candidate_beats_baseline": loss - < (0.16 if metric == "new_targets" else 0.17), - "candidate_enhanced_cps_native_loss": loss, - "enhanced_cps_native_loss_delta": loss - - (0.16 if metric == "new_targets" else 0.17), - "n_targets_kept": targets, - "n_targets_total": targets + 10, - } - ) - ) - - pe_repo = tmp_path / "policyengine-us-data" - for dirname in ["local_net_worth_100", 
"local_net_worth_100_e300"]: - model_dir = ( - pe_repo / "policyengine_us_data" / "storage" / "calibration" / dirname - ) - model_dir.mkdir(parents=True) - (model_dir / "unified_run_config.json").write_text( - json.dumps({"n_targets": 2, "epochs": 100}) - ) - (model_dir / "unified_diagnostics.csv").write_text( - "\n".join( - [ - "target,true_value,estimate,rel_error,abs_rel_error,achievable", - "a,100,95,-0.05,0.05,True", - ] - ) - ) - - payload = build_dashboard_payload( - artifact_root=artifacts, - target_diagnostics_path=artifacts / "missing.json", - policyengine_us_data_repo=pe_repo, - include_tmux=False, - ) - - assertions = payload["run_board"]["assertions"] - assert assertions["microplex_vs_small_l0_complete"] is True - assert assertions["microplex_vs_big_l0_complete"] is True - assert assertions["microplex_vs_all_three_pe_models_on_both_metrics"] is True - groups = {row["id"]: row for row in payload["run_board"]["apples_to_apples"]} - latest_rows = {row["model_id"]: row for row in groups["latest_pe_broad"]["rows"]} - legacy_rows = {row["model_id"]: row for row in groups["legacy_broad"]["rows"]} - assert latest_rows["policyengine_small_l0"]["score"] == 0.09 - assert latest_rows["policyengine_big_l0"]["score"] == 0.08 - assert legacy_rows["policyengine_small_l0"]["score"] == 0.15 - assert legacy_rows["policyengine_big_l0"]["score"] == 0.12 - - -def test_dashboard_payload_reads_run_contract_summaries(tmp_path): - artifacts = tmp_path / "artifacts" - run_dir = artifacts / "contracted_run" - run_dir.mkdir(parents=True) - (run_dir / "run_manifest.json").write_text( - json.dumps( - { - "run_id": "contracted-run", - "attempt_id": "attempt-1", - } - ) - ) - (run_dir / "run_summary.json").write_text( - json.dumps( - { - "run_id": "contracted-run", - "attempt_id": "attempt-1", - "status": "running", - "active": {"stage_id": "policyengine_materialization"}, - "started_at": "2026-05-28T00:00:00+00:00", - "updated_at": "2026-05-28T00:00:01+00:00", - "completed_stages": 
["preflight", "target_build", "calibration"], - "failure": {"stage_id": "donor_integration"}, - "restart": { - "stage_id": "policyengine_materialization", - "checkpoint_ref": "checkpoint:post_microsim", - }, - } - ) - ) - - payload = build_dashboard_payload( - artifact_root=artifacts, - policyengine_us_data_repo=None, - include_tmux=False, - ) - - contracts = payload["run_board"]["run_contracts"] - assert len(contracts) == 1 - assert contracts[0]["status_source"] == "contract" - assert contracts[0]["run_id"] == "contracted-run" - assert contracts[0]["status"] == "running" - assert contracts[0]["active"]["stage_id"] == "06_policyengine_entities" - assert contracts[0]["active"]["legacy_stage_id"] == "policyengine_materialization" - assert contracts[0]["failure"]["stage_id"] == "05_donor_integration_synthesis" - assert contracts[0]["restart"]["stage_id"] == "06_policyengine_entities" - assert contracts[0]["completed_stages"] == [ - "01_run_profile", - "07_calibration", - ] - assert contracts[0]["legacy_completed_stages"] == [ - "preflight", - "target_build", - "calibration", - ] - - -def test_dashboard_payload_reads_mp300k_artifact_gate_reports(tmp_path): - artifacts = tmp_path / "artifacts" - gate_dir = artifacts / "mp120k_release" - gate_dir.mkdir(parents=True) - (gate_dir / "mp300k_artifact_gates.json").write_text( - json.dumps( - { - "artifact_id": "mp120k_release", - "product": "mp-120k", - "period": 2024, - "summary": { - "status": "passed", - "passing_required_gate_count": 6, - "failed_required_gate_count": 0, - "unmeasured_required_gate_count": 0, - "failed_required_gates": [], - "unmeasured_required_gates": [], - }, - "candidate_dataset": { - "path": "/tmp/pe_l0_candidate.h5", - "size_bytes": 150_658_539, - }, - "gates": { - "compatibility": { - "status": "pass", - "metrics": { - "household_count": 120_000, - "person_count": 261_177, - }, - }, - "artifact_size": { - "status": "pass", - "metrics": {"artifact_size_ratio": 1.36}, - }, - "runtime": { - "status": 
"pass", - "metrics": {"runtime_ratio": 1.19}, - }, - "ecps_comparison": { - "status": "pass", - "metrics": { - "candidate_enhanced_cps_native_loss": 0.0936, - "baseline_enhanced_cps_native_loss": 0.1664, - "enhanced_cps_native_loss_delta": -0.0728, - }, - }, - }, - } - ) - ) - (gate_dir / "scores.json").write_text( - json.dumps( - [ - { - "summary": { - "baseline_enhanced_cps_native_loss": 0.1664, - "candidate_enhanced_cps_native_loss": 0.0936, - "n_targets_kept": 2818, - }, - "broad_loss": { - "candidate_dataset": str(gate_dir / "pe_l0_candidate.h5"), - "baseline_dataset": "enhanced_cps_2024.h5", - }, - } - ] - ) - ) - (gate_dir / "runtime_smoke_loader.json").write_text( - json.dumps( - { - "file_size_ratio": 1.36, - "median_runtime_ratio": 1.19, - "candidate": { - "file_size_bytes": 150_658_539, - "households": 120_000, - "median_elapsed_seconds": 0.137, - }, - "baseline": { - "file_size_bytes": 110_717_166, - "households": 41_314, - "median_elapsed_seconds": 0.115, - }, - } - ) - ) - blocked_dir = artifacts / "mp120k_better_fit_blocked" - blocked_dir.mkdir(parents=True) - (blocked_dir / "mp300k_artifact_gates.json").write_text( - json.dumps( - { - "artifact_id": "mp120k_better_fit_blocked", - "product": "mp-120k", - "period": 2024, - "summary": { - "status": "failed", - "passing_required_gate_count": 5, - "failed_required_gate_count": 1, - "unmeasured_required_gate_count": 0, - "failed_required_gates": ["runtime"], - "unmeasured_required_gates": [], - }, - "candidate_dataset": { - "path": "/tmp/better_fit_candidate.h5", - "size_bytes": 150_658_539, - }, - "gates": { - "compatibility": { - "status": "pass", - "metrics": { - "household_count": 120_000, - "person_count": 261_177, - }, - }, - "artifact_size": { - "status": "pass", - "metrics": {"artifact_size_ratio": 1.36}, - }, - "runtime": { - "status": "fail", - "metrics": {"runtime_ratio": 1.31}, - }, - "ecps_comparison": { - "status": "pass", - "metrics": { - "candidate_enhanced_cps_native_loss": 0.0836, - 
"baseline_enhanced_cps_native_loss": 0.1664, - "enhanced_cps_native_loss_delta": -0.0828, - "n_targets_kept": 2818, - }, - }, - }, - } - ) - ) - - payload = build_dashboard_payload( - artifact_root=artifacts, - policyengine_us_data_repo=None, - include_tmux=False, - ) - - reports = payload["run_board"]["mp300k_artifact_gate_reports"] - assert len(reports) == 2 - passed_report = next(row for row in reports if row["status"] == "passed") - assert passed_report["product"] == "mp-120k" - assert passed_report["candidate_households"] == 120_000 - assert passed_report["artifact_size_ratio"] == 1.36 - assert passed_report["runtime_ratio"] == 1.19 - assert passed_report["candidate_loss"] == 0.0936 - - readiness = payload["run_board"]["release_readiness"] - assert len(readiness) == 1 - assert readiness[0]["product"] == "mp-120k" - assert readiness[0]["metric_runtime"] == "latest_policyengine_us" - assert readiness[0]["status"] == "release_ready" - assert readiness[0]["passed_artifact_count"] == 1 - assert readiness[0]["failed_artifact_count"] == 1 - assert readiness[0]["best_passing_artifact"]["artifact_id"] == "mp120k_release" - assert readiness[0]["best_passing_artifact"]["artifact_path"].endswith( - "mp300k_artifact_gates.json" - ) - assert ( - readiness[0]["best_fit_artifact"]["artifact_id"] == "mp120k_better_fit_blocked" - ) - assert readiness[0]["best_fit_is_release_ready"] is False - assert readiness[0]["best_fit_release_blockers"] == ["runtime"] - assert readiness[0]["fit_loss_gap_to_best_passing"] == pytest.approx(0.01) diff --git a/tests/pipelines/test_data_flow_snapshot.py b/tests/pipelines/test_data_flow_snapshot.py deleted file mode 100644 index 79f72751..00000000 --- a/tests/pipelines/test_data_flow_snapshot.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Tests for the canonical US data-flow snapshot.""" - -import json - -from microplex_us.pipelines.data_flow_snapshot import ( - build_us_microplex_data_flow_snapshot, - write_us_microplex_data_flow_snapshot, -) - - -def 
test_build_us_microplex_data_flow_snapshot_reads_manifest_runtime_mix(tmp_path): - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - (artifact_dir / "policyengine_us.h5").write_text("dataset") - (artifact_dir / "policyengine_harness.json").write_text("{}") - evidence_path = ( - artifact_dir - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ) - evidence_path.parent.mkdir(parents=True) - evidence_path.write_text( - json.dumps( - { - "schemaVersion": 1, - "evidence": [ - { - "key": "policyengine_harness", - "path": "policyengine_harness.json", - "exists": True, - } - ], - } - ) - ) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-04-05T00:00:00+00:00", - "config": { - "n_synthetic": 5000, - "policyengine_direct_override_variables": [], - }, - "rows": { - "seed": 3000, - "synthetic": 5000, - "calibrated": 5000, - }, - "synthesis": { - "backend": "synthesizer", - "source_names": ["cps_asec_2023", "irs_soi_puf"], - "scaffold_source": "cps_asec_2023", - "condition_vars": ["age", "state_fips"], - "target_vars": ["income", "employment_income"], - "donor_integrated_variables": [ - "employment_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - ], - "donor_authoritative_override_variables": ["employment_income"], - "state_program_support_proxies": { - "available": ["ssi"], - "missing": ["snap"], - }, - }, - "calibration": { - "backend": "ipf", - "n_loaded_targets": 100, - "n_supported_targets": 90, - "converged": True, - }, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_harness": "policyengine_harness.json", - "validation_evidence": ( - "stage_artifacts/09_validation_benchmarking/" - "evidence_manifest.json" - ), - }, - "policyengine_harness": { - "mean_abs_relative_error_delta": -0.2, - "target_win_rate": 0.4, - }, - } - ) - ) - - snapshot = build_us_microplex_data_flow_snapshot(artifact_dir) - - assert snapshot["schemaVersion"] == 1 - 
assert snapshot["coverageMode"] == "artifact_frozen" - assert snapshot["runtime"]["scaffoldSource"] == "cps_asec_2023" - assert snapshot["runtime"]["nSynthetic"] == 5000 - assert snapshot["sharedCoverage"]["sourceNames"] == [ - "cps_asec_2023", - "irs_soi_puf", - ] - assert snapshot["sources"][0]["name"] == "cps_asec_2023" - assert snapshot["sources"][1]["manifestBacked"] is True - assert any( - block["restoreFrame"] == "restore_dividend_components_from_composition" - for block in snapshot["donorBlocks"] - ) - assert any( - highlight["variableName"] == "employment_income" - and highlight["hasDonorTransform"] is True - for highlight in snapshot["semanticHighlights"] - ) - assert [stage["id"] for stage in snapshot["stages"]] == [ - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - "06_policyengine_entities", - "07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", - ] - assert snapshot["stages"][8]["status"] == "ready" - - -def test_build_us_microplex_data_flow_snapshot_resolves_cps_parquet_source_exactly( - tmp_path, -): - artifact_dir = tmp_path / "run-2" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-04-05T00:00:00+00:00", - "config": {"n_synthetic": 1000}, - "rows": {"seed": 1000, "synthetic": 1000, "calibrated": 1000}, - "synthesis": { - "backend": "seed", - "source_names": ["cps_asec_parquet"], - "scaffold_source": "cps_asec_parquet", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": {}, - "artifacts": {}, - } - ) - ) - - snapshot = build_us_microplex_data_flow_snapshot(artifact_dir) - - assert snapshot["sources"][0]["name"] == "cps_asec_parquet" - assert "split household/person parquet files" in snapshot["sources"][0]["notes"][0] - - -def 
test_build_us_microplex_data_flow_snapshot_prefers_saved_sidecar_but_can_refresh( - tmp_path, -): - artifact_dir = tmp_path / "run-3" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-04-05T00:00:00+00:00", - "config": {"n_synthetic": 1000}, - "rows": {"seed": 1000, "synthetic": 1000, "calibrated": 1000}, - "synthesis": { - "backend": "seed", - "source_names": ["cps_asec_parquet"], - "scaffold_source": "cps_asec_parquet", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": {}, - "artifacts": {}, - } - ) - ) - (artifact_dir / "data_flow_snapshot.json").write_text( - json.dumps( - { - "schemaVersion": 1, - "generatedAt": "2000-01-01T00:00:00Z", - "coverageMode": "stale", - "runtime": {"scaffoldSource": "stale_source"}, - } - ) - ) - - saved_snapshot = build_us_microplex_data_flow_snapshot(artifact_dir) - fresh_snapshot = build_us_microplex_data_flow_snapshot( - artifact_dir, - prefer_saved=False, - ) - - assert saved_snapshot["coverageMode"] == "stale" - assert saved_snapshot["runtime"]["scaffoldSource"] == "stale_source" - assert fresh_snapshot["coverageMode"] == "artifact_frozen" - assert fresh_snapshot["runtime"]["scaffoldSource"] == "cps_asec_parquet" - - -def test_write_us_microplex_data_flow_snapshot_ignores_stale_stage_manifest( - tmp_path, -): - artifact_dir = tmp_path / "run-4" - artifact_dir.mkdir() - (artifact_dir / "policyengine_us.h5").write_text("dataset") - (artifact_dir / "stage_manifest.json").write_text( - json.dumps( - { - "schemaVersion": 1, - "contractVersion": "stale", - "generatedAt": "2000-01-01T00:00:00Z", - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": "manifest.json", - "stages": [{"id": "stale_stage"}], - } - ) - ) - manifest = { - "created_at": "2026-04-05T00:00:00+00:00", - "config": {"n_synthetic": 1000}, - "rows": {"seed": 1000, 
"synthetic": 1000, "calibrated": 1000}, - "synthesis": { - "backend": "seed", - "source_names": ["cps_asec_parquet"], - "scaffold_source": "cps_asec_parquet", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": {}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "stage_manifest": "stage_manifest.json", - "data_flow_snapshot": "data_flow_snapshot.json", - }, - } - - write_us_microplex_data_flow_snapshot( - artifact_dir, - artifact_dir / "data_flow_snapshot.json", - manifest_payload=manifest, - assume_existing_stage_artifact_keys=( - "stage_manifest", - "artifact_inventory", - "conditional_readiness", - ), - ) - - snapshot = json.loads((artifact_dir / "data_flow_snapshot.json").read_text()) - assert snapshot["stages"][0]["id"] == "01_run_profile" - assert snapshot["stages"][7]["status"] == "ready" diff --git a/tests/pipelines/test_donor_imputer_negative_preservation.py b/tests/pipelines/test_donor_imputer_negative_preservation.py deleted file mode 100644 index f1c40f21..00000000 --- a/tests/pipelines/test_donor_imputer_negative_preservation.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Donor imputer must preserve negative values in zero-inflated-sign-mixed columns. - -v7 bug (`us.py:235`, pre-fix): `ColumnwiseQRFDonorImputer` applies -`y_values > 0` as its nonzero filter. For columns that can be negative -(short-term capital gains, partnership/S-corp income, farm income, -rental income), this drops all negative training rows — the QRF only -sees positives and therefore produces zero-or-positive predictions. -The entire negative tail disappears from the synthetic frame. - -v9 fix: swap the ad-hoc gate for `microimpute.models.ZeroInflatedImputer`, -which auto-detects the three-sign regime and routes negative-gated -records to a negative-only QRF. 
- -These tests pin the post-fix contract by fitting on a column that -genuinely spans neg/0/pos and asserting negatives survive to the -synthetic output. -""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest - -pytest.importorskip("quantile_forest") -pytest.importorskip("microimpute") - - -def _three_sign_frame(n: int = 800, seed: int = 0) -> pd.DataFrame: - """Training frame with a three-sign target. - - ~40% negative, ~20% zero, ~40% positive. Positive regime has - distinct distribution from negative regime, so the sign is - predictable from the conditioning variables. - """ - rng = np.random.default_rng(seed) - age = rng.integers(18, 80, size=n).astype(float) - is_female = rng.integers(0, 2, size=n).astype(float) - - # Regime assignment driven by (age, is_female). - logit_pos = -0.5 + 0.05 * (age - 50) # older → more likely positive - logit_neg = 0.5 - 0.05 * (age - 50) # younger → more likely negative - logit_zero = 1.0 - 0.02 * age - - logits = np.stack([logit_neg, logit_zero, logit_pos], axis=1) - logits -= logits.max(axis=1, keepdims=True) - probs = np.exp(logits) - probs /= probs.sum(axis=1, keepdims=True) - - u = rng.random(n) - cum = np.cumsum(probs, axis=1) - regime_idx = (cum >= u[:, None]).argmax(axis=1) - - y = np.zeros(n) - pos_mask = regime_idx == 2 - neg_mask = regime_idx == 0 - y[pos_mask] = 100 + rng.exponential(200, size=pos_mask.sum()) - y[neg_mask] = -(100 + rng.exponential(200, size=neg_mask.sum())) - - return pd.DataFrame( - { - "age": age, - "is_female": is_female, - "short_term_capital_gains": y, - } - ) - - -class TestDonorImputerPreservesNegatives: - """The donor imputer must emit negatives for three-sign training columns.""" - - def test_fit_generate_preserves_negative_predictions(self) -> None: - """The current v7 imputer (`y > 0` gate) should NOT pass this. - The v9 imputer (ZeroInflatedImputer-based) should. 
- """ - from microplex_us.pipelines.us import ColumnwiseQRFDonorImputer - - train = _three_sign_frame(n=800, seed=0) - # Preconditions on the fixture: genuinely three-sign. - y = train["short_term_capital_gains"].to_numpy() - assert (y > 0).sum() > 50, "fixture should have meaningful positive mass" - assert (y < 0).sum() > 50, "fixture should have meaningful negative mass" - assert (y == 0).sum() > 50, "fixture should have meaningful zero mass" - - imputer = ColumnwiseQRFDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["short_term_capital_gains"], - n_estimators=30, - zero_inflated_vars={"short_term_capital_gains"}, - zero_threshold=0.05, - ) - imputer.fit(train) - - rng = np.random.default_rng(42) - n_gen = 2000 - conditions = pd.DataFrame( - { - "age": rng.integers(18, 80, size=n_gen).astype(float), - "is_female": rng.integers(0, 2, size=n_gen).astype(float), - } - ) - synthetic = imputer.generate(conditions, seed=42) - synth_y = synthetic["short_term_capital_gains"].to_numpy() - - # The core contract: the synthetic output must contain some - # negative values. Under the v7 `y > 0` bug this would be 0. - n_negative = int((synth_y < 0).sum()) - assert n_negative > 0, ( - f"Donor imputer produced no negative values despite training " - f"data having {(y < 0).sum()} negatives. This is the v7 " - "drop-negatives bug." - ) - # Loose sanity: the negative fraction should be materially - # above zero (not just a single fp-edge-case). - assert n_negative / n_gen > 0.05, ( - f"Negative fraction in synthetic = {n_negative / n_gen:.3f}; " - "expected > 5% given the training distribution has ~40% negatives." 
- ) diff --git a/tests/pipelines/test_ecps_replacement_comparison.py b/tests/pipelines/test_ecps_replacement_comparison.py deleted file mode 100644 index c79b9d9e..00000000 --- a/tests/pipelines/test_ecps_replacement_comparison.py +++ /dev/null @@ -1,1060 +0,0 @@ -"""Tests for sound Microplex-vs-eCPS replacement comparisons.""" - -from __future__ import annotations - -import json -import shutil -import subprocess -import sys -from pathlib import Path - -import h5py -import numpy as np -import pytest - -from microplex_us.pipelines import ecps_replacement_comparison as ecps -from microplex_us.pipelines.mp300k_artifact_gates import ( - write_mp300k_artifact_gate_report, -) -from microplex_us.pipelines.mp_benchmark_manifest import ( - FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, -) -from microplex_us.pipelines.pe_native_loss import build_pe_native_loss_arrays -from microplex_us.policyengine.us import write_policyengine_us_time_period_dataset - -_TARGET_NAMES = [ - "nation/irs/ssi", - "nation/irs/snap", - "nation/irs/employment_income", - "nation/irs/self_employment_income", - "nation/irs/capital_gains", - "nation/irs/taxable_interest_income", - "nation/irs/dividend_income", - "nation/irs/pension_income", - "nation/irs/disability_income", - "nation/irs/household_net_income", - "state/CA/adjusted_gross_income/amount/0_1", - "state/census/age/CA/65", - "nation/ssa/retirement", - "nation/irs/aca_spending/CA", -] - - -@pytest.fixture(autouse=True) -def _pin_policyengine_us_version(monkeypatch): - monkeypatch.setattr( - ecps, - "_installed_policyengine_us_version", - lambda: "1.587.0", - ) - - -def _write_minimal_policyengine_dataset( - path: Path, - *, - weights: tuple[float, float] = (0.5, 0.5), - period: int = 2024, -) -> Path: - arrays = { - "household_id": {str(period): np.asarray([1, 2])}, - "household_weight": {str(period): np.asarray(weights, dtype=np.float32)}, - "person_id": {str(period): np.asarray([1, 2, 3])}, - 
"person_household_id": {str(period): np.asarray([1, 1, 2])}, - "tax_unit_id": {str(period): np.asarray([10, 20])}, - "person_tax_unit_id": {str(period): np.asarray([10, 10, 20])}, - "spm_unit_id": {str(period): np.asarray([100, 200])}, - "person_spm_unit_id": {str(period): np.asarray([100, 100, 200])}, - "family_id": {str(period): np.asarray([1000, 2000])}, - "person_family_id": {str(period): np.asarray([1000, 1000, 2000])}, - "marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - "person_marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - "social_security_retirement": {str(period): np.asarray([1.0, 0.0, 0.0])}, - "social_security_disability": {str(period): np.asarray([0.0, 1.0, 0.0])}, - "employment_income_before_lsr": {str(period): np.asarray([100.0, 0.0, 200.0])}, - } - return write_policyengine_us_time_period_dataset(arrays, path) - - -def _read_weights(path: Path, *, period: int = 2024) -> np.ndarray: - with h5py.File(path, "r") as handle: - return np.asarray(handle["household_weight"][str(period)], dtype=np.float64) - - -def _write_clean_git_repo(path: Path) -> Path: - path.mkdir() - (path / "README.md").write_text("pinned scorer repo\n") - subprocess.run(["git", "init"], cwd=path, check=True, capture_output=True) - subprocess.run(["git", "add", "README.md"], cwd=path, check=True) - subprocess.run( - [ - "git", - "-c", - "user.name=Microplex Tests", - "-c", - "user.email=microplex-tests@example.com", - "commit", - "-m", - "Initial scorer pin", - ], - cwd=path, - check=True, - capture_output=True, - ) - return path - - -def _fake_loss_inputs(input_dataset_path: str | Path, **_kwargs) -> dict[str, object]: - path = Path(input_dataset_path) - if path.name.startswith("candidate"): - matrix = np.tile(np.asarray([[1.0], [0.0]]), (1, len(_TARGET_NAMES))) - else: - matrix = np.tile(np.asarray([[0.9], [0.0]]), (1, len(_TARGET_NAMES))) - return { - "scaled_matrix": matrix, - "scaled_target": np.ones(len(_TARGET_NAMES), 
dtype=np.float64), - "initial_weights": _read_weights(path), - "unscaled_target": np.ones(len(_TARGET_NAMES), dtype=np.float64), - "scaling": np.ones(len(_TARGET_NAMES), dtype=np.float64), - "metadata": { - "target_names": list(_TARGET_NAMES), - "n_targets_kept": len(_TARGET_NAMES), - }, - } - - -def _fake_pe_native_scores(**kwargs) -> dict[str, object]: - candidate_path = Path(kwargs["candidate_dataset_path"]) - baseline_path = Path(kwargs["baseline_dataset_path"]) - candidate_inputs = _fake_loss_inputs(candidate_path) - baseline_inputs = _fake_loss_inputs(baseline_path) - candidate_loss = ecps._objective( - np.asarray(candidate_inputs["scaled_matrix"]), - np.asarray(candidate_inputs["scaled_target"]), - _read_weights(candidate_path), - ) - baseline_loss = ecps._objective( - np.asarray(baseline_inputs["scaled_matrix"]), - np.asarray(baseline_inputs["scaled_target"]), - _read_weights(baseline_path), - ) - return { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_enhanced_cps_native_loss": candidate_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": candidate_loss - baseline_loss, - "candidate_beats_baseline": candidate_loss < baseline_loss, - "n_targets_kept": len(_TARGET_NAMES), - }, - "broad_loss": {}, - "family_breakdown": [ - { - "family": family, - "candidate_loss_contribution": 0.01, - "baseline_loss_contribution": 0.01, - } - for family in ( - "state_agi_distribution", - "state_age_distribution", - "national_ssa", - "national_irs_other", - "state_aca_spending", - ) - ], - } - - -def test_target_value_diagnostics_falls_back_for_zero_scaling(): - loss_inputs = { - "scaled_matrix": np.asarray([[2.0, 3.0], [0.0, 0.0]]), - "scaled_target": np.asarray([1.0, 2.0]), - "unscaled_target": np.asarray([10.0, 20.0]), - "scaling": np.asarray([0.0, 0.5]), - } - - diagnostics = ecps._target_value_diagnostics( - loss_inputs, - np.asarray([1.0, 0.0]), - ) - - assert 
diagnostics["value_scale"].tolist() == ["scaled", "native"] - assert diagnostics["target"].tolist() == [1.0, 20.0] - assert diagnostics["estimate"].tolist() == [2.0, 6.0] - - -def _fake_support_audit(**_kwargs) -> dict[str, object]: - return { - "metric": "enhanced_cps_support_audit", - "comparisons": { - "critical_input_support": [ - { - "variable": "medicare_part_b_premiums", - "candidate_stored": True, - "baseline_stored": False, - "weighted_nonzero_delta": 100.0, - } - ], - "filing_status_weighted_delta": [ - { - "filing_status": "HEAD_OF_HOUSEHOLD", - "weighted_count_delta": 50.0, - } - ], - "hoh_agi_delta": [ - { - "agi_bin": "500k_to_1m", - "weighted_count_delta": 40.0, - } - ], - "ssi_by_age_delta": [ - { - "age_bucket": "65_plus", - "weighted_recipient_delta": 30.0, - } - ], - "medicare_part_b_premiums_by_age_delta": [ - { - "age_bucket": "10_to_19", - "weighted_positive_delta": 20.0, - } - ], - "state_aca_ptc_spending_top_gaps": [ - { - "state": "CA", - "weighted_aca_ptc_delta": -10.0, - } - ], - }, - } - - -def test_protected_family_losses_match_pe_native_labels_with_spaces(): - target_names = [ - "nation/bea/wages and salaries", - "nation/irs/capital gains gross/total/AGI in -inf-inf/taxable/All", - "nation/census/household net income", - ] - candidate_inputs = { - "scaled_matrix": np.asarray([[2.0, 2.0, 2.0]]), - "scaled_target": np.ones(len(target_names), dtype=np.float64), - } - baseline_inputs = { - "scaled_matrix": np.asarray([[1.0, 1.0, 1.0]]), - "scaled_target": np.ones(len(target_names), dtype=np.float64), - } - - rows = ecps._protected_family_losses( - target_names=target_names, - candidate_inputs=candidate_inputs, - baseline_inputs=baseline_inputs, - candidate_weights=np.asarray([1.0]), - baseline_weights=np.asarray([1.0]), - ) - - assert rows["wages"]["n_targets"] == 1 - assert rows["capital_gains"]["n_targets"] == 1 - assert rows["household_net_income"]["n_targets"] == 1 - assert rows["wages"]["candidate_loss"] == pytest.approx(1.0) - assert 
rows["wages"]["baseline_loss"] == pytest.approx(0.0) - assert rows["capital_gains"]["loss_delta"] == pytest.approx(1.0) - - -def test_target_loss_diagnostics_family_breakdown_uses_total_loss_scale(): - target_names = [ - "nation/irs/capital gains gross/total/AGI in 1m-inf/taxable/All", - "state/census/age/CA/65", - ] - candidate_inputs = { - "scaled_matrix": np.asarray([[2.0, 3.0]]), - "scaled_target": np.ones(len(target_names), dtype=np.float64), - } - baseline_inputs = { - "scaled_matrix": np.asarray([[1.0, 1.0]]), - "scaled_target": np.ones(len(target_names), dtype=np.float64), - } - - diagnostics = ecps._target_loss_diagnostics( - target_names=target_names, - candidate_inputs=candidate_inputs, - baseline_inputs=baseline_inputs, - candidate_weights=np.asarray([1.0]), - baseline_weights=np.asarray([1.0]), - holdout_mask=np.asarray([False, True]), - top_k=2, - ) - - assert diagnostics["summary"]["candidate_loss"] == pytest.approx(5.0) - breakdown = {row["family"]: row for row in diagnostics["family_breakdown"]} - assert breakdown["national_irs_other"][ - "candidate_loss_contribution" - ] == pytest.approx(1.0) - assert breakdown["state_age_distribution"][ - "candidate_loss_contribution" - ] == pytest.approx(4.0) - assert sum( - row["candidate_loss_contribution"] for row in diagnostics["family_breakdown"] - ) == pytest.approx(diagnostics["summary"]["candidate_loss"]) - - -def test_robust_loss_terms_match_objective() -> None: - target_names = [ - "nation/irs/example income/total/AGI in 0_1/taxable/All", - "nation/irs/example count/count/AGI in 0_1/taxable/All", - ] - targets = np.asarray([100.0, 10.0]) - loss_arrays = build_pe_native_loss_arrays(target_names, targets) - matrix = np.asarray([[90.0, 20.0], [5.0, 0.0]]) - weights = np.asarray([1.0, 1.0]) - loss_inputs = { - "scaled_matrix": matrix, - "scaled_target": loss_arrays.objective_target, - "unscaled_target": targets, - "loss_denominator": loss_arrays.denominator, - "loss_target_weight": 
loss_arrays.target_weight, - "loss_bucket": loss_arrays.bucket_keys, - "loss_unit": loss_arrays.unit_keys, - "loss_scope": loss_arrays.scope_keys, - "loss_family": loss_arrays.family_keys, - "loss_epsilon": loss_arrays.epsilon, - "metadata": { - **loss_arrays.metadata(), - "target_names": target_names, - }, - } - - assert ecps._loss_terms(loss_inputs, weights).sum() == pytest.approx( - ecps._objective( - matrix, - loss_arrays.objective_target, - weights, - loss_arrays=loss_arrays, - ) - ) - - -def test_comparison_bad_targets_exclude_dataset_derived_source_counts() -> None: - bad_targets = ecps._comparison_bad_targets() - - assert len(bad_targets) == len(set(bad_targets)) - assert set(ecps._ENHANCED_CPS_BAD_TARGETS).issubset(bad_targets) - assert { - "nation/source/household_count", - "nation/source/cps_household_count", - "nation/source/puf_clone_household_count", - }.issubset(bad_targets) - - -def _artifact_manifest(artifact_dir: Path, baseline_dataset: Path) -> None: - (artifact_dir / "source_weight_diagnostics.json").write_text( - json.dumps( - { - "schema_version": 1, - "sources": [ - { - "source_name": "cps_asec", - "source_class": "base", - "household_weight_share": 0.95, - }, - { - "source_name": "irs_soi_puf_support_clone", - "source_class": "puf_support", - "household_weight_share": 0.05, - }, - ], - } - ) - ) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "config": { - "policyengine_baseline_dataset": str(baseline_dataset), - "policyengine_dataset_year": 2024, - }, - "artifacts": { - "policyengine_dataset": "candidate.h5", - "source_weight_diagnostics": "source_weight_diagnostics.json", - }, - } - ) - ) - - -def _benchmark_manifest( - path: Path, - *, - certificate: dict[str, object] | None = None, -) -> None: - if certificate is not None: - baseline_dataset = dict(certificate["baseline_dataset"]) - target_db = dict(certificate["target_db"]) - policyengine_us_data = dict(certificate["policyengine_us_data"]) - policyengine_us = 
dict(certificate["policyengine_us"]) - certificate_type = certificate["certificate_type"] - period = certificate["period"] - target_surface = dict(certificate["target_surface"]) - scoring_config = {"sha256": certificate["scoring_config"]["sha256"]} - baseline_metrics = dict(certificate["baseline_metrics"]) - else: - baseline_dataset = { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - } - target_db = { - "path": "/tmp/policyengine_targets.db", - "sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - } - policyengine_us_data = { - "repo": "PolicyEngine/policyengine-us-data", - "commit": "b" * 40, - } - policyengine_us = {"version": "1.587.0"} - certificate_type = "frozen_production_ecps_baseline" - period = 2024 - target_surface = { - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_count": 150, - "target_names_sha256": "d" * 64, - } - scoring_config = {"sha256": "e" * 64} - baseline_metrics = { - "baseline_enhanced_cps_native_loss": 0.20, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, - } - path.write_text( - json.dumps( - { - "schema_version": 1, - "certificate_type": certificate_type, - "period": period, - "target_profile": target_surface["target_profile"], - "target_scope": target_surface["target_scope"], - "target_surface": target_surface, - "scoring_config": scoring_config, - "baseline_metrics": baseline_metrics, - "baseline_dataset": baseline_dataset, - "policyengine_us_data": policyengine_us_data, - "policyengine_us": policyengine_us, - "target_db": target_db, - } - ) - ) - - -def _arch_coverage_payload() -> dict[str, object]: - return { - "profile_name": "pe_native_broad_source_backed", - "period": 2024, - "target_cell_count": 183, - "covered_cell_count": 183, - "uncovered_cell_count": 0, - "coverage_rate": 1.0, - } - - -def test_sound_ecps_replacement_comparison_satisfies_gate_contract( - monkeypatch, - tmp_path, -): - candidate = 
_write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policyengine_targets.db" - targets_db.write_bytes(b"pinned target database") - scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") - output_dir = tmp_path / "comparison" - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", _fake_pe_native_scores) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - payload = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=output_dir, - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - enforce_production_pins=False, - ) - - summary = payload["summary"] - certificate = payload["frozen_ecps_baseline_certificate"] - certificate["baseline_dataset"]["sha256"] = FROZEN_PRODUCTION_ECPS_BASELINE_SHA256 - certificate["target_db"]["sha256"] = FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256 - payload["summary"]["candidate_unweighted_msre"] = 0.10 - payload["summary"]["baseline_unweighted_msre"] = 0.17 - certificate["baseline_metrics"]["baseline_unweighted_msre"] = 0.17 - assert certificate["baseline_dataset"]["sha256"] - assert certificate["target_db"]["sha256"] - assert certificate["policyengine_us_data"]["commit"] - assert certificate["policyengine_us"]["version"] - assert ( - certificate["baseline_metrics"]["baseline_enhanced_cps_native_loss"] - == (summary["baseline_enhanced_cps_native_loss"]) - ) - assert ( - certificate["baseline_metrics"]["baseline_holdout_loss"] - == (summary["baseline_holdout_loss"]) - ) - assert summary["candidate_household_count"] == 2 - assert summary["baseline_household_count"] == 2 - assert payload["matched_datasets"]["sample_method"] == "uniform" - assert 
summary["symmetric_refit"] is True - assert summary["score_candidate_only"] is False - assert summary["score_source"] == "refit_loss_matrix" - assert summary["exact_rescore_status"] == "skipped" - assert summary["refit_objective_matches_scoring"] is True - assert summary["ecps_refit_recovery_passed"] is True - assert summary["ecps_refit_effective_passed"] is True - assert summary["baseline_sanity"]["mode"] == "msre" - assert summary["baseline_sanity"]["status"] == "passed" - assert ( - summary["candidate_enhanced_cps_native_loss"] - < summary["baseline_enhanced_cps_native_loss"] - ) - assert summary["holdout_targets"] > 0 - assert set(summary["protected_family_losses"]) == { - "ssi", - "snap", - "wages", - "self_employment_income", - "capital_gains", - "interest", - "dividends", - "retirement_income", - "disability", - "household_net_income", - } - assert summary["protected_family_losses"]["wages"]["n_targets"] == 1 - target_diagnostics = payload["target_diagnostics"] - assert target_diagnostics["summary"]["n_targets"] == len(_TARGET_NAMES) - assert target_diagnostics["summary"]["candidate_wins"] + target_diagnostics[ - "summary" - ]["baseline_wins"] + target_diagnostics["summary"]["ties"] == len(_TARGET_NAMES) - assert target_diagnostics["summary"]["train_targets"] > 0 - assert target_diagnostics["summary"]["holdout_targets"] > 0 - assert target_diagnostics["top_regressions"] - assert target_diagnostics["top_improvements"] - assert len(target_diagnostics["targets"]) == len(_TARGET_NAMES) - first_target = target_diagnostics["targets"][0] - assert first_target["value_scale"] == "native" - assert first_target["target_value"] == pytest.approx(1.0) - assert "candidate_estimate" in first_target - assert "baseline_estimate" in first_target - assert "candidate_relative_error" in first_target - assert "baseline_relative_error" in first_target - assert {row["split"] for row in target_diagnostics["targets"]} == { - "train", - "holdout", - } - assert 
target_diagnostics["family_breakdown"] - support_summary = payload["summary"]["support_audit"] - assert support_summary["top_filing_status_gaps"][0]["filing_status"] == ( - "HEAD_OF_HOUSEHOLD" - ) - assert support_summary["top_hoh_agi_gaps"][0]["agi_bin"] == "500k_to_1m" - assert support_summary["top_ssi_by_age_gaps"][0]["age_bucket"] == "65_plus" - assert ( - support_summary["top_medicare_part_b_by_age_gaps"][0]["age_bucket"] - == "10_to_19" - ) - assert support_summary["top_aca_ptc_spending_gaps"][0]["state"] == "CA" - structure = payload["entity_structure"]["candidate_matched"] - assert structure["household_count"] == 2 - assert structure["person_count"] == 3 - assert structure["tax_unit_count"] == 2 - assert structure["tax_unit"]["singleton_unit_count"] == 1 - assert structure["tax_unit"]["singleton_unit_share"] == pytest.approx(0.5) - assert structure["tax_unit"]["duplicate_unit_id_count"] == 0 - assert structure["tax_unit"]["missing_referenced_unit_count"] == 0 - assert structure["tax_unit"]["cross_household_unit_count"] == 0 - assert structure["spm_unit_count"] == 2 - assert structure["family_count"] == 2 - assert structure["marital_unit_count"] == 3 - assert structure["marital_unit"]["singleton_unit_share"] == pytest.approx(1.0) - assert payload["entity_structure"]["baseline_refit"]["household_count"] == 2 - candidate_curve = payload["candidate_refit"]["loss_curve"] - baseline_curve = payload["baseline_refit"]["loss_curve"] - assert candidate_curve[0]["iteration"] == 0 - assert baseline_curve[0]["iteration"] == 0 - assert candidate_curve[0]["full_loss"] == pytest.approx( - payload["candidate_refit"]["initial_full_loss"] - ) - assert candidate_curve[-1]["full_loss"] == pytest.approx( - payload["candidate_refit"]["optimized_full_loss"] - ) - assert baseline_curve[-1]["holdout_loss"] == pytest.approx( - payload["baseline_refit"]["optimized_holdout_loss"] - ) - - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - shutil.copy2(candidate, artifact_dir 
/ "candidate.h5") - _artifact_manifest(artifact_dir, baseline) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _benchmark_manifest(benchmark_manifest, certificate=certificate) - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - runtime_smoke_payload={"runtime_ratio": 1.0}, - arch_coverage_payload=_arch_coverage_payload(), - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - gate_report = json.loads(report_path.read_text()) - - assert gate_report["summary"]["status"] == "failed" - assert gate_report["summary"]["failed_required_gates"] == ["column_contract"] - assert gate_report["gates"]["ecps_comparison"]["status"] == "pass" - - -def test_sound_ecps_replacement_comparison_rejects_noncanonical_release_pins( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policyengine_targets.db" - targets_db.write_bytes(b"not the production target database") - scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - with pytest.raises(ecps.ComparisonGateError, match="release-pinned production"): - ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - ) - - -def test_sound_ecps_replacement_comparison_skips_exact_rescore_by_default( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - 
monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - def fail_exact_rescore(**_kwargs): - raise AssertionError("exact PE-native rescore should be opt-in") - - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", fail_exact_rescore) - - payload = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - enforce_production_pins=False, - ) - - assert payload["summary"]["score_source"] == "refit_loss_matrix" - assert payload["summary"]["exact_rescore_requested"] is False - assert payload["summary"]["exact_rescore_status"] == "skipped" - assert payload["score"]["score_source"] == "refit_loss_matrix" - - -def test_sound_ecps_replacement_comparison_enforces_benchmark_manifest( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policyengine_targets.db" - targets_db.write_bytes(b"pinned target database") - scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - bootstrap = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "bootstrap", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - enforce_production_pins=False, - ) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _benchmark_manifest( - benchmark_manifest, - certificate=bootstrap["frozen_ecps_baseline_certificate"], - ) - - payload = ecps.build_sound_ecps_replacement_comparison( - 
candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - benchmark_manifest_path=benchmark_manifest, - enforce_production_pins=False, - ) - - assert payload["benchmark_manifest"]["certificate_match"]["status"] == "passed" - assert ( - payload["benchmark_manifest"]["certificate_match"]["checked_evidence"][ - "target_surface.target_names_sha256" - ] - == bootstrap["frozen_ecps_baseline_certificate"]["target_surface"][ - "target_names_sha256" - ] - ) - - -def test_sound_ecps_replacement_comparison_rejects_benchmark_manifest_mismatch( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policyengine_targets.db" - targets_db.write_bytes(b"pinned target database") - scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - bootstrap = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "bootstrap", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - enforce_production_pins=False, - ) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _benchmark_manifest( - benchmark_manifest, - certificate=bootstrap["frozen_ecps_baseline_certificate"], - ) - manifest = json.loads(benchmark_manifest.read_text()) - manifest["target_surface"]["target_names_sha256"] = "f" * 64 - benchmark_manifest.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - - with pytest.raises(ecps.ComparisonGateError) as excinfo: - 
ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - benchmark_manifest_path=benchmark_manifest, - enforce_production_pins=False, - ) - - assert "target_surface.target_names_sha256" in str(excinfo.value) - - -def test_sound_ecps_replacement_comparison_rejects_benchmark_metric_mismatch( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policyengine_targets.db" - targets_db.write_bytes(b"pinned target database") - scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - bootstrap = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "bootstrap", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - policyengine_us_data_repo=scorer_repo, - enforce_production_pins=False, - ) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _benchmark_manifest( - benchmark_manifest, - certificate=bootstrap["frozen_ecps_baseline_certificate"], - ) - manifest = json.loads(benchmark_manifest.read_text()) - manifest["baseline_metrics"]["baseline_holdout_loss"] = 999.0 - benchmark_manifest.write_text(json.dumps(manifest, indent=2, sort_keys=True)) - - with pytest.raises(ecps.ComparisonGateError) as excinfo: - ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - policyengine_targets_db_path=targets_db, - 
policyengine_us_data_repo=scorer_repo, - benchmark_manifest_path=benchmark_manifest, - enforce_production_pins=False, - ) - - assert "baseline_metrics.baseline_holdout_loss" in str(excinfo.value) - - -def test_sound_ecps_replacement_comparison_writes_target_diagnostics_sidecar( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - output_dir = tmp_path / "comparison" - output_path = output_dir / "comparison.json" - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", _fake_pe_native_scores) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - written = ecps.write_sound_ecps_replacement_comparison( - output_path, - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=output_dir, - optimizer_max_iter=50, - target_diagnostics_top_k=3, - enforce_production_pins=False, - ) - - payload = json.loads(written.read_text()) - diagnostics_path = output_dir / "target_loss_diagnostics.json" - diagnostics_payload = json.loads(diagnostics_path.read_text()) - descriptor = payload["artifacts"]["target_loss_diagnostics"] - support_descriptor = payload["artifacts"]["support_audit"] - - assert descriptor["path"] == str(diagnostics_path.resolve()) - assert descriptor["size_bytes"] == diagnostics_path.stat().st_size - assert payload["target_diagnostics"] == diagnostics_payload - support_path = output_dir / "support_audit.json" - assert support_descriptor["path"] == str(support_path.resolve()) - assert json.loads(support_path.read_text()) == payload["support_audit"] - assert diagnostics_payload["summary"]["top_k"] == 3 - assert len(diagnostics_payload["top_regressions"]) == 3 - assert len(diagnostics_payload["top_improvements"]) == 3 - - -def test_sound_ecps_replacement_comparison_flags_score_mismatch( - 
monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - def mismatched_scores(**kwargs): - payload = _fake_pe_native_scores(**kwargs) - payload["summary"]["candidate_enhanced_cps_native_loss"] += 0.1 - return payload - - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", mismatched_scores) - - payload = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - exact_rescore=True, - enforce_production_pins=False, - ) - - assert payload["summary"]["refit_objective_matches_scoring"] is False - assert payload["summary"]["candidate_score_abs_error"] == pytest.approx(0.1) - - -def test_sound_ecps_replacement_comparison_forwards_targets_db( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - targets_db = tmp_path / "policy_data.db" - targets_db.write_bytes(b"sqlite placeholder") - captured_loss_calls: list[dict[str, object]] = [] - captured_score_call: dict[str, object] = {} - - def fake_loss_inputs_with_capture(**kwargs): - captured_loss_calls.append(kwargs) - return _fake_loss_inputs(kwargs["input_dataset_path"]) - - def fake_scores_with_capture(**kwargs): - captured_score_call.update(kwargs) - return _fake_pe_native_scores(**kwargs) - - monkeypatch.setattr( - ecps, - "_extract_pe_native_loss_inputs", - fake_loss_inputs_with_capture, - ) - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", fake_scores_with_capture) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - payload = 
ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - exact_rescore=True, - policyengine_targets_db_path=targets_db, - enforce_production_pins=False, - ) - - assert len(captured_loss_calls) == 2 - assert { - Path(call["policyengine_targets_db_path"]) for call in captured_loss_calls - } == {targets_db.resolve()} - assert Path(captured_score_call["policyengine_targets_db_path"]) == ( - targets_db.resolve() - ) - assert payload["summary"]["policyengine_targets_db"]["path"] == str( - targets_db.resolve() - ) - - -def test_sound_ecps_replacement_comparison_refuses_stale_matched_files( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - output_dir = tmp_path / "comparison" - output_dir.mkdir() - (output_dir / "candidate_matched.h5").write_bytes(b"stale") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_scores", _fake_pe_native_scores) - - with pytest.raises(FileExistsError): - ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=output_dir, - optimizer_max_iter=50, - enforce_production_pins=False, - ) - - -def test_ecps_replacement_comparison_module_cli_help_runs(): - completed = subprocess.run( - [ - sys.executable, - "-m", - "microplex_us.pipelines.ecps_replacement_comparison", - "--help", - ], - capture_output=True, - check=False, - text=True, - ) - - assert completed.returncode == 0 - assert "Build a sound Microplex-vs-eCPS replacement comparison payload" in ( - completed.stdout - ) - assert "--policyengine-targets-db" in completed.stdout - - -def test_assert_refit_effective_passes_when_loss_drops(): - ecps._assert_refit_effective( - "candidate", - 
{"initial_full_loss": 0.05, "optimized_full_loss": 0.03}, - 1e-9, - ) - - -def test_assert_refit_effective_raises_on_no_op_refit(): - # optimized == initial: the refit never reweighted (the d2d621b failure mode). - with pytest.raises(ecps.ComparisonGateError) as excinfo: - ecps._assert_refit_effective( - "candidate", - {"initial_full_loss": 0.042815, "optimized_full_loss": 0.042815}, - 1e-9, - ) - assert "no-op" in str(excinfo.value) - - -def test_assert_refit_effective_passes_when_loss_moves_but_rises(): - # The refit minimizes the train objective; on an already-well-calibrated - # baseline the full-set loss can tick up from the held-out split even though - # the refit genuinely ran. Only a frozen no-movement refit is a failure. - ecps._assert_refit_effective( - "baseline", - {"initial_full_loss": 0.0243817, "optimized_full_loss": 0.0266164}, - 1e-9, - ) - - -def test_assert_baseline_sane_passes_on_clean_baseline(): - ecps._assert_baseline_sane({"baseline_unweighted_msre": 0.17}, 2.0) - - -def test_assert_baseline_sane_raises_on_mis_scored_surface(): - # MSRE 8.17 is the soi-total-se mis-scored surface (eCPS +615% unemployment). - with pytest.raises(ecps.ComparisonGateError) as excinfo: - ecps._assert_baseline_sane({"baseline_unweighted_msre": 8.166}, 2.0) - assert "anomalously" in str(excinfo.value) - - -def test_assert_baseline_sane_skips_when_msre_absent(): - # Exact-rescore path has no unweighted MSRE; the gate must no-op, not crash. 
- summary = ecps._assert_baseline_sane({}, 2.0) - assert summary["status"] == "skipped" - - -def test_assert_production_baseline_content_sane_passes_on_required_columns( - tmp_path, -): - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - - summary = ecps._assert_production_baseline_content_sane(baseline, period=2024) - - assert summary["mode"] == "content" - assert summary["status"] == "passed" - assert ( - summary["required_nonzero_columns"]["social_security_retirement"][ - "nonzero_count" - ] - == 1 - ) - - -def test_assert_production_baseline_content_sane_raises_on_missing_column( - tmp_path, -): - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - with h5py.File(baseline, "a") as handle: - del handle["social_security_retirement"] - - with pytest.raises(ecps.ComparisonGateError) as excinfo: - ecps._assert_production_baseline_content_sane(baseline, period=2024) - - assert "missing social_security_retirement/2024" in str(excinfo.value) - - -def test_sound_ecps_replacement_comparison_can_use_content_baseline_sanity( - monkeypatch, - tmp_path, -): - candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") - baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") - monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) - monkeypatch.setattr(ecps, "compute_us_pe_native_support_audit", _fake_support_audit) - - payload = ecps.build_sound_ecps_replacement_comparison( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - output_dir=tmp_path / "comparison", - optimizer_max_iter=50, - baseline_sanity_mode="content", - enforce_production_pins=False, - ) - - assert payload["summary"]["baseline_sanity"]["mode"] == "content" - assert payload["summary"]["baseline_sanity"]["status"] == "passed" diff --git a/tests/pipelines/test_experiments.py b/tests/pipelines/test_experiments.py deleted file mode 100644 index 40e4f478..00000000 --- 
a/tests/pipelines/test_experiments.py +++ /dev/null @@ -1,607 +0,0 @@ -"""Tests for source-mix PE-US experiment runners.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from microplex_us.pipelines.artifacts import USMicroplexArtifactPaths -from microplex_us.pipelines.experiments import ( - USMicroplexExperimentReport, - USMicroplexExperimentResult, - USMicroplexSourceExperimentSpec, - _refresh_experiment_results_from_registry, - build_us_n_synthetic_sweep_experiments, - default_us_source_mix_experiments, - run_us_microplex_n_synthetic_sweep, - run_us_microplex_source_experiments, -) -from microplex_us.pipelines.performance import USMicroplexPerformanceHarnessConfig -from microplex_us.pipelines.registry import USMicroplexRunRegistryEntry -from microplex_us.pipelines.us import USMicroplexBuildConfig - - -class _DummyProvider: - def __init__(self, name: str): - self.descriptor = type("Descriptor", (), {"name": name})() - - -def _artifact_paths(root: Path, name: str) -> USMicroplexArtifactPaths: - output_dir = root / name - return USMicroplexArtifactPaths( - output_dir=output_dir, - version_id=name, - seed_data=output_dir / "seed.parquet", - synthetic_data=output_dir / "synthetic.parquet", - calibrated_data=output_dir / "calibrated.parquet", - targets=output_dir / "targets.json", - manifest=output_dir / "manifest.json", - synthesizer=None, - policyengine_dataset=output_dir / "policyengine.h5", - data_flow_snapshot=output_dir / "data_flow_snapshot.json", - artifact_inventory=output_dir / "stage_artifacts" / "artifact_inventory.json", - conditional_readiness=( - output_dir / "stage_artifacts" / "conditional_readiness.json" - ), - policyengine_harness=output_dir / "policyengine_harness.json", - policyengine_native_scores=output_dir / "policyengine_native_scores.json", - policyengine_native_audit=output_dir / "pe_us_data_rebuild_native_audit.json", - run_registry=root / "run_registry.jsonl", - run_index_db=root / "run_index.duckdb", - ) 
- - -def _entry( - artifact_id: str, - *, - composite_loss: float, - source_names: tuple[str, ...], -) -> USMicroplexRunRegistryEntry: - return USMicroplexRunRegistryEntry( - created_at="2026-03-25T12:00:00+00:00", - artifact_id=artifact_id, - artifact_dir=f"/tmp/{artifact_id}", - manifest_path=f"/tmp/{artifact_id}/manifest.json", - candidate_mean_abs_relative_error=composite_loss + 0.1, - baseline_mean_abs_relative_error=0.5, - mean_abs_relative_error_delta=composite_loss - 0.25, - candidate_composite_parity_loss=composite_loss, - baseline_composite_parity_loss=0.5, - composite_parity_loss_delta=composite_loss - 0.5, - source_names=source_names, - ) - - -def test_run_us_microplex_source_experiments_saves_report_and_sorts(monkeypatch, tmp_path): - call_log: list[dict[str, object]] = [] - - def fake_build_and_save( - providers, - output_root, - *, - config=None, - queries=None, - frontier_metric="candidate_composite_parity_loss", - policyengine_comparison_cache=None, - policyengine_target_provider=None, - policyengine_baseline_dataset=None, - policyengine_harness_slices=None, - policyengine_harness_metadata=None, - run_registry_path=None, - run_registry_metadata=None, - ): - experiment_name = run_registry_metadata["experiment_name"] - call_log.append( - { - "name": experiment_name, - "frontier_metric": frontier_metric, - "policyengine_comparison_cache": policyengine_comparison_cache, - "run_registry_metadata": dict(run_registry_metadata), - "policyengine_harness_metadata": dict(policyengine_harness_metadata), - } - ) - composite_loss = 0.35 if experiment_name == "cps+puf" else 0.45 - current_entry = _entry( - experiment_name, - composite_loss=composite_loss, - source_names=tuple(provider.descriptor.name for provider in providers), - ) - return type( - "FakeArtifacts", - (), - { - "artifact_paths": _artifact_paths(Path(output_root), experiment_name), - "current_entry": current_entry, - "frontier_entry": current_entry, - "frontier_delta": 0.0, - }, - )() - - 
monkeypatch.setattr( - "microplex_us.pipelines.experiments.build_and_save_versioned_us_microplex_from_source_providers", - fake_build_and_save, - ) - - report = run_us_microplex_source_experiments( - [ - USMicroplexSourceExperimentSpec( - name="cps-only", - providers=(_DummyProvider("cps"),), - metadata={"family": "baseline"}, - ), - USMicroplexSourceExperimentSpec( - name="cps+puf", - providers=(_DummyProvider("cps"), _DummyProvider("puf")), - metadata={"family": "tax"}, - ), - ], - tmp_path / "experiments", - metadata={"suite": "parity-search"}, - ) - - assert report.best_result is not None - assert report.best_result.name == "cps+puf" - assert [result.name for result in report.leaderboard] == ["cps+puf", "cps-only"] - assert report.metadata["suite"] == "parity-search" - assert len(call_log) == 2 - assert call_log[0]["frontier_metric"] == "candidate_composite_parity_loss" - assert call_log[0]["policyengine_comparison_cache"] is call_log[1]["policyengine_comparison_cache"] - assert call_log[0]["run_registry_metadata"]["experiment_name"] == "cps-only" - assert call_log[1]["policyengine_harness_metadata"]["experiment_name"] == "cps+puf" - - report_path = tmp_path / "experiments" / "experiment_report.json" - assert report_path.exists() - loaded = USMicroplexExperimentReport.load(report_path) - assert loaded.best_result is not None - assert loaded.best_result.name == "cps+puf" - assert loaded.leaderboard[0].current_entry is not None - assert loaded.leaderboard[0].current_entry.candidate_composite_parity_loss == 0.35 - assert loaded.leaderboard[0].artifact_paths.data_flow_snapshot is not None - assert loaded.leaderboard[0].artifact_paths.artifact_inventory is not None - assert loaded.leaderboard[0].artifact_paths.conditional_readiness is not None - assert loaded.leaderboard[0].artifact_paths.policyengine_native_scores is not None - assert loaded.leaderboard[0].artifact_paths.policyengine_native_audit is not None - assert 
loaded.leaderboard[0].artifact_paths.run_index_db is not None - - -def test_run_us_microplex_source_experiments_requires_at_least_one_experiment(tmp_path): - try: - run_us_microplex_source_experiments([], tmp_path / "experiments") - except ValueError as exc: - assert "at least one experiment" in str(exc) - else: - raise AssertionError("Expected ValueError for empty experiment batch") - - -def test_build_us_n_synthetic_sweep_experiments_updates_names_and_config(): - base_experiment = USMicroplexSourceExperimentSpec( - name="cps+puf", - providers=(_DummyProvider("cps"), _DummyProvider("puf")), - config=USMicroplexBuildConfig(n_synthetic=500, random_seed=11), - metadata={"family": "tax"}, - ) - - sweep = build_us_n_synthetic_sweep_experiments(base_experiment, [2000, 10000]) - - assert [experiment.name for experiment in sweep] == [ - "cps+puf-n2000", - "cps+puf-n10000", - ] - assert [experiment.config.n_synthetic for experiment in sweep] == [2000, 10000] - assert all(experiment.config.random_seed == 11 for experiment in sweep) - assert sweep[0].metadata["family"] == "tax" - assert sweep[0].metadata["base_experiment_name"] == "cps+puf" - assert sweep[1].metadata["n_synthetic"] == 10000 - - -def test_run_us_microplex_n_synthetic_sweep_expands_metadata(monkeypatch, tmp_path): - captured: dict[str, object] = {} - - def fake_run_source_experiments( - experiments, - output_root, - *, - frontier_metric="candidate_composite_parity_loss", - policyengine_target_provider=None, - policyengine_baseline_dataset=None, - policyengine_comparison_cache=None, - policyengine_harness_slices=None, - policyengine_harness_metadata=None, - run_registry_path=None, - report_path=None, - performance_harness_config=None, - performance_session=None, - metadata=None, - ): - captured["experiments"] = experiments - captured["output_root"] = Path(output_root) - captured["metadata"] = dict(metadata or {}) - return USMicroplexExperimentReport( - output_root=Path(output_root), - 
frontier_metric=frontier_metric, - results=(), - metadata=dict(metadata or {}), - ) - - monkeypatch.setattr( - "microplex_us.pipelines.experiments.run_us_microplex_source_experiments", - fake_run_source_experiments, - ) - - report = run_us_microplex_n_synthetic_sweep( - USMicroplexSourceExperimentSpec( - name="cps+puf", - providers=(_DummyProvider("cps"), _DummyProvider("puf")), - ), - [2000, 10000], - tmp_path / "scale-sweep", - metadata={"suite": "size-sweep"}, - ) - - sweep_experiments = captured["experiments"] - assert [experiment.name for experiment in sweep_experiments] == [ - "cps+puf-n2000", - "cps+puf-n10000", - ] - assert captured["metadata"] == { - "base_experiment_name": "cps+puf", - "n_synthetic_values": [2000, 10000], - "sweep_parameter": "n_synthetic", - "suite": "size-sweep", - } - assert report.metadata["suite"] == "size-sweep" - - -def test_default_us_source_mix_experiments_builds_standard_ladder(): - cps_provider = _DummyProvider("cps") - puf_provider = _DummyProvider("puf") - psid_provider = _DummyProvider("psid") - - experiments = default_us_source_mix_experiments( - cps_provider=cps_provider, - puf_provider=puf_provider, - psid_provider=psid_provider, - ) - - assert [experiment.name for experiment in experiments] == [ - "cps-only", - "cps+puf", - "cps+psid", - "cps+puf+psid", - ] - assert experiments[0].metadata["sources"] == ["cps"] - assert experiments[-1].metadata["sources"] == ["cps", "puf", "psid"] - - -def test_run_us_microplex_source_experiments_can_use_performance_session( - monkeypatch, - tmp_path, -): - calls: dict[str, list[object]] = { - "warm": [], - "run": [], - "save": [], - } - - class FakeSession: - def __init__(self): - self.comparison_cache = object() - - def warm_parity_cache(self, *, config): - calls["warm"].append(config) - - def run(self, providers, *, config, queries=None): - calls["run"].append( - { - "providers": tuple(provider.descriptor.name for provider in providers), - "config": config, - "queries": queries, - } - 
) - return type( - "FakePerformanceResult", - (), - { - "build_result": f"build:{'+'.join(provider.descriptor.name for provider in providers)}", - "parity_run": type( - "FakeParityRun", - (), - { - "to_dict": lambda self: { - "summary": { - "candidate_composite_parity_loss": 0.4, - "baseline_composite_parity_loss": 0.5, - } - } - }, - )(), - "pe_native_scores": { - "summary": { - "candidate_enhanced_cps_native_loss": 0.9, - "baseline_enhanced_cps_native_loss": 1.1, - "enhanced_cps_native_loss_delta": -0.2, - } - }, - }, - )() - - def fake_save_build_result( - build_result, - output_root, - *, - frontier_metric="candidate_composite_parity_loss", - policyengine_comparison_cache=None, - policyengine_target_provider=None, - policyengine_baseline_dataset=None, - policyengine_harness_slices=None, - policyengine_harness_metadata=None, - precomputed_policyengine_harness_payload=None, - defer_policyengine_harness=False, - precomputed_policyengine_native_scores=None, - defer_policyengine_native_score=False, - run_registry_path=None, - run_registry_metadata=None, - version_id=None, - ): - experiment_name = run_registry_metadata["experiment_name"] - calls["save"].append( - { - "build_result": build_result, - "output_root": Path(output_root), - "frontier_metric": frontier_metric, - "policyengine_comparison_cache": policyengine_comparison_cache, - "policyengine_harness_metadata": dict(policyengine_harness_metadata), - "precomputed_policyengine_harness_payload": precomputed_policyengine_harness_payload, - "defer_policyengine_harness": defer_policyengine_harness, - "precomputed_policyengine_native_scores": precomputed_policyengine_native_scores, - "defer_policyengine_native_score": defer_policyengine_native_score, - "run_registry_metadata": dict(run_registry_metadata), - "version_id": version_id, - } - ) - current_entry = _entry( - experiment_name, - composite_loss=0.3 if experiment_name == "cps+puf" else 0.4, - source_names=tuple(run_registry_metadata["sources"]) - if "sources" 
in run_registry_metadata - else (experiment_name,), - ) - return type( - "FakeArtifacts", - (), - { - "artifact_paths": _artifact_paths(Path(output_root), experiment_name), - "current_entry": current_entry, - "frontier_entry": current_entry, - "frontier_delta": 0.0, - }, - )() - - monkeypatch.setattr( - "microplex_us.pipelines.experiments.save_versioned_us_microplex_build_result", - fake_save_build_result, - ) - registry_entries = [ - _entry("cps-only", composite_loss=0.4, source_names=("cps",)), - _entry("cps+puf", composite_loss=0.3, source_names=("cps", "puf")), - ] - monkeypatch.setattr( - "microplex_us.pipelines.experiments.backfill_us_pe_native_scores_bundles", - lambda artifact_dirs, **kwargs: [ - Path(path) / "manifest.json" for path in artifact_dirs - ], - ) - calls["backfill_native_audit"] = [] - monkeypatch.setattr( - "microplex_us.pipelines.experiments.backfill_us_pe_native_audit_bundles", - lambda artifact_dirs, **kwargs: calls["backfill_native_audit"].append( - { - "artifact_dirs": [Path(path) for path in artifact_dirs], - "kwargs": dict(kwargs), - } - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.experiments.load_us_microplex_run_registry", - lambda _path: registry_entries, - ) - monkeypatch.setattr( - "microplex_us.pipelines.experiments.select_us_microplex_frontier_entry", - lambda _path, *, metric="candidate_composite_parity_loss": min( - registry_entries, - key=lambda entry: getattr(entry, metric), - ), - ) - - session = FakeSession() - performance_config = USMicroplexPerformanceHarnessConfig( - sample_n=25, - n_synthetic=25, - targets_db="/tmp/policy_data.db", - baseline_dataset="/tmp/baseline.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - evaluate_parity=True, - evaluate_pe_native_loss=True, - ) - report = run_us_microplex_source_experiments( - [ - USMicroplexSourceExperimentSpec( - name="cps-only", - providers=(_DummyProvider("cps"),), - config=USMicroplexBuildConfig(n_synthetic=2000, random_seed=7), - 
metadata={"sources": ["cps"]}, - ), - USMicroplexSourceExperimentSpec( - name="cps+puf", - providers=(_DummyProvider("cps"), _DummyProvider("puf")), - config=USMicroplexBuildConfig(n_synthetic=4000, random_seed=9), - metadata={"sources": ["cps", "puf"]}, - ), - ], - tmp_path / "experiments", - performance_harness_config=performance_config, - performance_session=session, - ) - - assert report.best_result is not None - assert report.best_result.name == "cps+puf" - assert len(calls["warm"]) == 1 - assert len(calls["run"]) == 2 - assert len(calls["save"]) == 2 - assert calls["run"][0]["config"].evaluate_parity is False - assert calls["run"][0]["config"].evaluate_pe_native_loss is False - assert calls["run"][0]["config"].n_synthetic == 2000 - assert calls["run"][0]["config"].random_seed == 7 - assert calls["run"][1]["config"].n_synthetic == 4000 - assert calls["run"][1]["config"].random_seed == 9 - assert calls["save"][0]["policyengine_comparison_cache"] is session.comparison_cache - assert calls["save"][1]["run_registry_metadata"]["experiment_name"] == "cps+puf" - assert calls["save"][1]["policyengine_harness_metadata"]["experiment_name"] == "cps+puf" - assert calls["save"][0]["precomputed_policyengine_harness_payload"] == { - "summary": { - "candidate_composite_parity_loss": 0.4, - "baseline_composite_parity_loss": 0.5, - } - } - assert calls["save"][0]["defer_policyengine_harness"] is False - assert calls["save"][0]["precomputed_policyengine_native_scores"] is None - assert calls["save"][0]["defer_policyengine_native_score"] is True - assert len(calls["backfill_native_audit"]) == 1 - assert calls["backfill_native_audit"][0]["artifact_dirs"] == [ - tmp_path / "experiments" / "cps-only", - tmp_path / "experiments" / "cps+puf", - ] - assert ( - calls["backfill_native_audit"][0]["kwargs"]["policyengine_us_data_repo"] - == "/tmp/policyengine-us-data" - ) - assert report.best_result.current_entry is not None - assert report.best_result.current_entry.artifact_id == 
"cps+puf" - - -def test_refresh_experiment_results_from_registry_returns_original_results_when_empty( - tmp_path, -): - registry_path = tmp_path / "run_registry.jsonl" - results = ( - USMicroplexExperimentResult( - name="cps-only", - artifact_paths=_artifact_paths(tmp_path, "cps-only"), - frontier_metric="candidate_composite_parity_loss", - frontier_delta=None, - ), - ) - - loaded = _refresh_experiment_results_from_registry( - results, - run_registry_path=registry_path, - frontier_metric="candidate_composite_parity_loss", - ) - - assert loaded == results - - -def test_refresh_experiment_results_from_registry_refreshes_backfilled_artifact_paths( - monkeypatch, - tmp_path, -) -> None: - output_dir = tmp_path / "cps-only" - output_dir.mkdir() - manifest_path = output_dir / "manifest.json" - manifest_path.write_text( - json.dumps( - { - "artifacts": { - "data_flow_snapshot": "data_flow_snapshot.json", - "artifact_inventory": "stage_artifacts/artifact_inventory.json", - "conditional_readiness": ( - "stage_artifacts/conditional_readiness.json" - ), - "policyengine_native_scores": "policyengine_native_scores.json", - "policyengine_native_audit": "pe_us_data_rebuild_native_audit.json", - } - } - ) - ) - for name in ( - "data_flow_snapshot.json", - "policyengine_native_scores.json", - "pe_us_data_rebuild_native_audit.json", - ): - (output_dir / name).write_text("{}") - (output_dir / "stage_artifacts").mkdir() - for name in ("artifact_inventory.json", "conditional_readiness.json"): - (output_dir / "stage_artifacts" / name).write_text("{}") - registry_path = tmp_path / "run_registry.jsonl" - result = USMicroplexExperimentResult( - name="cps-only", - artifact_paths=USMicroplexArtifactPaths( - output_dir=output_dir, - version_id="cps-only", - seed_data=output_dir / "seed.parquet", - synthetic_data=output_dir / "synthetic.parquet", - calibrated_data=output_dir / "calibrated.parquet", - targets=output_dir / "targets.json", - manifest=manifest_path, - 
policyengine_native_scores=None, - policyengine_native_audit=None, - data_flow_snapshot=None, - ), - frontier_metric="candidate_composite_parity_loss", - frontier_delta=None, - ) - registry_entries = [ - _entry("cps-only", composite_loss=0.4, source_names=("cps",)), - ] - monkeypatch.setattr( - "microplex_us.pipelines.experiments.load_us_microplex_run_registry", - lambda _path: registry_entries, - ) - monkeypatch.setattr( - "microplex_us.pipelines.experiments.select_us_microplex_frontier_entry", - lambda _path, *, metric="candidate_composite_parity_loss": registry_entries[0] - ) - loaded = _refresh_experiment_results_from_registry( - (result,), - run_registry_path=registry_path, - frontier_metric="candidate_composite_parity_loss", - ) - - assert loaded[0].artifact_paths.data_flow_snapshot == output_dir / "data_flow_snapshot.json" - assert loaded[0].artifact_paths.artifact_inventory == ( - output_dir / "stage_artifacts" / "artifact_inventory.json" - ) - assert loaded[0].artifact_paths.conditional_readiness == ( - output_dir / "stage_artifacts" / "conditional_readiness.json" - ) - assert ( - loaded[0].artifact_paths.policyengine_native_scores - == output_dir / "policyengine_native_scores.json" - ) - assert ( - loaded[0].artifact_paths.policyengine_native_audit - == output_dir / "pe_us_data_rebuild_native_audit.json" - ) - - -def test_run_us_microplex_source_experiments_requires_performance_config_for_session( - tmp_path, -): - class FakeSession: - comparison_cache = object() - - try: - run_us_microplex_source_experiments( - [ - USMicroplexSourceExperimentSpec( - name="cps-only", - providers=(_DummyProvider("cps"),), - ) - ], - tmp_path / "experiments", - performance_session=FakeSession(), - ) - except ValueError as exc: - assert "performance_harness_config is required" in str(exc) - else: - raise AssertionError("Expected ValueError when session is provided without config") diff --git a/tests/pipelines/test_export_lineage_manifest.py 
b/tests/pipelines/test_export_lineage_manifest.py deleted file mode 100644 index 171e77f8..00000000 --- a/tests/pipelines/test_export_lineage_manifest.py +++ /dev/null @@ -1,63 +0,0 @@ -import json - -import h5py -import numpy as np - -from microplex_us.pipelines.export_lineage_manifest import ( - build_export_lineage_manifest, -) - - -def _columns_by_name(payload): - return {column["column"]: column for column in payload["columns"]} - - -def test_export_lineage_manifest_tracks_source_backed_blocks(): - payload = build_export_lineage_manifest() - columns = _columns_by_name(payload) - - for column in ( - "business_is_sstb", - "home_mortgage_interest", - "reported_has_medicaid_health_coverage_at_interview", - "ssn_card_type", - "selected_marketplace_plan_benchmark_ratio", - "weekly_hours_worked_before_lsr", - ): - assert columns[column]["has_source_lineage"] - - -def test_export_lineage_manifest_flags_populated_ecps_default_only_column(tmp_path): - contract_path = tmp_path / "contract.json" - contract_path.write_text( - json.dumps( - { - "required": [ - "is_wic_at_nutritional_risk", - "weekly_hours_worked_before_lsr", - ], - "forbidden": [], - } - ) - ) - baseline_path = tmp_path / "baseline.h5" - with h5py.File(baseline_path, "w") as handle: - wic = handle.create_group("is_wic_at_nutritional_risk") - wic.create_dataset("2024", data=np.array([False, True])) - weekly_hours = handle.create_group("weekly_hours_worked_before_lsr") - weekly_hours.create_dataset("2024", data=np.array([0.0, 40.0])) - - payload = build_export_lineage_manifest( - contract_path=contract_path, - support_baseline=baseline_path, - ) - - issues = {issue["column"]: issue for issue in payload["issues"]} - assert issues == { - "is_wic_at_nutritional_risk": { - "column": "is_wic_at_nutritional_risk", - "ecps_support_requirement": "categorical_variation", - "export_path_status": "default_only", - "issue": "ecps_populated_export_has_no_source_lineage", - } - } diff --git 
a/tests/pipelines/test_geoid_cd_encoding.py b/tests/pipelines/test_geoid_cd_encoding.py deleted file mode 100644 index 6b919b2c..00000000 --- a/tests/pipelines/test_geoid_cd_encoding.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Geoid carry-over guards for the CPS block-geography export (PR #129). - -Two regressions are pinned here, both surfaced while reconciling PR #129 -against the closed PR #130: - -1. ``congressional_district_geoid`` encoding must match the eCPS 436-CD - calibration universe, which encodes at-large districts (and DC) as district - ``01`` -- e.g. AK->201, WY->5601, DC->1101. The raw Census block crosswalk - carries at-large as ``00``/``98``; eCPS normalizes those to ``01`` in - ``policyengine-us-data db/create_initial_strata.py`` before writing the - targets DB and the dataset. ``_congressional_district_geoid_from_cd_id`` must - reproduce the ``01`` convention regardless of which input form it is fed. - -2. ``_attach_household_census_geographies`` must not assume a unique household - -frame index. It assigns blocks and writes them back via ``.loc[row_index]``; - a duplicate index previously raised - ``ValueError: cannot reindex on an axis with duplicate labels``. -""" - -import pandas as pd -import pytest - -from microplex_us.pipelines.us import ( - _attach_household_census_geographies, - _congressional_district_geoid_from_cd_id, - _default_block_geography, -) - -# state_fips for the 7 single-seat / at-large jurisdictions in the 119th Congress. 
-_AT_LARGE_STATES = { - "AK": 2, - "DE": 10, - "DC": 11, - "ND": 38, - "SD": 46, - "VT": 50, - "WY": 56, -} - - -def test_multi_district_encoding_is_ssdd(): - cases = { - ("CA-52", 6): 652, - ("NY-12", 36): 3612, - ("TX-38", 48): 4838, - ("FL-28", 12): 1228, - ("AL-02", 1): 102, - } - for (cd_id, state), expected in cases.items(): - assert _congressional_district_geoid_from_cd_id(cd_id, state) == expected - - -def test_at_large_uses_district_01_via_AL_token(): - # Microplex's crosswalk feeds the "-AL" token; every at-large state and - # DC must encode to district 01 (state*100 + 1), matching the eCPS universe. - for abbr, state in _AT_LARGE_STATES.items(): - assert ( - _congressional_district_geoid_from_cd_id(f"{abbr}-AL", state) - == state * 100 + 1 - ) - - -def test_raw_census_at_large_forms_normalize_to_01(): - # Hardening: even if a raw Census form leaks through (DC as "98", at-large as - # "ZZ" or "00"), the encoder must still produce district 01, never 1198/5600. - assert _congressional_district_geoid_from_cd_id("DC-98", 11) == 1101 - assert _congressional_district_geoid_from_cd_id("WY-ZZ", 56) == 5601 - assert _congressional_district_geoid_from_cd_id("AK-00", 2) == 201 - assert _congressional_district_geoid_from_cd_id("98", 11) == 1101 - - -def test_no_at_large_geoid_ends_in_00_or_98(): - # The invariant that distinguishes the eCPS universe from the raw crosswalk. 
- for abbr, state in _AT_LARGE_STATES.items(): - for token in (f"{abbr}-AL", f"{abbr}-00", f"{abbr}-98", f"{abbr}-ZZ"): - geoid = _congressional_district_geoid_from_cd_id(token, state) - assert geoid % 100 not in (0, 98), f"{token} -> {geoid}" - - -def test_invalid_inputs_return_zero(): - assert _congressional_district_geoid_from_cd_id("", 6) == 0 - assert _congressional_district_geoid_from_cd_id("nan", 6) == 0 - assert _congressional_district_geoid_from_cd_id("", 6) == 0 - assert _congressional_district_geoid_from_cd_id("CA-12", "not-a-state") == 0 - - -# --- duplicate-index robustness ------------------------------------------------- - - -class _StubAssigner: - def __init__(self, block_geoid: str): - self._block_geoid = block_geoid - - def assign(self, frame: pd.DataFrame, random_state: int = 0) -> pd.DataFrame: - out = frame.copy() - out["block_geoid"] = self._block_geoid - return out - - -class _StubBlockGeography: - """Minimal BlockGeography surface for exercising the assignment write-back. - - Returns a single deterministic CA block so the test focuses on index - handling, not the probabilistic draw. - """ - - _BLOCK = "060371000001000" - - def __init__(self): - self.data = pd.DataFrame({"county_fips": ["06037"]}) - - def load_assigner(self, query) -> _StubAssigner: # noqa: ANN001 - query unused - return _StubAssigner(self._BLOCK) - - def assign(self, frame: pd.DataFrame, random_state: int = 0) -> pd.DataFrame: - out = frame.copy() - out["block_geoid"] = self._BLOCK - return out - - def materialize(self, frame: pd.DataFrame, columns=()) -> pd.DataFrame: - out = frame.copy() - out["state_fips"] = "06" - out["county_fips"] = "06037" - out["tract_geoid"] = "06037100000" - out["cd_id"] = "CA-37" - return out - - -def test_attach_geographies_handles_duplicate_household_index(): - # Duplicate labels [0, 0, 1] previously broke the .loc[row_index] write-back. 
- households = pd.DataFrame( - { - "household_id": [10, 11, 12], - "state_fips": [6, 6, 6], - "county_fips": [37, 37, 37], # CPS fragment -> 06037 - }, - index=[0, 0, 1], - ) - result = _attach_household_census_geographies( - households, seed=0, geography=_StubBlockGeography() - ) - assert len(result) == 3 - assert (result["block_geoid"] == "060371000001000").all() - assert (result["tract_geoid"] == "06037100000").all() - # CA-37 -> 6 * 100 + 37 - assert (result["congressional_district_geoid"] == 637).all() - # household_id is preserved (consumed downstream via merge on this column). - assert sorted(result["household_id"]) == [10, 11, 12] - - -# --- live universe parity (skips when the crosswalk parquet is unavailable) ----- - - -def test_cd_encoder_reproduces_ecps_436_cd_universe(): - """Run the encoder over the real block crosswalk's distinct (state, cd_id). - - Verified during review to equal the eCPS calibration target universe in - policy_data.db exactly (436 CDs, at-large=01). Skips in environments without - the crosswalk parquet (e.g. CI). - """ - try: - data = _default_block_geography().data - except (FileNotFoundError, OSError): - pytest.skip("block crosswalk parquet not available") - if "cd_id" not in data.columns or "state_fips" not in data.columns: - pytest.skip("crosswalk lacks cd_id/state_fips columns") - - pairs = data[["state_fips", "cd_id"]].dropna().drop_duplicates() - geoids = { - _congressional_district_geoid_from_cd_id(cd_id, state) - for state, cd_id in zip(pairs["state_fips"], pairs["cd_id"], strict=False) - } - geoids.discard(0) - - assert len(geoids) == 436, f"expected 436-CD universe, got {len(geoids)}" - # at-large districts encode to 01, so nothing ends in 00. 
- assert not any(g % 100 == 0 for g in geoids) - for g in geoids: - state, district = divmod(g, 100) - assert 1 <= state <= 78, f"invalid state in {g}" - assert 1 <= district <= 53, f"invalid district in {g}" diff --git a/tests/pipelines/test_hf_artifacts.py b/tests/pipelines/test_hf_artifacts.py deleted file mode 100644 index cadfb839..00000000 --- a/tests/pipelines/test_hf_artifacts.py +++ /dev/null @@ -1,251 +0,0 @@ -"""Tests for Hugging Face artifact publishing.""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from microplex_us.pipelines import hf_artifacts -from microplex_us.pipelines.hf_artifacts import ( - HF_PUBLISH_MANIFEST_FILENAME, - HuggingFacePublishConfig, - build_hf_repo_path, - dataset_repo_paths, - diagnostics_repo_paths, - publish_microplex_artifact_to_hf, - smoke_published_hf_artifact, -) - - -class FakeHfApi: - def __init__(self) -> None: - self.commits: list[dict[str, Any]] = [] - self.files_by_repo: dict[str, list[str]] = {} - - def create_commit(self, **kwargs: Any) -> None: - self.commits.append(kwargs) - - def list_repo_files(self, **kwargs: Any) -> list[str]: - return self.files_by_repo[kwargs["repo_id"]] - - -def _fake_add(path_in_repo: str, local_path: Path) -> dict[str, str]: - return { - "path_in_repo": path_in_repo, - "path_or_fileobj": str(local_path), - } - - -def _fake_add_text(path_in_repo: str, text: str) -> dict[str, str]: - return { - "path_in_repo": path_in_repo, - "path_or_fileobj": text, - } - - -def _fake_add_bytes(path_in_repo: str, payload: dict[str, Any]) -> dict[str, str]: - return _fake_add_text(path_in_repo, json.dumps(payload, sort_keys=True)) - - -def _patch_operations(monkeypatch) -> None: - monkeypatch.setattr(hf_artifacts, "_commit_add", _fake_add) - monkeypatch.setattr(hf_artifacts, "_commit_add_text", _fake_add_text) - monkeypatch.setattr(hf_artifacts, "_commit_add_bytes", _fake_add_bytes) - - -def _write_bundle(tmp_path: Path) -> Path: - artifact_dir 
= tmp_path / "run-a" - artifact_dir.mkdir() - (artifact_dir / "policyengine_us.h5").write_bytes(b"h5") - (artifact_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "summary": { - "candidate_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - } - } - ) - ) - (artifact_dir / "pe_us_data_rebuild_native_audit.json").write_text("{}") - (artifact_dir / "pe_native_target_diagnostics.json").write_text("{}") - manifest = { - "created_at": "2026-06-03T00:00:00+00:00", - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - "policyengine_native_audit": "pe_us_data_rebuild_native_audit.json", - "policyengine_native_target_diagnostics": ( - "pe_native_target_diagnostics.json" - ), - }, - "policyengine_native_scores": { - "candidate_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - return artifact_dir - - -def test_build_hf_repo_path_normalizes_parts() -> None: - assert build_hf_repo_path("/runs/", "run-a", "/manifest.json") == ( - "runs/run-a/manifest.json" - ) - - -def test_diagnostics_repo_paths_use_stable_layout() -> None: - paths = diagnostics_repo_paths("artifact/run-a", run_id="run-a") - - assert paths["manifest"] == "runs/run-a/manifest.json" - assert paths["policyengine_native_scores"] == ( - "runs/run-a/policyengine_native_scores.json" - ) - assert paths["policyengine_native_audit"] == ( - "runs/run-a/pe_us_data_rebuild_native_audit.json" - ) - assert paths["policyengine_native_target_diagnostics"] == ( - "runs/run-a/pe_native_target_diagnostics.json" - ) - assert paths["latest"] == "latest.json" - assert paths["run_registry"] == "run_registry.jsonl" - - -def test_dataset_repo_paths_can_include_promoted_current_files() -> None: - paths = dataset_repo_paths("artifact/run-a", run_id="run-a", promote=True) - - assert paths["policyengine_dataset"] 
== "staging/run-a/policyengine_us.h5" - assert paths["manifest"] == "staging/run-a/manifest.json" - assert paths["promoted_policyengine_dataset"] == "policyengine_us.h5" - assert paths["promoted_manifest"] == "manifest.json" - - -def test_publish_diagnostics_dry_run_writes_manifest( - tmp_path: Path, - monkeypatch, -) -> None: - _patch_operations(monkeypatch) - artifact_dir = _write_bundle(tmp_path) - config = HuggingFacePublishConfig( - diagnostics_repo="policyengine/microplex-us-diagnostics", - dataset_repo="policyengine/microplex-us-deployed-datasets", - token="token", - ) - - result = publish_microplex_artifact_to_hf( - artifact_dir, - config, - dry_run=True, - registry_loader=lambda _config: "", - ) - - assert result["status"] == "dry_run" - assert result["diagnostics"]["paths"]["latest"] == "latest.json" - registry_entry = result["diagnostics"]["run_registry_entry"] - assert registry_entry["run_id"] == "run-a" - assert registry_entry["candidate_enhanced_cps_native_loss"] == 0.3 - local_manifest = json.loads( - (artifact_dir / HF_PUBLISH_MANIFEST_FILENAME).read_text() - ) - assert local_manifest["diagnostics"]["operation_count"] == 6 - - -def test_publish_full_bundle_calls_expected_repos( - tmp_path: Path, - monkeypatch, -) -> None: - _patch_operations(monkeypatch) - artifact_dir = _write_bundle(tmp_path) - api = FakeHfApi() - config = HuggingFacePublishConfig( - diagnostics_repo="policyengine/microplex-us-diagnostics", - dataset_repo="policyengine/microplex-us-deployed-datasets", - token="token", - ) - - result = publish_microplex_artifact_to_hf( - artifact_dir, - config, - publish_dataset=True, - promote_dataset=True, - api=api, - registry_loader=lambda _config: ( - '{"run_id":"old-run","manifest":"runs/old-run/manifest.json"}\n' - ), - ) - - assert result["status"] == "published" - assert [commit["repo_id"] for commit in api.commits] == [ - "policyengine/microplex-us-diagnostics", - "policyengine/microplex-us-deployed-datasets", - ] - diagnostics_paths = { 
- op["path_in_repo"] for op in api.commits[0]["operations"] - } - assert "latest.json" in diagnostics_paths - assert "run_registry.jsonl" in diagnostics_paths - dataset_paths = {op["path_in_repo"] for op in api.commits[1]["operations"]} - assert "staging/run-a/policyengine_us.h5" in dataset_paths - assert "staging/run-a/manifest.json" in dataset_paths - assert "policyengine_us.h5" in dataset_paths - assert "manifest.json" in dataset_paths - - -def test_smoke_published_hf_artifact_passes_when_expected_files_exist() -> None: - api = FakeHfApi() - api.files_by_repo = { - "policyengine/microplex-us-diagnostics": [ - "latest.json", - "run_registry.jsonl", - "runs/run-a/manifest.json", - "runs/run-a/policyengine_native_scores.json", - "runs/run-a/pe_us_data_rebuild_native_audit.json", - "runs/run-a/pe_native_target_diagnostics.json", - ], - "policyengine/microplex-us-deployed-datasets": [ - "staging/run-a/policyengine_us.h5", - "staging/run-a/manifest.json", - "policyengine_us.h5", - "manifest.json", - ], - } - config = HuggingFacePublishConfig( - diagnostics_repo="policyengine/microplex-us-diagnostics", - dataset_repo="policyengine/microplex-us-deployed-datasets", - ) - - result = smoke_published_hf_artifact( - config, - api=api, - latest_loader=lambda _config: {"run_id": "run-a"}, - ) - - assert result["status"] == "passed" - assert result["missing_count"] == 0 - assert result["run_id"] == "run-a" - - -def test_smoke_published_hf_artifact_reports_missing_files() -> None: - api = FakeHfApi() - api.files_by_repo = { - "policyengine/microplex-us-diagnostics": ["latest.json"], - "policyengine/microplex-us-deployed-datasets": [ - "staging/run-a/policyengine_us.h5" - ], - } - config = HuggingFacePublishConfig( - diagnostics_repo="policyengine/microplex-us-diagnostics", - dataset_repo="policyengine/microplex-us-deployed-datasets", - ) - - result = smoke_published_hf_artifact( - config, - run_id="run-a", - api=api, - latest_loader=lambda _config: {"run_id": "ignored"}, - ) - - 
assert result["status"] == "failed" - assert "runs/run-a/manifest.json" in result["diagnostics"]["missing"] - assert "manifest.json" in result["dataset"]["missing"] diff --git a/tests/pipelines/test_imputation_ablation.py b/tests/pipelines/test_imputation_ablation.py deleted file mode 100644 index b2db832b..00000000 --- a/tests/pipelines/test_imputation_ablation.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Tests for imputation ablation scorecards.""" - -from __future__ import annotations - -import json - -import pandas as pd -import pytest - -from microplex_us.pipelines.imputation_ablation import ( - ImputationAblationSliceSpec, - ImputationAblationVariant, - default_imputation_ablation_variants, - score_imputation_ablation_variants, -) - - -def test_score_imputation_ablation_variants_ranks_structured_candidate() -> None: - observed = pd.DataFrame( - { - "age_band": ["adult", "adult", "senior", "senior"], - "tax_unit_is_joint": [0, 1, 0, 1], - "weight": [1.0, 2.0, 1.0, 2.0], - "employment_income": [0.0, 100.0, 0.0, 200.0], - } - ) - broad = observed.copy() - broad["employment_income"] = [50.0, 50.0, 50.0, 50.0] - structured = observed.copy() - structured["employment_income"] = [0.0, 90.0, 0.0, 210.0] - - report = score_imputation_ablation_variants( - observed_frame=observed, - imputed_frames={ - "broad_common_qrf": broad, - "structured_pe_conditioning": structured, - }, - target_variables=("employment_income",), - slice_specs=( - ImputationAblationSliceSpec( - name="age_by_joint", - columns=("age_band", "tax_unit_is_joint"), - ), - ), - weight_column="weight", - ) - - broad_metrics = report.variants["broad_common_qrf"].aggregate_metrics - structured_metrics = report.variants["structured_pe_conditioning"].aggregate_metrics - assert structured_metrics["mean_weighted_mae"] < broad_metrics["mean_weighted_mae"] - assert structured_metrics["mean_support_f1"] > broad_metrics["mean_support_f1"] - assert ( - structured_metrics["mean_slice_positive_rate_delta"] - < 
broad_metrics["mean_slice_positive_rate_delta"] - ) - assert report.variants["structured_pe_conditioning"].variant.semantic_guards is True - - -def test_score_imputation_ablation_report_is_json_serializable() -> None: - observed = pd.DataFrame( - { - "slice": ["a", "a", "b"], - "weight": [1.0, 1.0, 1.0], - "rent": [0.0, 100.0, 200.0], - } - ) - imputed = observed.copy() - imputed["rent"] = [0.0, 120.0, 180.0] - - report = score_imputation_ablation_variants( - observed_frame=observed, - imputed_frames={"custom": imputed}, - variants=( - ImputationAblationVariant( - name="custom", - description="custom test", - condition_selection="fixture", - ), - ), - target_variables=("rent",), - slice_specs=(ImputationAblationSliceSpec(name="slice", columns=("slice",)),), - weight_column="weight", - post_calibration_metrics={ - "custom": { - "post_calibration_native_loss": 0.4, - "household_effective_sample_size": 2.5, - } - }, - ) - - payload = report.to_dict() - assert payload["variants"]["custom"]["post_calibration_metrics"] == { - "post_calibration_native_loss": 0.4, - "household_effective_sample_size": 2.5, - } - json.dumps(payload) - - -def test_score_imputation_ablation_scores_matching_non_default_indexes() -> None: - observed = pd.DataFrame( - { - "slice": ["a", "b"], - "weight": [1.0, 1.0], - "rent": [100.0, 200.0], - }, - index=[10, 11], - ) - imputed = pd.DataFrame( - { - "rent": [100.0, 200.0], - }, - index=[10, 11], - ) - - report = score_imputation_ablation_variants( - observed_frame=observed, - imputed_frames={"candidate": imputed}, - target_variables=("rent",), - slice_specs=(ImputationAblationSliceSpec(name="slice", columns=("slice",)),), - weight_column="weight", - ) - - score = report.variants["candidate"].target_scores["rent"] - assert score.mean_absolute_error == 0.0 - assert score.weighted_mean_absolute_error == 0.0 - assert score.weighted_total_relative_error == 0.0 - slice_score = report.variants["candidate"].slice_scores[0] - assert 
slice_score.total_js_divergence == 0.0 - - -def test_score_imputation_ablation_rejects_mismatched_indexes() -> None: - observed = pd.DataFrame({"weight": [1.0, 1.0], "rent": [100.0, 200.0]}) - imputed = pd.DataFrame( - { - "rent": [100.0, 200.0], - }, - index=[10, 11], - ) - - with pytest.raises(ValueError, match="matching indexes"): - score_imputation_ablation_variants( - observed_frame=observed, - imputed_frames={"candidate": imputed}, - target_variables=("rent",), - weight_column="weight", - ) - - -def test_default_imputation_ablation_variants_encode_hypothesis() -> None: - variants = { - variant.name: variant for variant in default_imputation_ablation_variants() - } - - assert set(variants) == { - "broad_common_qrf", - "structured_pe_conditioning", - "broad_common_with_guards", - "rich_predictor_stress", - } - assert variants["broad_common_qrf"].condition_selection == "all_shared" - structured = variants["structured_pe_conditioning"] - assert structured.condition_selection == "pe_prespecified" - assert "age" in structured.primary_predictors - assert "tax_unit_is_joint" in structured.hard_gate_columns - assert structured.support_mapping == "zero_inflated_positive" diff --git a/tests/pipelines/test_local_reweighting.py b/tests/pipelines/test_local_reweighting.py deleted file mode 100644 index dcf3809c..00000000 --- a/tests/pipelines/test_local_reweighting.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import annotations - -import pandas as pd -from microplex.core import EntityType -from microplex.targets import FilterOperator, TargetFilter, TargetSpec - -from microplex_us.pipelines import reweight_us_household_targets -from microplex_us.policyengine.us import PolicyEngineUSEntityTableBundle - - -def test_reweight_us_household_targets_updates_household_and_person_weights(): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": 
[1, 2, 3], - "household_id": [10, 10, 20], - "weight": [1.0, 1.0, 1.0], - "age": [5, 8, 30], - "state_fips": ["06", "06", "36"], - } - ), - ) - targets = [ - TargetSpec( - name="state06_age0_10", - entity=EntityType.PERSON, - value=4.0, - period=2024, - aggregation="count", - filters=( - TargetFilter("state_fips", FilterOperator.EQ, "06"), - TargetFilter("age", FilterOperator.GTE, 0), - TargetFilter("age", FilterOperator.LT, 10), - ), - ) - ] - - result = reweight_us_household_targets( - tables, - targets=targets, - ) - - assert result.tables.households["household_weight"].tolist() == [2.0, 1.0] - assert result.tables.persons["weight"].tolist() == [2.0, 2.0, 1.0] - assert result.diagnostics.constraint_count == 1 - assert result.compilation.skipped_targets == () diff --git a/tests/pipelines/test_mp300k_artifact_gates.py b/tests/pipelines/test_mp300k_artifact_gates.py deleted file mode 100644 index 82419d85..00000000 --- a/tests/pipelines/test_mp300k_artifact_gates.py +++ /dev/null @@ -1,1832 +0,0 @@ -"""Tests for mp-300k artifact quality gates.""" - -from __future__ import annotations - -import json -from pathlib import Path - -import h5py -import numpy as np -import pytest - -from microplex_us.pipelines.mp300k_artifact_gates import ( - main, - write_mp300k_artifact_gate_report, -) -from microplex_us.pipelines.mp_benchmark_manifest import ( - FROZEN_PRODUCTION_ECPS_BASELINE_ENHANCED_CPS_NATIVE_LOSS, - FROZEN_PRODUCTION_ECPS_BASELINE_HOLDOUT_LOSS, - FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - FROZEN_PRODUCTION_ECPS_BASELINE_UNWEIGHTED_MSRE, - FROZEN_PRODUCTION_ECPS_SCORING_CONFIG_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_COUNT, - FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_NAMES_SHA256, -) -from microplex_us.policyengine.us import write_policyengine_us_time_period_dataset - -_EXPORT_CONTRACT_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "ecps_export_contract.json" -) - - -def 
_write_minimal_policyengine_dataset(path: Path, *, period: int = 2024) -> Path: - arrays = { - "household_id": {str(period): np.asarray([1, 2])}, - "household_weight": {str(period): np.asarray([10.0, 20.0])}, - "person_id": {str(period): np.asarray([1, 2, 3])}, - "person_household_id": {str(period): np.asarray([1, 1, 2])}, - "tax_unit_id": {str(period): np.asarray([10, 20])}, - "person_tax_unit_id": {str(period): np.asarray([10, 10, 20])}, - "spm_unit_id": {str(period): np.asarray([100, 200])}, - "person_spm_unit_id": {str(period): np.asarray([100, 100, 200])}, - "family_id": {str(period): np.asarray([1000, 2000])}, - "person_family_id": {str(period): np.asarray([1000, 1000, 2000])}, - "marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - "person_marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - } - return write_policyengine_us_time_period_dataset(arrays, path) - - -def _write_contract_policyengine_dataset(path: Path, *, period: int = 2024) -> Path: - """Write a structurally minimal H5 with the frozen export contract columns.""" - _write_minimal_policyengine_dataset(path, period=period) - contract = json.loads(_EXPORT_CONTRACT_PATH.read_text()) - with h5py.File(path, "a") as handle: - for variable in contract["required"]: - if variable in handle: - continue - group = handle.create_group(variable) - group.create_dataset(str(period), data=np.asarray([0.0, 0.0])) - return path - - -def _remove_period_dataset(path: Path, variable: str) -> None: - with h5py.File(path, "a") as handle: - if variable in handle: - del handle[variable] - - -def _write_incomplete_policyengine_dataset(path: Path, *, period: int = 2024) -> Path: - _write_contract_policyengine_dataset(path, period=period) - with h5py.File(path, "a") as handle: - del handle["person_household_id"] - return path - - -def _add_period_dataset( - path: Path, - variable: str, - values: list[object] | np.ndarray, - *, - period: int = 2024, -) -> None: - with h5py.File(path, "a") as 
handle: - if variable in handle: - del handle[variable] - group = handle.create_group(variable) - group.create_dataset(str(period), data=np.asarray(values)) - - -def _write_artifact_manifest( - artifact_dir: Path, - *, - baseline_dataset: Path | None = None, - source_weight_diagnostics: bool = True, -) -> None: - artifacts = {"policyengine_dataset": "candidate.h5"} - if source_weight_diagnostics: - (artifact_dir / "source_weight_diagnostics.json").write_text( - json.dumps(_source_weight_diagnostics_payload()) - ) - artifacts["source_weight_diagnostics"] = "source_weight_diagnostics.json" - manifest = { - "created_at": "2026-05-27T00:00:00+00:00", - "config": { - "policyengine_baseline_dataset": str(baseline_dataset) - if baseline_dataset is not None - else None, - "policyengine_dataset_year": 2024, - }, - "artifacts": artifacts, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - - -def _write_benchmark_manifest(path: Path) -> None: - path.write_text( - json.dumps( - { - "schema_version": 1, - "certificate_type": "frozen_production_ecps_baseline", - "period": 2024, - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_surface": { - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_count": 150, - "target_names_sha256": "d" * 64, - }, - "scoring_config": {"sha256": "e" * 64}, - "baseline_metrics": { - "baseline_enhanced_cps_native_loss": 0.20, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, - }, - "baseline_dataset": { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - }, - "policyengine_us_data": { - "repo": "PolicyEngine/policyengine-us-data", - "commit": "b" * 40, - }, - "policyengine_us": {"version": "1.587.0"}, - "target_db": { - "path": "/tmp/policyengine_targets.db", - "sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - }, - } - ) - ) - - -def _arch_coverage_payload( - *, - profile_name: str = "pe_native_broad_source_backed", - 
period: int = 2024, - target_cell_count: int = 183, - uncovered_cell_count: int = 0, -) -> dict[str, object]: - covered_cell_count = target_cell_count - uncovered_cell_count - return { - "profile_name": profile_name, - "period": period, - "target_cell_count": target_cell_count, - "covered_cell_count": covered_cell_count, - "uncovered_cell_count": uncovered_cell_count, - "coverage_rate": ( - covered_cell_count / target_cell_count if target_cell_count else 0.0 - ), - } - - -def _source_weight_diagnostics_payload( - *, - puf_support_share: float = 0.05, -) -> dict[str, object]: - return { - "schema_version": 1, - "summary": { - "max_source_household_weight_share": 0.85, - "puf_support_household_weight_share": puf_support_share, - }, - "sources": [ - { - "source_name": "cps_asec", - "source_class": "base", - "household_weight_share": 0.85, - }, - { - "source_name": "irs_soi_puf_support_clone", - "source_class": "puf_support", - "household_weight_share": puf_support_share, - }, - { - "source_name": "forbes_fixed_spine", - "source_class": "fixed_spine", - "household_weight_share": 0.10, - }, - ], - } - - -def _sound_ecps_comparison_payload( - *, - candidate_loss: float = 0.12, - baseline_loss: float = 0.20, - candidate_holdout_loss: float = 0.03, - baseline_holdout_loss: float = 0.04, - candidate_unweighted_msre: float = 0.10, - baseline_unweighted_msre: float = 0.17, - target_count: int = 150, - target_names_sha256: str = "d" * 64, - scoring_config_sha256: str = "e" * 64, - policyengine_us_data_commit: str = "b" * 40, - policyengine_us_version: str = "1.587.0", -) -> dict[str, object]: - fit_config = { - "lambda_l0": 0.0, - "lambda_l2": 0.0, - "use_gates": False, - "epochs": 2000, - } - protected_family_losses = { - family: {"candidate_loss": 0.01, "baseline_loss": 0.01} - for family in ( - "ssi", - "snap", - "wages", - "self_employment_income", - "capital_gains", - "interest", - "dividends", - "retirement_income", - "disability", - "household_net_income", - ) - } - 
family_breakdown = [ - { - "family": family, - "candidate_loss_contribution": 0.01, - "baseline_loss_contribution": 0.01, - } - for family in ( - "state_agi_distribution", - "state_age_distribution", - "national_ssa", - "national_irs_other", - "state_aca_spending", - ) - ] - return { - "frozen_ecps_baseline_certificate": { - "schema_version": 1, - "certificate_type": "frozen_production_ecps_baseline", - "period": 2024, - "baseline_dataset": { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - }, - "target_db": { - "path": "/tmp/policyengine_targets.db", - "sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - }, - "policyengine_us_data": { - "repo": "PolicyEngine/policyengine-us-data", - "commit": policyengine_us_data_commit, - }, - "policyengine_us": {"version": policyengine_us_version}, - "target_surface": { - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_count": target_count, - "target_names_sha256": target_names_sha256, - }, - "scoring_config": {"sha256": scoring_config_sha256}, - "baseline_metrics": { - "baseline_enhanced_cps_native_loss": baseline_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "baseline_unweighted_msre": baseline_unweighted_msre, - }, - }, - "summary": { - "candidate_enhanced_cps_native_loss": candidate_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": candidate_loss - baseline_loss, - "candidate_beats_baseline": candidate_loss < baseline_loss, - "n_targets_kept": 150, - "candidate_household_count": 41_314, - "baseline_household_count": 41_314, - "candidate_refit_config": fit_config, - "baseline_refit_config": fit_config, - "refit_objective_matches_scoring": True, - "ecps_refit_effective_passed": True, - "candidate_holdout_loss": candidate_holdout_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "candidate_unweighted_msre": candidate_unweighted_msre, - "baseline_unweighted_msre": baseline_unweighted_msre, - 
"holdout_target_fraction": 0.2, - "protected_family_losses": protected_family_losses, - }, - "score": {"family_breakdown": family_breakdown}, - } - - -def test_write_mp300k_artifact_gate_report_passes_with_all_evidence(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={ - "candidate_seconds": 11.0, - "baseline_seconds": 10.0, - }, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - ) - - record = json.loads(report_path.read_text()) - manifest = json.loads((artifact_dir / "manifest.json").read_text()) - - assert record["summary"]["status"] == "passed" - assert record["gates"]["candidate_artifact"]["status"] == "pass" - assert record["gates"]["compatibility"]["metrics"]["household_count"] == 2 - assert record["gates"]["compatibility"]["metrics"]["person_count"] == 3 - assert record["gates"]["column_contract"]["status"] == "pass" - assert record["gates"]["export_support"]["status"] == "pass" - assert record["gates"]["export_lineage"]["status"] == "pass" - assert record["gates"]["artifact_size"]["status"] == "pass" - assert record["gates"]["ecps_comparison"]["status"] == "pass" - assert record["gates"]["arch_target_coverage"]["status"] == "pass" - assert record["gates"]["runtime"]["status"] == "pass" - assert record["gates"]["runtime"]["metrics"]["runtime_ratio"] == 1.1 - assert record["gates"]["source_weight_diagnostics"]["status"] == "pass" - assert ( - 
record["gates"]["source_weight_diagnostics"]["metrics"][ - "puf_support_household_weight_share" - ] - == 0.05 - ) - assert record["gates"]["benchmark_manifest"]["status"] == "pass" - assert record["candidate_dataset"]["path"] == str(candidate_dataset.resolve()) - assert ( - manifest["artifacts"]["mp300k_artifact_gates"] == "mp300k_artifact_gates.json" - ) - assert manifest["mp300k_artifact_gates"]["status"] == "passed" - - -def test_write_mp300k_artifact_gate_report_uses_packaged_benchmark_manifest( - tmp_path, -): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload( - candidate_loss=0.01, - baseline_loss=FROZEN_PRODUCTION_ECPS_BASELINE_ENHANCED_CPS_NATIVE_LOSS, - candidate_holdout_loss=0.001, - baseline_holdout_loss=FROZEN_PRODUCTION_ECPS_BASELINE_HOLDOUT_LOSS, - candidate_unweighted_msre=0.1, - baseline_unweighted_msre=FROZEN_PRODUCTION_ECPS_BASELINE_UNWEIGHTED_MSRE, - target_count=FROZEN_PRODUCTION_ECPS_TARGET_COUNT, - target_names_sha256=FROZEN_PRODUCTION_ECPS_TARGET_NAMES_SHA256, - scoring_config_sha256=FROZEN_PRODUCTION_ECPS_SCORING_CONFIG_SHA256, - policyengine_us_data_commit="f7458313c86fa580fb1e43a2f18252d67cf76e4a", - policyengine_us_version="1.715.2", - ), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - benchmark_gate = record["gates"]["benchmark_manifest"] - - assert record["summary"]["status"] == "passed" - assert benchmark_gate["status"] == "pass" - assert record["benchmark_manifest"]["packaged_default"] is True - assert 
benchmark_gate["details"]["present_evidence"][ - "target_surface.target_count" - ] == FROZEN_PRODUCTION_ECPS_TARGET_COUNT - - -def test_benchmark_manifest_gate_requires_pinned_release_evidence(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - benchmark_manifest.write_text(json.dumps({"schema_version": 1, "frozen": True})) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - benchmark_gate = record["gates"]["benchmark_manifest"] - - assert record["summary"]["status"] == "failed" - assert benchmark_gate["status"] == "fail" - assert benchmark_gate["details"]["missing_evidence"] == [ - "certificate_type", - "period", - "baseline_dataset.path", - "baseline_dataset.sha256", - "policyengine_us_data.commit", - "policyengine_us.version", - "target_surface.target_profile", - "target_surface.target_scope", - "target_surface.target_count", - "target_surface.target_names_sha256", - "scoring_config.sha256", - "baseline_metrics.baseline_enhanced_cps_native_loss", - "baseline_metrics.baseline_holdout_loss", - "baseline_metrics.baseline_unweighted_msre", - "target_db.path", - "target_db.sha256", - ] - - -def test_ecps_comparison_gate_requires_frozen_baseline_certificate(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = 
_write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - del payload["frozen_ecps_baseline_certificate"] - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "frozen_ecps_baseline_certificate" in ecps_gate["summary"] - assert ecps_gate["details"]["frozen_ecps_baseline_certificate"][ - "missing_evidence" - ] == ["frozen_ecps_baseline_certificate"] - - -def test_ecps_comparison_gate_rejects_baseline_certificate_metric_drift(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["frozen_ecps_baseline_certificate"]["baseline_metrics"][ - "baseline_enhanced_cps_native_loss" - ] = 0.05 - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - 
compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - mismatches = ecps_gate["details"]["frozen_ecps_baseline_certificate"][ - "mismatches" - ] - assert { - "field": "baseline_metrics.baseline_enhanced_cps_native_loss", - "summary_value": 0.2, - "certificate_value": 0.05, - } in mismatches - assert { - "field": "baseline_metrics.baseline_enhanced_cps_native_loss", - "benchmark_manifest_value": 0.2, - "certificate_value": 0.05, - } in mismatches - - -def test_ecps_comparison_gate_rejects_benchmark_certificate_mismatch(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["frozen_ecps_baseline_certificate"]["baseline_dataset"]["sha256"] = "f" * 64 - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - mismatches = ecps_gate["details"]["frozen_ecps_baseline_certificate"][ - "mismatches" - ] - assert any(item["field"] == "baseline_dataset.sha256" for item in mismatches) - - -def 
test_ecps_comparison_gate_rejects_benchmark_metric_mismatch(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - manifest = json.loads(benchmark_manifest.read_text()) - manifest["baseline_metrics"]["baseline_unweighted_msre"] = 999.0 - benchmark_manifest.write_text(json.dumps(manifest)) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - mismatches = ecps_gate["details"]["frozen_ecps_baseline_certificate"][ - "mismatches" - ] - assert any( - item["field"] == "baseline_metrics.baseline_unweighted_msre" - for item in mismatches - ) - - -def test_ecps_comparison_gate_rejects_self_consistent_noncanonical_pins(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - manifest = json.loads(benchmark_manifest.read_text()) - manifest["baseline_dataset"]["sha256"] = "a" * 64 - manifest["target_db"]["sha256"] = "c" * 64 - manifest["target_scope"] = 
"national" - manifest["target_surface"]["target_scope"] = "national" - benchmark_manifest.write_text(json.dumps(manifest)) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["frozen_ecps_baseline_certificate"]["baseline_dataset"]["sha256"] = ( - "a" * 64 - ) - payload["frozen_ecps_baseline_certificate"]["target_db"]["sha256"] = "c" * 64 - payload["frozen_ecps_baseline_certificate"]["target_surface"][ - "target_scope" - ] = "national" - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - benchmark_gate = record["gates"]["benchmark_manifest"] - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert benchmark_gate["status"] == "fail" - assert ecps_gate["status"] == "fail" - assert benchmark_gate["details"]["production_pin_mismatches"] - assert ecps_gate["details"]["frozen_ecps_baseline_certificate"]["mismatches"] - - -@pytest.mark.parametrize( - ("mutate_certificate", "expected_field"), - [ - ( - lambda certificate: certificate.update( - {"certificate_type": "live_recomputed_ecps_baseline"} - ), - "certificate_type", - ), - (lambda certificate: certificate.update({"period": 2025}), "period"), - ( - lambda certificate: certificate["policyengine_us"].update( - {"version": "1.999.0"} - ), - "policyengine_us.version", - ), - ( - lambda certificate: certificate["target_surface"].update( - {"target_profile": "pe_native_narrow"} - ), - "target_surface.target_profile", - ), - ( - lambda certificate: certificate["target_surface"].update( - {"target_scope": "state"} - ), - "target_surface.target_scope", - ), - ( - lambda 
certificate: certificate["target_surface"].update( - {"target_count": 149} - ), - "target_surface.target_count", - ), - ( - lambda certificate: certificate["target_surface"].update( - {"target_names_sha256": "f" * 64} - ), - "target_surface.target_names_sha256", - ), - ( - lambda certificate: certificate["scoring_config"].update( - {"sha256": "f" * 64} - ), - "scoring_config.sha256", - ), - ], -) -def test_ecps_comparison_gate_rejects_stale_frozen_surface_certificate( - tmp_path, - mutate_certificate, - expected_field, -): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - mutate_certificate(payload["frozen_ecps_baseline_certificate"]) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - certificate_details = ecps_gate["details"]["frozen_ecps_baseline_certificate"] - mismatch_fields = { - mismatch["field"] for mismatch in certificate_details["mismatches"] - } - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert expected_field in ( - mismatch_fields | set(certificate_details["missing_evidence"]) - ) - - -def test_core_benchmark_floor_accepts_aca_enrollment_family_alias(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - 
_write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - comparison = _sound_ecps_comparison_payload() - family_breakdown = comparison["score"]["family_breakdown"] - for row in family_breakdown: - if row["family"] == "state_aca_spending": - row["family"] = "state_aca_enrollment" - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=comparison, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - comparison_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "passed" - assert comparison_gate["status"] == "pass" - assert ( - comparison_gate["details"]["core_benchmark_family_floor"]["missing_families"] - == [] - ) - - -def test_column_contract_gate_rejects_missing_ecps_contract_column(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _remove_period_dataset(candidate_dataset, "age") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - 
benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - column_gate = record["gates"]["column_contract"] - - assert record["summary"]["status"] == "failed" - assert column_gate["status"] == "fail" - assert column_gate["metrics"]["missing_contract_column_count"] == 1 - assert column_gate["details"]["missing_contract_columns"] == ["age"] - - -def test_export_support_gate_rejects_ecps_populated_numeric_filler(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset(candidate_dataset, "hourly_wage", [0.0, 0.0, 0.0]) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _add_period_dataset(baseline_dataset, "hourly_wage", [0.0, 25.0, 0.0]) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - support_gate = record["gates"]["export_support"] - - assert record["summary"]["status"] == "failed" - assert support_gate["status"] == "fail" - assert support_gate["metrics"]["unsupported_populated_export_column_count"] == 1 - assert support_gate["details"]["issues"][0]["column"] == "hourly_wage" - assert support_gate["details"]["issues"][0]["requirement"] == "numeric_positive" - - -def test_export_support_gate_requires_signed_self_employment_support(tmp_path): - artifact_dir = tmp_path / "artifact" - 
artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset( - candidate_dataset, - "self_employment_income_before_lsr", - [0.0, 5_000.0, 0.0], - ) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _add_period_dataset( - baseline_dataset, - "self_employment_income_before_lsr", - [-2_000.0, 5_000.0, 0.0], - ) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - support_gate = record["gates"]["export_support"] - - assert record["summary"]["status"] == "failed" - assert support_gate["status"] == "fail" - assert support_gate["metrics"]["unsupported_populated_export_column_count"] == 1 - assert support_gate["details"]["issues"][0]["column"] == ( - "self_employment_income_before_lsr" - ) - assert support_gate["details"]["issues"][0]["requirement"] == "numeric_signed" - - -def test_export_support_gate_rejects_ecps_varied_categorical_filler(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset(candidate_dataset, "is_tipped_occupation", [False, False]) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _add_period_dataset(baseline_dataset, "is_tipped_occupation", [False, True]) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - 
_write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - support_gate = record["gates"]["export_support"] - - assert record["summary"]["status"] == "failed" - assert support_gate["status"] == "fail" - assert support_gate["details"]["issues"][0]["column"] == "is_tipped_occupation" - assert ( - support_gate["details"]["issues"][0]["requirement"] == "categorical_variation" - ) - - -def test_export_support_gate_ignores_ecps_filler_columns(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset(candidate_dataset, "second_home_mortgage_interest", [0.0, 0.0]) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _add_period_dataset(baseline_dataset, "second_home_mortgage_interest", [0.0, 0.0]) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - support_gate = record["gates"]["export_support"] - - assert record["summary"]["status"] == "passed" - assert support_gate["status"] == "pass" - assert ( - 
"second_home_mortgage_interest" - in support_gate["details"]["baseline_filler_columns"] - ) - - -def test_export_lineage_gate_rejects_ecps_populated_default_only_column(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset( - candidate_dataset, - "is_wic_at_nutritional_risk", - [False, True], - ) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _add_period_dataset( - baseline_dataset, - "is_wic_at_nutritional_risk", - [False, True], - ) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - support_gate = record["gates"]["export_support"] - lineage_gate = record["gates"]["export_lineage"] - - assert record["summary"]["status"] == "failed" - assert support_gate["status"] == "pass" - assert lineage_gate["status"] == "fail" - assert lineage_gate["details"]["issues"] == [ - { - "column": "is_wic_at_nutritional_risk", - "ecps_support_requirement": "categorical_variation", - "export_path_status": "default_only", - "issue": "ecps_populated_export_has_no_source_lineage", - } - ] - - -def test_column_contract_gate_reports_extra_candidate_columns(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset(candidate_dataset, "filing_status", [1, 2]) - baseline_dataset = 
_write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - column_gate = record["gates"]["column_contract"] - - assert record["summary"]["status"] == "passed" - assert column_gate["status"] == "pass" - assert column_gate["metrics"]["extra_unknown_column_count"] == 1 - assert column_gate["metrics"]["spec_variable_manifest_count"] == 278 - assert column_gate["details"]["extra_unknown_columns"] == ["filing_status"] - - -def test_column_contract_gate_rejects_renamed_candidate_columns(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _remove_period_dataset(candidate_dataset, "taxpayer_id_type") - _add_period_dataset( - candidate_dataset, - "taxpayer_id_type_reported", - [0.0, 0.0, 0.0], - ) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record 
= json.loads(report_path.read_text()) - column_gate = record["gates"]["column_contract"] - - assert record["summary"]["status"] == "failed" - assert column_gate["status"] == "fail" - assert column_gate["details"]["missing_contract_columns"] == ["taxpayer_id_type"] - assert column_gate["details"]["extra_unknown_columns"] == [ - "taxpayer_id_type_reported" - ] - - -def test_column_contract_gate_excludes_formula_owned_candidate_outputs( - tmp_path, -): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _add_period_dataset( - candidate_dataset, - "weeks_worked", - [0.0, 0.0], - ) - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - column_gate = record["gates"]["column_contract"] - - assert record["summary"]["status"] == "passed" - assert column_gate["status"] == "pass" - assert column_gate["metrics"]["excluded_contract_column_count"] == 1 - assert column_gate["details"]["extra_unknown_columns"] == [] - - -def test_column_contract_gate_rejects_missing_legacy_baseline_columns(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - candidate_dataset = _write_contract_policyengine_dataset( - artifact_dir / "candidate.h5" - ) - _remove_period_dataset(candidate_dataset, "taxpayer_id_type") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / 
"baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - column_gate = record["gates"]["column_contract"] - - assert record["summary"]["status"] == "failed" - assert column_gate["status"] == "fail" - assert column_gate["details"]["missing_contract_columns"] == ["taxpayer_id_type"] - - -def test_source_weight_diagnostics_gate_rejects_missing_sidecar(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest( - artifact_dir, - baseline_dataset=baseline_dataset, - source_weight_diagnostics=False, - ) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - source_gate = record["gates"]["source_weight_diagnostics"] - - assert record["summary"]["status"] == "incomplete" - assert source_gate["status"] == "unmeasured" - assert "source_weight_diagnostics" in record["summary"]["unmeasured_required_gates"] - - -def 
test_source_weight_diagnostics_gate_rejects_puf_support_dominance(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - source_weight_diagnostics_payload=_source_weight_diagnostics_payload( - puf_support_share=0.40 - ), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - source_gate = record["gates"]["source_weight_diagnostics"] - - assert record["summary"]["status"] == "failed" - assert source_gate["status"] == "fail" - assert source_gate["metrics"]["puf_support_household_weight_share"] == 0.40 - assert source_gate["details"]["failures"] == [ - "support_household_weight_share", - "puf_support_household_weight_share", - ] - - -def test_arch_target_coverage_gate_rejects_uncovered_source_backed_cells(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - 
arch_coverage_payload=_arch_coverage_payload(uncovered_cell_count=1), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - coverage_gate = record["gates"]["arch_target_coverage"] - - assert record["summary"]["status"] == "failed" - assert coverage_gate["status"] == "fail" - assert coverage_gate["details"]["failures"] == [ - "uncovered_cell_count", - "covered_cell_count", - "coverage_rate", - ] - - -def test_benchmark_manifest_gate_rejects_dirty_us_data_pin(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - payload = json.loads(benchmark_manifest.read_text()) - payload["policyengine_us_data"]["dirty"] = True - benchmark_manifest.write_text(json.dumps(payload)) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - benchmark_gate = record["gates"]["benchmark_manifest"] - - assert record["summary"]["status"] == "failed" - assert benchmark_gate["status"] == "fail" - assert benchmark_gate["details"]["missing_evidence"] == [ - "policyengine_us_data.clean" - ] - - -def test_write_mp300k_artifact_gate_report_fails_missing_structural_array(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - 
_write_incomplete_policyengine_dataset(artifact_dir / "candidate.h5") - _write_artifact_manifest(artifact_dir) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["gates"]["candidate_artifact"]["status"] == "pass" - assert record["gates"]["artifact_size"]["status"] == "unmeasured" - assert record["gates"]["compatibility"]["status"] == "fail" - assert record["gates"]["compatibility"]["details"]["missing_arrays"] == [ - "person_household_id" - ] - assert record["gates"]["ecps_comparison"]["status"] == "unmeasured" - - -def test_write_mp300k_artifact_gate_report_fails_invalid_entity_join(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - with h5py.File(artifact_dir / "candidate.h5", "a") as handle: - handle["person_household_id"]["2024"][2] = 999 - _write_artifact_manifest(artifact_dir) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["gates"]["compatibility"]["status"] == "fail" - assert record["gates"]["compatibility"]["details"][ - "invalid_person_entity_links" - ] == {"person_household_id": [999]} - - -def test_write_mp300k_artifact_gate_report_fails_source_diagnostic_variable(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - with h5py.File(artifact_dir / "candidate.h5", "a") as handle: - diagnostic = handle.create_group("ssi_reported") - diagnostic.create_dataset("2024", data=np.asarray([1.0, 0.0, 0.0])) - _write_artifact_manifest(artifact_dir) - - report_path = write_mp300k_artifact_gate_report( 
- artifact_dir, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["gates"]["compatibility"]["status"] == "fail" - assert record["gates"]["compatibility"]["details"][ - "forbidden_source_diagnostic_variables" - ] == ["ssi_reported"] - - -def test_write_mp300k_artifact_gate_report_fails_nonfinite_numeric_value(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - with h5py.File(artifact_dir / "candidate.h5", "a") as handle: - income = handle.create_group("employment_income") - income.create_dataset("2024", data=np.asarray([1.0, np.nan, 3.0])) - _write_artifact_manifest(artifact_dir) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["gates"]["compatibility"]["status"] == "fail" - assert record["gates"]["compatibility"]["details"]["nonfinite_numeric_arrays"] == { - "employment_income": 1 - } - - -def test_write_mp300k_artifact_gate_report_reports_missing_candidate(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_artifact_manifest(artifact_dir) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["candidate_dataset"]["exists"] is False - assert record["gates"]["candidate_artifact"]["status"] == "fail" - - -def test_main_writes_artifact_gate_report_from_payload_files(tmp_path, capsys): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = 
_write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - ecps_comparison_path = tmp_path / "ecps_comparison.json" - ecps_comparison_path.write_text( - json.dumps(_sound_ecps_comparison_payload(candidate_loss=0.10)) - ) - runtime_path = tmp_path / "runtime.json" - runtime_path.write_text( - json.dumps({"runtime_ratio": 1.2, "runtime_ratio_threshold": 1.25}) - ) - arch_coverage_path = tmp_path / "arch_coverage.json" - arch_coverage_path.write_text(json.dumps(_arch_coverage_payload())) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - - exit_code = main( - [ - "--artifact-dir", - str(artifact_dir), - "--ecps-comparison-json", - str(ecps_comparison_path), - "--runtime-smoke-json", - str(runtime_path), - "--arch-coverage-json", - str(arch_coverage_path), - "--benchmark-manifest", - str(benchmark_manifest), - ] - ) - - printed_path = Path(capsys.readouterr().out.strip()) - record = json.loads(printed_path.read_text()) - - assert exit_code == 0 - assert printed_path == artifact_dir / "mp300k_artifact_gates.json" - assert record["summary"]["status"] == "passed" - - -def test_ecps_comparison_can_become_nonblocking(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - runtime_smoke_payload={ - "runtime_ratio": 1.0, - "runtime_ratio_threshold": 1.25, - }, - arch_coverage_payload=_arch_coverage_payload(), - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - require_ecps_comparison=False, - 
update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "passed" - assert "ecps_comparison" not in record["required_gates"] - assert record["gates"]["ecps_comparison"]["status"] == "unmeasured" - assert record["summary"]["unmeasured_optional_gates"] == ["ecps_comparison"] - - -def test_runtime_gate_accepts_repeated_loader_smoke_payload(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(candidate_loss=0.10), - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={ - "median_runtime_ratio": 1.19, - "candidate": {"median_elapsed_seconds": 0.137}, - "baseline": {"median_elapsed_seconds": 0.115}, - }, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "passed" - assert record["gates"]["runtime"]["status"] == "pass" - assert record["gates"]["runtime"]["metrics"]["runtime_ratio"] == 1.19 - assert record["gates"]["runtime"]["metrics"]["candidate_seconds"] == 0.137 - assert record["gates"]["runtime"]["metrics"]["baseline_seconds"] == 0.115 - - -def test_ecps_comparison_accepts_existing_broad_loss_array_payload(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / 
"benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=[ - { - "broad_loss": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.20, - "enhanced_cps_native_loss_delta": 0.05, - "candidate_beats_baseline": False, - } - } - ], - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - - assert record["summary"]["status"] == "failed" - assert record["gates"]["ecps_comparison"]["status"] == "fail" - assert ( - record["gates"]["ecps_comparison"]["metrics"][ - "candidate_enhanced_cps_native_loss" - ] - == 0.25 - ) - - -def test_ecps_comparison_rejects_one_sided_unmatched_refit_win(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload={ - "summary": { - "candidate_enhanced_cps_native_loss": 0.09, - "baseline_enhanced_cps_native_loss": 0.16, - "enhanced_cps_native_loss_delta": -0.07, - "candidate_beats_baseline": True, - "candidate_household_count": 120_000, - "baseline_household_count": 41_314, - "score_candidate_only": True, - } - }, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - 
update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "matched_household_count" in ecps_gate["summary"] - assert ecps_gate["details"]["score_candidate_only"] is True - - -def test_ecps_comparison_rejects_protected_family_regression(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["summary"]["protected_family_losses"]["ssi"] = { - "candidate_loss": 0.0301, - "baseline_loss": 0.02, - } - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "protected_family_floors" in ecps_gate["summary"] - assert ecps_gate["details"]["protected_family_floor"]["regressions"] == [ - { - "family": "ssi", - "candidate_loss": 0.0301, - "baseline_loss": 0.02, - "loss_delta": pytest.approx(0.0101), - "allowed_delta": 0.005, - } - ] - - -def test_ecps_comparison_rejects_core_benchmark_family_regression(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - 
baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["score"]["family_breakdown"][0] = { - "family": "state_agi_distribution", - "candidate_loss_contribution": 0.0601, - "baseline_loss_contribution": 0.05, - } - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "core_benchmark_family_floors" in ecps_gate["summary"] - assert ecps_gate["details"]["core_benchmark_family_floor"]["regressions"] == [ - { - "family": "state_agi_distribution", - "candidate_loss": 0.0601, - "baseline_loss": 0.05, - "loss_delta": pytest.approx(0.0101), - "allowed_delta": 0.005, - } - ] - - -def test_ecps_comparison_rejects_missing_ecps_refit_effective(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["summary"]["ecps_refit_effective_passed"] = False - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - 
ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "ecps_refit_effective" in ecps_gate["summary"] - assert ecps_gate["details"]["ecps_refit_effective_passed"] is False - - -def test_ecps_comparison_requires_measured_refit_objective_identity(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - del payload["summary"]["refit_objective_matches_scoring"] - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "refit_objective_matches_scoring" in ecps_gate["summary"] - assert ecps_gate["details"]["refit_objective_matches_scoring"] is None - - -def test_ecps_comparison_rejects_adverse_holdout_loss(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - 
baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload( - candidate_loss=0.10, - baseline_loss=0.20, - candidate_holdout_loss=0.050, - baseline_holdout_loss=0.040, - ) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "holdout_loss_beats_baseline" in ecps_gate["summary"] - assert ecps_gate["metrics"]["candidate_holdout_loss"] == pytest.approx(0.050) - assert ecps_gate["metrics"]["baseline_holdout_loss"] == pytest.approx(0.040) - assert ecps_gate["details"]["holdout_loss_beats_baseline"] is False - - -def test_ecps_comparison_rejects_adverse_unweighted_msre(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - payload = _sound_ecps_comparison_payload( - candidate_loss=0.10, - baseline_loss=0.20, - candidate_unweighted_msre=0.30, - baseline_unweighted_msre=0.17, - ) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload=payload, - arch_coverage_payload=_arch_coverage_payload(), - 
runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert "unweighted_msre_beats_baseline" in ecps_gate["summary"] - assert ecps_gate["metrics"]["candidate_unweighted_msre"] == pytest.approx(0.30) - assert ecps_gate["metrics"]["baseline_unweighted_msre"] == pytest.approx(0.17) - assert ecps_gate["details"]["unweighted_msre_beats_baseline"] is False - - -def test_runtime_gate_ignores_contradictory_producer_verdict(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload={ - "summary": { - "candidate_enhanced_cps_native_loss": 0.1, - "baseline_enhanced_cps_native_loss": 0.2, - } - }, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={ - "runtime_ratio": 10.0, - "runtime_ratio_threshold": 100.0, - "passes_runtime_gate": True, - }, - benchmark_manifest_path=benchmark_manifest, - runtime_ratio_threshold=1.25, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - runtime_gate = record["gates"]["runtime"] - - assert record["summary"]["status"] == "failed" - assert runtime_gate["status"] == "fail" - assert runtime_gate["metrics"]["runtime_ratio_threshold"] == 1.25 - assert runtime_gate["details"]["reported_runtime_ratio_threshold"] == 100.0 - assert 
runtime_gate["details"]["enforced_runtime_ratio_threshold"] == 1.25 - assert runtime_gate["details"]["reported_passes_runtime_gate"] is True - assert runtime_gate["details"]["computed_passes_runtime_gate"] is False - - -def test_ecps_gate_derives_verdict_from_losses_not_producer_flag(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") - baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark_manifest.json" - _write_benchmark_manifest(benchmark_manifest) - _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) - - report_path = write_mp300k_artifact_gate_report( - artifact_dir, - ecps_comparison_payload={ - "summary": { - "candidate_enhanced_cps_native_loss": 0.3, - "baseline_enhanced_cps_native_loss": 0.2, - "enhanced_cps_native_loss_delta": -0.1, - "candidate_beats_baseline": True, - } - }, - arch_coverage_payload=_arch_coverage_payload(), - runtime_smoke_payload={"runtime_ratio": 1.0}, - benchmark_manifest_path=benchmark_manifest, - compute_native_scores=False, - update_manifest=False, - ) - - record = json.loads(report_path.read_text()) - ecps_gate = record["gates"]["ecps_comparison"] - - assert record["summary"]["status"] == "failed" - assert ecps_gate["status"] == "fail" - assert ecps_gate["metrics"]["enhanced_cps_native_loss_delta"] == pytest.approx(0.1) - assert ecps_gate["details"]["reported_candidate_beats_baseline"] is True - assert ecps_gate["details"]["computed_candidate_beats_baseline"] is False diff --git a/tests/pipelines/test_mp300k_gate_inputs.py b/tests/pipelines/test_mp300k_gate_inputs.py deleted file mode 100644 index 80592dba..00000000 --- a/tests/pipelines/test_mp300k_gate_inputs.py +++ /dev/null @@ -1,405 +0,0 @@ -"""Tests for packaging mp-300k gate inputs.""" - -from __future__ import annotations - -import json -import tarfile -from pathlib import Path - -import 
h5py -import numpy as np - -from microplex_us.pipelines.mp300k_artifact_gates import ( - write_mp300k_artifact_gate_report, -) -from microplex_us.pipelines.mp300k_gate_inputs import ( - main, - package_mp300k_gate_inputs, -) -from microplex_us.pipelines.mp_benchmark_manifest import ( - FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, -) -from microplex_us.policyengine.us import write_policyengine_us_time_period_dataset - -_EXPORT_CONTRACT_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "ecps_export_contract.json" -) - - -def _write_minimal_policyengine_dataset(path: Path, *, period: int = 2024) -> Path: - arrays = { - "household_id": {str(period): np.asarray([1, 2])}, - "household_weight": {str(period): np.asarray([10.0, 20.0])}, - "person_id": {str(period): np.asarray([1, 2, 3])}, - "person_household_id": {str(period): np.asarray([1, 1, 2])}, - "tax_unit_id": {str(period): np.asarray([10, 20])}, - "person_tax_unit_id": {str(period): np.asarray([10, 10, 20])}, - "spm_unit_id": {str(period): np.asarray([100, 200])}, - "person_spm_unit_id": {str(period): np.asarray([100, 100, 200])}, - "family_id": {str(period): np.asarray([1000, 2000])}, - "person_family_id": {str(period): np.asarray([1000, 1000, 2000])}, - "marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - "person_marital_unit_id": {str(period): np.asarray([10000, 10001, 20000])}, - } - return write_policyengine_us_time_period_dataset(arrays, path) - - -def _write_contract_policyengine_dataset(path: Path, *, period: int = 2024) -> Path: - _write_minimal_policyengine_dataset(path, period=period) - contract = json.loads(_EXPORT_CONTRACT_PATH.read_text()) - with h5py.File(path, "a") as handle: - for variable in contract["required"]: - if variable in handle: - continue - group = handle.create_group(variable) - group.create_dataset(str(period), data=np.asarray([0.0, 0.0])) - return path - - -def _write_manifest( - 
artifact_dir: Path, - *, - candidate_path: str = "policyengine_us.h5", - baseline_path: str = "baseline/enhanced_cps_2024.h5", -) -> None: - (artifact_dir / "source_weight_diagnostics.json").write_text( - json.dumps(_source_weight_diagnostics_payload()) - ) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-05-27T00:00:00+00:00", - "config": { - "policyengine_dataset_year": 2024, - "policyengine_baseline_dataset": baseline_path, - }, - "artifacts": { - "policyengine_dataset": candidate_path, - "source_weight_diagnostics": "source_weight_diagnostics.json", - }, - } - ) - ) - - -def _write_benchmark_manifest(path: Path) -> None: - path.write_text( - json.dumps( - { - "schema_version": 1, - "certificate_type": "frozen_production_ecps_baseline", - "period": 2024, - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_surface": { - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_count": 150, - "target_names_sha256": "d" * 64, - }, - "scoring_config": {"sha256": "e" * 64}, - "baseline_metrics": { - "baseline_enhanced_cps_native_loss": 0.20, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, - }, - "baseline_dataset": { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - }, - "policyengine_us_data": { - "repo": "PolicyEngine/policyengine-us-data", - "commit": "b" * 40, - }, - "policyengine_us": {"version": "1.587.0"}, - "target_db": { - "path": "/tmp/policyengine_targets.db", - "sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - }, - } - ) - ) - - -def _arch_coverage_payload() -> dict[str, object]: - return { - "profile_name": "pe_native_broad_source_backed", - "period": 2024, - "target_cell_count": 183, - "covered_cell_count": 183, - "uncovered_cell_count": 0, - "coverage_rate": 1.0, - } - - -def _source_weight_diagnostics_payload() -> dict[str, object]: - return { - "schema_version": 1, - "sources": [ - { - "source_name": "cps_asec", 
- "source_class": "base", - "household_weight_share": 0.95, - }, - { - "source_name": "irs_soi_puf_support_clone", - "source_class": "puf_support", - "household_weight_share": 0.05, - }, - ], - } - - -def _archive_manifest(archive_path: Path) -> dict: - with tarfile.open(archive_path) as archive: - manifest = archive.extractfile("artifact/manifest.json") - assert manifest is not None - return json.loads(manifest.read()) - - -def _sound_ecps_comparison_payload() -> dict[str, object]: - fit_config = { - "lambda_l0": 0.0, - "lambda_l2": 0.0, - "use_gates": False, - "epochs": 2000, - } - protected_family_losses = { - family: {"candidate_loss": 0.01, "baseline_loss": 0.01} - for family in ( - "ssi", - "snap", - "wages", - "self_employment_income", - "capital_gains", - "interest", - "dividends", - "retirement_income", - "disability", - "household_net_income", - ) - } - family_breakdown = [ - { - "family": family, - "candidate_loss_contribution": 0.01, - "baseline_loss_contribution": 0.01, - } - for family in ( - "state_agi_distribution", - "state_age_distribution", - "national_ssa", - "national_irs_other", - "state_aca_spending", - ) - ] - candidate_loss = 0.1 - baseline_loss = 0.2 - candidate_holdout_loss = 0.03 - baseline_holdout_loss = 0.04 - candidate_unweighted_msre = 0.10 - baseline_unweighted_msre = 0.17 - return { - "frozen_ecps_baseline_certificate": { - "schema_version": 1, - "certificate_type": "frozen_production_ecps_baseline", - "period": 2024, - "baseline_dataset": { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - }, - "target_db": { - "path": "/tmp/policyengine_targets.db", - "sha256": FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - }, - "policyengine_us_data": { - "repo": "PolicyEngine/policyengine-us-data", - "commit": "b" * 40, - }, - "policyengine_us": {"version": "1.587.0"}, - "target_surface": { - "target_profile": "pe_native_broad", - "target_scope": "all", - "target_count": 150, - "target_names_sha256": "d" * 
64, - }, - "scoring_config": {"sha256": "e" * 64}, - "baseline_metrics": { - "baseline_enhanced_cps_native_loss": baseline_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "baseline_unweighted_msre": baseline_unweighted_msre, - }, - }, - "summary": { - "candidate_enhanced_cps_native_loss": candidate_loss, - "baseline_enhanced_cps_native_loss": baseline_loss, - "enhanced_cps_native_loss_delta": candidate_loss - baseline_loss, - "candidate_beats_baseline": candidate_loss < baseline_loss, - "n_targets_kept": 150, - "candidate_household_count": 2, - "baseline_household_count": 2, - "candidate_refit_config": fit_config, - "baseline_refit_config": fit_config, - "refit_objective_matches_scoring": True, - "ecps_refit_effective_passed": True, - "candidate_holdout_loss": candidate_holdout_loss, - "baseline_holdout_loss": baseline_holdout_loss, - "candidate_unweighted_msre": candidate_unweighted_msre, - "baseline_unweighted_msre": baseline_unweighted_msre, - "holdout_target_fraction": 0.2, - "protected_family_losses": protected_family_losses, - }, - "score": {"family_breakdown": family_breakdown}, - } - - -def test_package_mp300k_gate_inputs_rewrites_external_candidate(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_manifest(artifact_dir) - external_candidate = tmp_path / "external" / "pe_l0_candidate.h5" - external_candidate.parent.mkdir() - external_candidate.write_bytes(b"candidate") - baseline_dataset = artifact_dir / "baseline" / "enhanced_cps_2024.h5" - baseline_dataset.parent.mkdir() - baseline_dataset.write_bytes(b"baseline") - ecps_comparison = tmp_path / "scores.json" - ecps_comparison.write_text(json.dumps([{"broad_loss": {}}])) - arch_coverage = tmp_path / "arch_coverage.json" - arch_coverage.write_text(json.dumps(_arch_coverage_payload())) - runtime_smoke = tmp_path / "runtime.json" - runtime_smoke.write_text(json.dumps({"runtime_ratio": 1.0})) - benchmark_manifest = tmp_path / "benchmark.json" - 
_write_benchmark_manifest(benchmark_manifest) - - metadata = package_mp300k_gate_inputs( - artifact_dir, - tmp_path / "gate-inputs", - candidate_dataset_path=external_candidate, - ecps_comparison_path=ecps_comparison, - arch_coverage_path=arch_coverage, - runtime_smoke_path=runtime_smoke, - benchmark_manifest_path=benchmark_manifest, - ) - - output_dir = tmp_path / "gate-inputs" - archive_path = output_dir / "artifact.tar.gz" - manifest = _archive_manifest(archive_path) - - assert archive_path.exists() - assert (output_dir / "ecps_comparison.json").exists() - assert (output_dir / "arch_coverage.json").exists() - assert (output_dir / "runtime_smoke.json").exists() - assert (output_dir / "benchmark_manifest.json").exists() - assert (output_dir / "gate_inputs.json").exists() - assert manifest["artifacts"]["policyengine_dataset"] == "pe_l0_candidate.h5" - assert ( - manifest["artifacts"]["source_weight_diagnostics"] - == "source_weight_diagnostics.json" - ) - assert ( - manifest["config"]["policyengine_baseline_dataset"] - == "baseline/enhanced_cps_2024.h5" - ) - assert manifest["mp300k_gate_inputs"]["source_candidate_dataset"] == str( - external_candidate.resolve() - ) - assert manifest["mp300k_gate_inputs"]["source_baseline_dataset"] == str( - baseline_dataset.resolve() - ) - assert manifest["mp300k_gate_inputs"]["source_weight_diagnostics"] == str( - (artifact_dir / "source_weight_diagnostics.json").resolve() - ) - assert metadata["artifact_archive"]["path"] == str(archive_path.resolve()) - assert metadata["workflow_call"]["with"]["gate_inputs_artifact"] == "gate-inputs" - - -def test_main_packages_gate_inputs(tmp_path, capsys): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_manifest(artifact_dir) - (artifact_dir / "policyengine_us.h5").write_bytes(b"candidate") - baseline_dataset = artifact_dir / "baseline" / "enhanced_cps_2024.h5" - baseline_dataset.parent.mkdir() - baseline_dataset.write_bytes(b"baseline") - output_dir = tmp_path / 
"gate-inputs" - - exit_code = main( - [ - "--artifact-dir", - str(artifact_dir), - "--output-dir", - str(output_dir), - ] - ) - - printed_path = Path(capsys.readouterr().out.strip()) - - assert exit_code == 0 - assert printed_path == output_dir / "gate_inputs.json" - assert (output_dir / "artifact.tar.gz").exists() - - -def test_packaged_inputs_run_gates_from_clean_extract(tmp_path): - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - _write_manifest( - artifact_dir, - candidate_path="../candidate.h5", - baseline_path="../baseline.h5", - ) - _write_contract_policyengine_dataset(tmp_path / "candidate.h5") - _write_contract_policyengine_dataset(tmp_path / "baseline.h5") - benchmark_manifest = tmp_path / "benchmark.json" - _write_benchmark_manifest(benchmark_manifest) - arch_coverage = tmp_path / "arch_coverage.json" - arch_coverage.write_text(json.dumps(_arch_coverage_payload())) - output_dir = tmp_path / "gate-inputs" - - package_mp300k_gate_inputs( - artifact_dir, - output_dir, - arch_coverage_path=arch_coverage, - benchmark_manifest_path=benchmark_manifest, - ) - - packaged_manifest = _archive_manifest(output_dir / "artifact.tar.gz") - assert packaged_manifest["artifacts"]["policyengine_dataset"] == "candidate.h5" - assert ( - packaged_manifest["artifacts"]["source_weight_diagnostics"] - == "source_weight_diagnostics.json" - ) - assert ( - packaged_manifest["config"]["policyengine_baseline_dataset"] - == "baseline/baseline.h5" - ) - extract_root = tmp_path / "extract" - with tarfile.open(output_dir / "artifact.tar.gz") as archive: - archive.extractall(extract_root, filter="data") - packaged_artifact_dir = next( - path.parent for path in extract_root.rglob("manifest.json") - ) - - report_path = write_mp300k_artifact_gate_report( - packaged_artifact_dir, - ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=json.loads( - (output_dir / "arch_coverage.json").read_text() - ), - runtime_smoke_payload={"runtime_ratio": 1.0}, - 
benchmark_manifest_path=output_dir / "benchmark_manifest.json", - compute_native_scores=False, - update_manifest=False, - ) - - report = json.loads(report_path.read_text()) - - assert report["summary"]["status"] == "passed" - assert report["candidate_dataset"]["path"].startswith(str(packaged_artifact_dir)) - assert report["baseline_dataset"]["path"].startswith(str(packaged_artifact_dir)) - assert report["gates"]["artifact_size"]["status"] == "pass" diff --git a/tests/pipelines/test_mp_benchmark_manifest.py b/tests/pipelines/test_mp_benchmark_manifest.py deleted file mode 100644 index dff92cff..00000000 --- a/tests/pipelines/test_mp_benchmark_manifest.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Tests for pinned MP replacement benchmark manifests.""" - -from __future__ import annotations - -import hashlib -import json -import subprocess -from pathlib import Path - -import pytest - -from microplex_us.pipelines.mp_benchmark_manifest import ( - FROZEN_PRODUCTION_ECPS_BASELINE_ENHANCED_CPS_NATIVE_LOSS, - FROZEN_PRODUCTION_ECPS_BASELINE_HOLDOUT_LOSS, - FROZEN_PRODUCTION_ECPS_BASELINE_SHA256, - FROZEN_PRODUCTION_ECPS_BASELINE_UNWEIGHTED_MSRE, - FROZEN_PRODUCTION_ECPS_SCORING_CONFIG_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_COUNT, - FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_NAMES_SHA256, - FROZEN_PRODUCTION_ECPS_TARGET_PROFILE, - FROZEN_PRODUCTION_ECPS_TARGET_SCOPE, - build_mp_benchmark_manifest, - load_frozen_production_ecps_benchmark_manifest, - main, -) - - -def _write_file(path: Path, contents: bytes) -> Path: - path.write_bytes(contents) - return path - - -def _sha256(contents: bytes) -> str: - return hashlib.sha256(contents).hexdigest() - - -def test_build_mp_benchmark_manifest_pins_release_inputs(tmp_path): - baseline_contents = b"baseline h5" - target_contents = b"target db" - baseline = _write_file(tmp_path / "enhanced_cps_2024.h5", baseline_contents) - target_db = _write_file(tmp_path / "policyengine_targets.db", target_contents) - - 
manifest = build_mp_benchmark_manifest( - baseline_dataset_path=baseline, - target_db_path=target_db, - period=2024, - target_profile="pe_native_broad", - target_scope="national", - target_count=150, - target_names_sha256="d" * 64, - scoring_config_sha256="e" * 64, - baseline_enhanced_cps_native_loss=0.2, - baseline_holdout_loss=0.04, - baseline_unweighted_msre=0.17, - policyengine_us_data_commit="b" * 40, - policyengine_us_version="1.587.0", - enforce_production_pins=False, - ) - - assert manifest["schema_version"] == 1 - assert manifest["certificate_type"] == "frozen_production_ecps_baseline" - assert manifest["period"] == 2024 - assert manifest["target_profile"] == "pe_native_broad" - assert manifest["target_scope"] == "national" - assert manifest["target_surface"] == { - "target_profile": "pe_native_broad", - "target_scope": "national", - "target_count": 150, - "target_names_sha256": "d" * 64, - } - assert manifest["scoring_config"] == {"sha256": "e" * 64} - assert manifest["baseline_metrics"] == { - "baseline_enhanced_cps_native_loss": 0.2, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, - } - assert manifest["baseline_dataset"]["path"] == str(baseline.resolve()) - assert manifest["baseline_dataset"]["sha256"] == _sha256(baseline_contents) - assert manifest["target_db"]["path"] == str(target_db.resolve()) - assert manifest["target_db"]["sha256"] == _sha256(target_contents) - assert manifest["policyengine_us_data"]["commit"] == "b" * 40 - assert manifest["policyengine_us"]["version"] == "1.587.0" - - -def test_packaged_frozen_production_manifest_matches_canonical_surface(): - manifest = load_frozen_production_ecps_benchmark_manifest() - - assert manifest["certificate_type"] == "frozen_production_ecps_baseline" - assert manifest["period"] == 2024 - assert manifest["baseline_dataset"]["sha256"] == ( - FROZEN_PRODUCTION_ECPS_BASELINE_SHA256 - ) - assert manifest["target_db"]["sha256"] == FROZEN_PRODUCTION_ECPS_TARGET_DB_SHA256 - assert 
manifest["target_surface"] == { - "target_profile": FROZEN_PRODUCTION_ECPS_TARGET_PROFILE, - "target_scope": FROZEN_PRODUCTION_ECPS_TARGET_SCOPE, - "target_count": FROZEN_PRODUCTION_ECPS_TARGET_COUNT, - "target_names_sha256": FROZEN_PRODUCTION_ECPS_TARGET_NAMES_SHA256, - } - assert manifest["scoring_config"]["sha256"] == ( - FROZEN_PRODUCTION_ECPS_SCORING_CONFIG_SHA256 - ) - assert manifest["baseline_metrics"]["baseline_enhanced_cps_native_loss"] == ( - FROZEN_PRODUCTION_ECPS_BASELINE_ENHANCED_CPS_NATIVE_LOSS - ) - assert manifest["baseline_metrics"]["baseline_holdout_loss"] == ( - FROZEN_PRODUCTION_ECPS_BASELINE_HOLDOUT_LOSS - ) - assert manifest["baseline_metrics"]["baseline_unweighted_msre"] == ( - FROZEN_PRODUCTION_ECPS_BASELINE_UNWEIGHTED_MSRE - ) - - -def test_main_writes_mp_benchmark_manifest(tmp_path, capsys): - baseline = _write_file(tmp_path / "enhanced_cps_2024.h5", b"baseline") - target_db = _write_file(tmp_path / "policyengine_targets.db", b"targets") - output = tmp_path / "benchmark_manifest.json" - - exit_code = main( - [ - "--baseline-dataset", - str(baseline), - "--target-db", - str(target_db), - "--output-json", - str(output), - "--policyengine-us-data-commit", - "c" * 40, - "--policyengine-us-version", - "1.587.0", - "--target-scope", - "national", - "--target-count", - "150", - "--target-names-sha256", - "d" * 64, - "--scoring-config-sha256", - "e" * 64, - "--baseline-enhanced-cps-native-loss", - "0.2", - "--baseline-holdout-loss", - "0.04", - "--baseline-unweighted-msre", - "0.17", - "--allow-noncanonical-production-pins", - ] - ) - - printed_path = Path(capsys.readouterr().out.strip()) - payload = json.loads(output.read_text()) - - assert exit_code == 0 - assert printed_path == output - assert payload["policyengine_us_data"]["commit"] == "c" * 40 - - -def test_build_mp_benchmark_manifest_rejects_noncanonical_release_pins(tmp_path): - baseline = _write_file(tmp_path / "enhanced_cps_2024.h5", b"baseline") - target_db = _write_file(tmp_path / 
"policyengine_targets.db", b"targets") - - with pytest.raises(ValueError, match="release-pinned baseline/target surface"): - build_mp_benchmark_manifest( - baseline_dataset_path=baseline, - target_db_path=target_db, - period=2024, - target_profile="pe_native_broad", - target_scope="all", - target_count=150, - target_names_sha256="d" * 64, - scoring_config_sha256="e" * 64, - baseline_enhanced_cps_native_loss=0.2, - baseline_holdout_loss=0.04, - baseline_unweighted_msre=0.17, - policyengine_us_data_commit="b" * 40, - policyengine_us_version="1.587.0", - ) - - -def test_build_mp_benchmark_manifest_requires_baseline_metrics(tmp_path): - baseline = _write_file(tmp_path / "enhanced_cps_2024.h5", b"baseline") - target_db = _write_file(tmp_path / "policyengine_targets.db", b"targets") - - with pytest.raises(ValueError, match="pin baseline metrics"): - build_mp_benchmark_manifest( - baseline_dataset_path=baseline, - target_db_path=target_db, - target_count=150, - target_names_sha256="d" * 64, - scoring_config_sha256="e" * 64, - policyengine_us_data_commit="b" * 40, - policyengine_us_version="1.587.0", - enforce_production_pins=False, - ) - - -def test_dirty_policyengine_us_data_repo_is_rejected_unless_explicit(tmp_path): - baseline = _write_file(tmp_path / "enhanced_cps_2024.h5", b"baseline") - target_db = _write_file(tmp_path / "policyengine_targets.db", b"targets") - repo = tmp_path / "policyengine-us-data" - repo.mkdir() - subprocess.run(["git", "-C", str(repo), "init"], check=True, capture_output=True) - _write_file(repo / "tracked.txt", b"clean") - subprocess.run(["git", "-C", str(repo), "add", "."], check=True) - subprocess.run( - [ - "git", - "-C", - str(repo), - "-c", - "user.name=Codex", - "-c", - "user.email=codex@example.com", - "commit", - "-m", - "init", - ], - check=True, - capture_output=True, - ) - (repo / "tracked.txt").write_text("dirty") - - with pytest.raises(ValueError, match="uncommitted changes"): - build_mp_benchmark_manifest( - 
baseline_dataset_path=baseline, - target_db_path=target_db, - policyengine_us_data_repo=repo, - policyengine_us_version="1.587.0", - target_count=150, - target_names_sha256="d" * 64, - scoring_config_sha256="e" * 64, - baseline_enhanced_cps_native_loss=0.2, - baseline_holdout_loss=0.04, - baseline_unweighted_msre=0.17, - enforce_production_pins=False, - ) - - manifest = build_mp_benchmark_manifest( - baseline_dataset_path=baseline, - target_db_path=target_db, - policyengine_us_data_repo=repo, - policyengine_us_version="1.587.0", - target_count=150, - target_names_sha256="d" * 64, - scoring_config_sha256="e" * 64, - baseline_enhanced_cps_native_loss=0.2, - baseline_holdout_loss=0.04, - baseline_unweighted_msre=0.17, - allow_dirty_policyengine_us_data=True, - enforce_production_pins=False, - ) - - assert manifest["policyengine_us_data"]["dirty"] is True - assert len(manifest["policyengine_us_data"]["commit"]) == 40 diff --git a/tests/pipelines/test_pe_l0.py b/tests/pipelines/test_pe_l0.py deleted file mode 100644 index c6cdb7ab..00000000 --- a/tests/pipelines/test_pe_l0.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Tests for the PolicyEngine L0 calibrator adapter.""" - -from __future__ import annotations - -import sys -import types - -import numpy as np -import pandas as pd -import pytest -from microplex.calibration import LinearConstraint - -from microplex_us.pipelines.pe_l0 import ( - PolicyEngineL0Calibrator, - make_policyengine_us_data_fit_l0_weights_fn, -) - - -def _install_fake_policyengine_l0(weights: np.ndarray): - calls: dict[str, object] = {} - - def fake_fit_l0_weights(**kwargs): - calls.update(kwargs) - return np.asarray(weights, dtype=float) - - return calls, fake_fit_l0_weights - - -def test_policyengine_l0_calibrator_supports_explicit_linear_constraints(): - calls, fake_fit_l0_weights = _install_fake_policyengine_l0(np.array([1.0, 2.0])) - data = pd.DataFrame({"weight": [1.0, 1.0]}) - constraints = ( - LinearConstraint("row1", np.array([1.0, 0.0]), 1.0), - 
LinearConstraint("row2", np.array([0.0, 1.0]), 2.0), - ) - - calibrator = PolicyEngineL0Calibrator( - lambda_l0=1e-4, - lambda_l2=1e-12, - beta=0.35, - learning_rate=0.15, - epochs=25, - tol=1e-6, - device="cpu", - fit_l0_weights_fn=fake_fit_l0_weights, - ) - result = calibrator.fit_transform( - data, - {}, - weight_col="weight", - linear_constraints=constraints, - ) - validation = calibrator.validate(result) - - assert result["weight"].tolist() == [1.0, 2.0] - assert calls["X_sparse"].shape == (2, 2) - assert calls["target_names"] == ["row1", "row2"] - assert calls["targets"].tolist() == [1.0, 2.0] - assert calls["initial_weights"].tolist() == [1.0, 1.0] - assert validation["converged"] is True - assert validation["max_error"] < 1e-9 - assert validation["sparsity"] == 0.0 - - -def test_policyengine_l0_calibrator_reports_sparsity(): - _, fake_fit_l0_weights = _install_fake_policyengine_l0(np.array([0.0, 3.0, 0.0])) - data = pd.DataFrame({"weight": [1.0, 1.0, 1.0]}) - constraints = (LinearConstraint("row", np.array([0.0, 1.0, 0.0]), 3.0),) - - calibrator = PolicyEngineL0Calibrator( - epochs=5, - tol=1e-6, - fit_l0_weights_fn=fake_fit_l0_weights, - ) - calibrator.fit( - data, - {}, - weight_col="weight", - linear_constraints=constraints, - ) - - assert calibrator.get_sparsity() == 2 / 3 - - -def test_policyengine_l0_lambda_zero_uses_dense_no_gate_path(monkeypatch): - calls, fake_fit_l0_weights = _install_fake_policyengine_l0(np.array([99.0, 99.0])) - data = pd.DataFrame({"weight": [1.0, 1.0]}) - constraints = ( - LinearConstraint("row1", np.array([1.0, 0.0]), 2.0), - LinearConstraint("row2", np.array([0.0, 1.0]), 3.0), - ) - - calibrator = PolicyEngineL0Calibrator( - lambda_l0=0.0, - lambda_l2=0.0, - epochs=100, - tol=1e-10, - fit_l0_weights_fn=fake_fit_l0_weights, - ) - result = calibrator.fit_transform( - data, - {}, - weight_col="weight", - linear_constraints=constraints, - ) - validation = calibrator.validate(result) - - assert calls == {} - assert 
calibrator.effective_backend_ == "dense_projected_gradient" - assert validation["backend"] == "dense_projected_gradient" - assert validation["uses_gates"] is False - assert validation["loss_history"][0]["iteration"] == 0 - assert ( - validation["loss_history"][-1]["objective_loss"] - < validation["loss_history"][0]["objective_loss"] - ) - assert result["weight"].to_numpy(dtype=float) == pytest.approx( - [2.0, 3.0], - rel=1e-5, - ) - - -def test_policyengine_l0_requires_explicit_fit_function_for_nonzero_l0(): - data = pd.DataFrame({"weight": [1.0]}) - constraints = (LinearConstraint("row", np.array([1.0]), 1.0),) - - calibrator = PolicyEngineL0Calibrator(lambda_l0=1e-4, epochs=1) - - with pytest.raises(RuntimeError, match="no longer loads policyengine-us-data"): - calibrator.fit( - data, - {}, - weight_col="weight", - linear_constraints=constraints, - ) - - -def test_policyengine_l0_can_wrap_policyengine_us_data_fit_function(monkeypatch): - calls: dict[str, object] = {} - - def fake_policyengine_fit_l0_weights( - *, - X_sparse, - targets, - lambda_l0, - epochs=100, - device="cpu", - verbose_freq=None, - target_groups=None, - ): - kwargs = { - "X_sparse": X_sparse, - "targets": targets, - "lambda_l0": lambda_l0, - "epochs": epochs, - "device": device, - "verbose_freq": verbose_freq, - "target_groups": target_groups, - } - calls.update(kwargs) - return np.array([4.0, 5.0]) - - package = types.ModuleType("policyengine_us_data") - package.__path__ = [] - calibration_package = types.ModuleType("policyengine_us_data.calibration") - calibration_package.__path__ = [] - unified = types.ModuleType("policyengine_us_data.calibration.unified_calibration") - unified.fit_l0_weights = fake_policyengine_fit_l0_weights - monkeypatch.setitem(sys.modules, "policyengine_us_data", package) - monkeypatch.setitem( - sys.modules, - "policyengine_us_data.calibration", - calibration_package, - ) - monkeypatch.setitem( - sys.modules, - "policyengine_us_data.calibration.unified_calibration", - 
unified, - ) - - fit_l0_weights = make_policyengine_us_data_fit_l0_weights_fn() - result = fit_l0_weights( - X_sparse="matrix", - targets=np.array([1.0]), - lambda_l0=1e-8, - epochs=2, - device="cpu", - verbose_freq=1, - initial_weights=np.array([1.0, 1.0]), - target_names=["target"], - ) - - assert result.tolist() == [4.0, 5.0] - assert calls["X_sparse"] == "matrix" - np.testing.assert_array_equal(calls["targets"], np.array([1.0])) - assert calls["lambda_l0"] == pytest.approx(1e-8) - assert calls["epochs"] == 2 - assert calls["device"] == "cpu" - assert calls["verbose_freq"] == 1 - assert calls["target_groups"] is None - assert "initial_weights" not in calls - assert "target_names" not in calls diff --git a/tests/pipelines/test_pe_native_calibration_benchmark.py b/tests/pipelines/test_pe_native_calibration_benchmark.py deleted file mode 100644 index 77ab020e..00000000 --- a/tests/pipelines/test_pe_native_calibration_benchmark.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Tests for PE-native calibration strategy benchmarking.""" - -from __future__ import annotations - -import shutil -from pathlib import Path - -import h5py -import numpy as np - -from microplex_us.pipelines.pe_native_calibration_benchmark import ( - build_policyengine_us_native_calibration_benchmark, - compute_household_weight_diagnostics, -) - - -def _write_dataset(path: Path, weights: list[float]) -> Path: - household_ids = np.arange(1, len(weights) + 1, dtype=np.int64) - with h5py.File(path, "w") as handle: - household_id = handle.create_group("household_id") - household_id.create_dataset("2024", data=household_ids) - household_weight = handle.create_group("household_weight") - household_weight.create_dataset( - "2024", - data=np.asarray(weights, dtype=np.float32), - ) - return path - - -def test_compute_household_weight_diagnostics_compares_reference_by_id( - tmp_path: Path, -) -> None: - candidate = _write_dataset(tmp_path / "candidate.h5", [3.0, 0.0, 9.0]) - reference = tmp_path / "reference.h5" - 
with h5py.File(reference, "w") as handle: - household_id = handle.create_group("household_id") - household_id.create_dataset("2024", data=np.asarray([3, 1, 2])) - household_weight = handle.create_group("household_weight") - household_weight.create_dataset( - "2024", - data=np.asarray([6.0, 2.0, 1.0], dtype=np.float32), - ) - - diagnostics = compute_household_weight_diagnostics( - candidate, - reference_dataset_path=reference, - ) - - assert diagnostics["household_count"] == 3 - assert diagnostics["positive_household_count"] == 2 - assert diagnostics["weight_sum"] == 12.0 - assert diagnostics["reference_alignment"] == "matched_by_household_id" - assert diagnostics["reference_weight_sum"] == 9.0 - assert diagnostics["weight_sum_delta"] == 3.0 - assert diagnostics["changed_household_count"] == 3 - assert np.isclose(diagnostics["effective_sample_size"], 1.6) - - -def test_build_policyengine_us_native_calibration_benchmark_scores_variants( - monkeypatch, - tmp_path: Path, -) -> None: - input_dataset = _write_dataset(tmp_path / "input.h5", [1.0, 1.0]) - baseline_dataset = _write_dataset(tmp_path / "baseline.h5", [2.0, 2.0]) - existing_dataset = _write_dataset(tmp_path / "current_weight_diff.h5", [1.2, 0.8]) - output_dir = tmp_path / "benchmark" - - def fake_extract(**kwargs): - assert kwargs["target_scope_filter"] == "national" - return { - "scaled_matrix": np.eye(2), - "scaled_target": np.asarray([1.0, 0.0]), - "initial_weights": np.asarray([1.0, 1.0]), - "metadata": { - "target_names": ["nation/fake", "state/fake"], - "skip_tax_expenditure_targets": True, - }, - } - - def fake_optimize_weights(**kwargs): - penalty = float(kwargs["l2_penalty"]) - weights = np.asarray([1.9, 0.1] if penalty == 0.0 else [1.4, 0.6]) - return weights, { - "initial_loss": 1.25, - "optimized_loss": 0.5 if penalty == 0.0 else 0.75, - "loss_delta": -0.75 if penalty == 0.0 else -0.5, - "initial_weight_sum": 2.0, - "optimized_weight_sum": float(weights.sum()), - "household_count": 2, - 
"positive_household_count": 2, - "budget": None, - "iterations": 3, - "converged": True, - } - - def fake_rewrite(**kwargs): - output_path = Path(kwargs["output_dataset_path"]) - shutil.copy2(kwargs["input_dataset_path"], output_path) - with h5py.File(output_path, "r+") as handle: - handle["household_weight"]["2024"][...] = np.asarray( - kwargs["household_weights"], - dtype=np.float32, - ) - return output_path.resolve() - - def fake_scores(**kwargs): - assert kwargs["target_scope_filter"] == "national" - results = [] - for candidate_path in kwargs["candidate_dataset_paths"]: - path = Path(candidate_path).resolve() - if path.name == "input.h5": - loss = 1.0 - elif path.name == "current_weight_diff.h5": - loss = 0.8 - elif "unconstrained" in path.name: - loss = 0.4 - else: - loss = 0.6 - results.append( - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_enhanced_cps_native_loss": loss, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": loss - 0.5, - "candidate_beats_baseline": loss < 0.5, - "candidate_unweighted_msre": loss + 0.1, - "baseline_unweighted_msre": 0.7, - "unweighted_msre_delta": loss - 0.6, - "n_targets_total": 4, - "n_targets_kept": 3, - "n_targets_zero_dropped": 1, - "n_targets_bad_dropped": 0, - "n_national_targets": 1, - "n_state_targets": 2, - "skip_tax_expenditure_targets": True, - }, - "broad_loss": { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "candidate_dataset": str(path), - "baseline_dataset": str(baseline_dataset.resolve()), - "candidate_enhanced_cps_native_loss": loss, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": loss - 0.5, - "candidate_beats_baseline": loss < 0.5, - "candidate_unweighted_msre": loss + 0.1, - "baseline_unweighted_msre": 0.7, - "unweighted_msre_delta": loss - 0.6, - "n_targets_total": 4, - "n_targets_kept": 3, - "n_targets_zero_dropped": 1, - "n_targets_bad_dropped": 0, - "n_national_targets": 1, - 
"n_state_targets": 2, - "candidate_weight_sum": 2.0, - "baseline_weight_sum": 4.0, - "skip_tax_expenditure_targets": True, - "family_breakdown": [], - }, - "family_breakdown": [], - } - ) - return results - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_calibration_benchmark." - "_extract_pe_native_loss_inputs", - fake_extract, - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_calibration_benchmark." - "optimize_pe_native_loss_weights", - fake_optimize_weights, - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_calibration_benchmark." - "rewrite_policyengine_us_dataset_weights", - fake_rewrite, - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_calibration_benchmark." - "compute_batch_us_pe_native_scores", - fake_scores, - ) - - payload = build_policyengine_us_native_calibration_benchmark( - input_dataset_path=input_dataset, - baseline_dataset_path=baseline_dataset, - output_dir=output_dir, - l2_penalties=(0.0, 1e-8), - max_iter=5, - target_total_weight_source="baseline", - existing_candidates={"current_weight_diff": existing_dataset}, - skip_tax_expenditure_targets=True, - target_scope_filter="national", - ) - - assert payload["variant_count"] == 4 - assert payload["target_scope_filter"] == "national" - assert payload["target_total_weight"] == 4.0 - assert payload["target_total_weight_resolved_from"] == "baseline" - assert payload["best_variant_label"] == "pe_native_unconstrained_baseline_total" - assert [row["label"] for row in payload["ranking"][:2]] == [ - "pe_native_unconstrained_baseline_total", - "pe_native_l2_1e-08_baseline_total", - ] - unconstrained = next( - row for row in payload["rows"] if row["label"].startswith("pe_native_unconstrained") - ) - assert unconstrained["optimization"]["l2_penalty"] == 0.0 - assert unconstrained["weight_diagnostics"]["reference_alignment"] == "same_order" - assert unconstrained["weight_diagnostics"]["changed_household_count"] == 2 diff --git 
a/tests/pipelines/test_pe_native_loss.py b/tests/pipelines/test_pe_native_loss.py deleted file mode 100644 index 43fa6dea..00000000 --- a/tests/pipelines/test_pe_native_loss.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Tests for robust PE-native loss helpers.""" - -from __future__ import annotations - -import numpy as np -import pytest - -from microplex_us.pipelines.pe_native_loss import ( - build_pe_native_loss_arrays, - infer_pe_native_target_unit, - pe_native_huber_loss, - pe_native_huber_loss_terms, -) -from microplex_us.pipelines.pe_native_optimization import ( - _project_to_simplex, - optimize_pe_native_loss_weights, -) - - -def test_bucketed_loss_downweights_tiny_baseline_outlier() -> None: - targets = np.asarray([10.0, 1_000_000.0]) - names = [ - "nation/irs/estate losses/total/AGI in 100k-200k/taxable/All", - "nation/irs/adjusted gross income/total/AGI in 500k-1m/taxable/All", - ] - loss_arrays = build_pe_native_loss_arrays(names, targets) - - estimate = np.asarray([410.0, 1_100_000.0]) - terms = pe_native_huber_loss_terms(estimate, loss_arrays) - - assert loss_arrays.target_weight[1] > loss_arrays.target_weight[0] - assert terms[0] / terms.sum() < 0.05 - assert terms[1] > terms[0] - - -def test_cbo_income_by_source_filers_targets_are_dollars() -> None: - assert ( - infer_pe_native_target_unit( - "nation/cbo/income_by_source/self_employment_income/filers" - ) - == "dollars" - ) - assert ( - infer_pe_native_target_unit( - "nation/cbo/income_by_source/taxable_interest_income+" - "non_qualified_dividend_income/filers" - ) - == "dollars" - ) - assert ( - infer_pe_native_target_unit( - "nation/irs/adjusted gross income/count/AGI in 0-25k/taxable/All" - ) - == "returns" - ) - - -def test_robust_pe_native_optimizer_uses_huber_objective() -> None: - matrix = np.asarray([[1.0, 0.0], [0.0, 1.0]]) - target = np.asarray([1.0, 1.0]) - loss_arrays = build_pe_native_loss_arrays( - [ - "nation/irs/example income/total/AGI in 0_1/taxable/All", - "nation/irs/example 
income/total/AGI in 1_2/taxable/All", - ], - target, - ) - initial_weights = np.asarray([0.0, 2.0]) - - optimized, summary = optimize_pe_native_loss_weights( - scaled_matrix=matrix, - scaled_target=target, - initial_weights=initial_weights, - loss_arrays=loss_arrays, - max_iter=100, - tol=1e-10, - ) - - assert summary["optimized_loss"] < summary["initial_loss"] - assert pe_native_huber_loss(matrix.T @ optimized, loss_arrays) == pytest.approx( - summary["optimized_loss"] - ) - assert optimized == pytest.approx(np.asarray([1.0, 1.0]), abs=1e-3) - - -def test_simplex_projection_preserves_large_population_total_exactly() -> None: - values = np.asarray([50_000_000.0, 50_000_000.0, 53_768_767.0]) - target_total = values.sum() - 1_000.0 - - projected = _project_to_simplex(values, target_total) - - assert projected.sum() == pytest.approx(target_total, abs=1e-6) diff --git a/tests/pipelines/test_pe_native_optimization.py b/tests/pipelines/test_pe_native_optimization.py deleted file mode 100644 index cd82eedc..00000000 --- a/tests/pipelines/test_pe_native_optimization.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Tests for direct PE-native weight optimization helpers.""" - -from __future__ import annotations - -import json -from pathlib import Path -from types import SimpleNamespace - -import h5py -import numpy as np - -from microplex_us.pipelines import pe_native_optimization as pe_opt -from microplex_us.pipelines.pe_native_optimization import ( - PolicyEngineUSNativeWeightOptimizationResult, - optimize_pe_native_loss_weights, - optimize_policyengine_us_native_loss_dataset, - rewrite_policyengine_us_dataset_weights, -) - - -def _write_time_period_array(handle: h5py.File, name: str, values: np.ndarray) -> None: - group = handle.create_group(name) - group.create_dataset("2024", data=values) - - -def _build_stub_dataset(path: Path) -> Path: - with h5py.File(path, "w") as handle: - _write_time_period_array( - handle, - "household_id", - np.asarray([10, 20], dtype=np.int64), - ) - 
_write_time_period_array( - handle, - "household_weight", - np.asarray([1.0, 2.0], dtype=np.float32), - ) - _write_time_period_array( - handle, - "person_household_id", - np.asarray([10, 10, 20], dtype=np.int64), - ) - _write_time_period_array( - handle, - "person_weight", - np.asarray([1.0, 1.0, 2.0], dtype=np.float32), - ) - _write_time_period_array( - handle, - "tax_unit_id", - np.asarray([100, 200], dtype=np.int64), - ) - _write_time_period_array( - handle, - "person_tax_unit_id", - np.asarray([100, 100, 200], dtype=np.int64), - ) - _write_time_period_array( - handle, - "tax_unit_weight", - np.asarray([1.0, 2.0], dtype=np.float32), - ) - return path - - -def test_optimize_pe_native_loss_weights_reduces_objective_and_respects_budget(): - scaled_matrix = np.eye(3, dtype=np.float64) - scaled_target = np.asarray([1.0, 0.0, 0.0], dtype=np.float64) - initial_weights = np.asarray([1.0 / 3.0] * 3, dtype=np.float64) - - optimized_weights, summary = optimize_pe_native_loss_weights( - scaled_matrix=scaled_matrix, - scaled_target=scaled_target, - initial_weights=initial_weights, - budget=1, - max_iter=200, - ) - - assert np.allclose(optimized_weights, np.asarray([1.0, 0.0, 0.0]), atol=1e-6) - assert summary["optimized_loss"] < summary["initial_loss"] - assert summary["positive_household_count"] == 1 - assert np.isclose(summary["optimized_weight_sum"], initial_weights.sum()) - assert summary["loss_history"][0]["iteration"] == 0 - assert summary["loss_history"][0]["objective_loss"] == summary["initial_loss"] - assert summary["loss_history"][-1]["objective_loss"] == summary["optimized_loss"] - assert all( - next_row["objective_loss"] <= row["objective_loss"] + 1e-12 - for row, next_row in zip( - summary["loss_history"], - summary["loss_history"][1:], - strict=False, - ) - ) - - -def test_optimize_pe_native_loss_weights_respects_target_total_weight(): - """When target_total_weight is given, the simplex projection uses that total.""" - scaled_matrix = np.eye(3, 
dtype=np.float64) - scaled_target = np.asarray([5.0, 0.0, 0.0], dtype=np.float64) - initial_weights = np.asarray([1.0, 1.0, 1.0], dtype=np.float64) - - optimized_weights, summary = optimize_pe_native_loss_weights( - scaled_matrix=scaled_matrix, - scaled_target=scaled_target, - initial_weights=initial_weights, - target_total_weight=5.0, - max_iter=200, - ) - - assert np.isclose(optimized_weights.sum(), 5.0, atol=1e-6) - assert summary["initial_weight_sum"] == 3.0 - assert summary["target_total_weight"] == 5.0 - assert np.isclose(summary["optimized_weight_sum"], 5.0, atol=1e-6) - assert summary["optimized_loss"] < summary["initial_loss"] - - -def test_optimize_pe_native_loss_weights_rejects_objective_increasing_steps( - monkeypatch, -): - """An overlarge initial step must not make an already-good fit worse.""" - scaled_matrix = np.eye(2, dtype=np.float64) - scaled_target = np.asarray([0.6, 0.4], dtype=np.float64) - initial_weights = np.asarray([0.5, 0.5], dtype=np.float64) - initial_loss = np.square(scaled_matrix.T @ initial_weights - scaled_target).sum() - monkeypatch.setattr( - pe_opt, - "_estimate_quadratic_lipschitz", - lambda matrix, l2_penalty: 0.01, - ) - - optimized_weights, summary = optimize_pe_native_loss_weights( - scaled_matrix=scaled_matrix, - scaled_target=scaled_target, - initial_weights=initial_weights, - max_iter=20, - ) - - optimized_loss = np.square( - scaled_matrix.T @ optimized_weights - scaled_target - ).sum() - assert optimized_loss <= initial_loss + 1e-12 - assert summary["optimized_loss"] <= summary["initial_loss"] + 1e-12 - assert summary["line_search_backtracking_steps"] > 0 - - -def test_rewrite_policyengine_us_dataset_weights_updates_group_weights(tmp_path: Path): - source_path = _build_stub_dataset(tmp_path / "input.h5") - output_path = tmp_path / "output.h5" - - rewritten = rewrite_policyengine_us_dataset_weights( - input_dataset_path=source_path, - output_dataset_path=output_path, - household_weights=np.asarray([7.0, 3.0], 
dtype=np.float64), - ) - - assert rewritten == output_path.resolve() - with h5py.File(output_path, "r") as handle: - assert np.allclose(handle["household_weight"]["2024"][:], np.asarray([7.0, 3.0])) - assert np.allclose( - handle["person_weight"]["2024"][:], - np.asarray([7.0, 7.0, 3.0]), - ) - assert np.allclose(handle["tax_unit_weight"]["2024"][:], np.asarray([7.0, 3.0])) - - -def test_rewrite_policyengine_us_dataset_weights_skips_empty_derived_weight_group( - tmp_path: Path, -): - """Published datasets (e.g. the production enhanced CPS) can leave derived - entity-weight groups empty because PolicyEngine computes those weights from - household_weight at runtime. Rewriting must skip such groups instead of - raising KeyError on the missing period dataset.""" - source_path = _build_stub_dataset(tmp_path / "input.h5") - # Mirror the production layout: a derived-weight group that exists but has - # no value for the period, with its id arrays present. - with h5py.File(source_path, "a") as handle: - handle.create_group("family_weight") # empty: no "2024" dataset - _write_time_period_array( - handle, "family_id", np.asarray([1000, 2000], dtype=np.int64) - ) - _write_time_period_array( - handle, - "person_family_id", - np.asarray([1000, 1000, 2000], dtype=np.int64), - ) - output_path = tmp_path / "output.h5" - - rewritten = rewrite_policyengine_us_dataset_weights( - input_dataset_path=source_path, - output_dataset_path=output_path, - household_weights=np.asarray([7.0, 3.0], dtype=np.float64), - ) - - with h5py.File(rewritten, "r") as handle: - # The primary household weights are still rewritten. - assert np.allclose(handle["household_weight"]["2024"][:], np.asarray([7.0, 3.0])) - # The empty derived-weight group is left untouched (still has no period). - assert "2024" not in handle["family_weight"] - # A populated sibling derived group is still propagated in the same - # call: the skip is per-group, not all-or-nothing. 
- assert np.allclose(handle["tax_unit_weight"]["2024"][:], np.asarray([7.0, 3.0])) - - -def test_optimize_policyengine_us_native_loss_dataset_rewrites_dataset(tmp_path: Path, monkeypatch): - source_path = _build_stub_dataset(tmp_path / "input.h5") - output_path = tmp_path / "optimized.h5" - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_optimization.resolve_policyengine_us_data_repo_root", - lambda repo: Path("/tmp/policyengine-us-data"), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_optimization.build_policyengine_us_data_subprocess_env", - lambda repo: {"PATH": "/usr/bin"}, - ) - - def _fake_run(args, **kwargs): - prefix = Path(args[-1]) - np.save( - prefix.with_suffix(".matrix.npy"), - np.asarray([[1.0, 0.0], [0.0, 1.0]], dtype=np.float64), - ) - np.save( - prefix.with_suffix(".target.npy"), - np.asarray([1.0, 2.0], dtype=np.float64), - ) - np.save( - prefix.with_suffix(".weights.npy"), - np.asarray([1.0, 2.0], dtype=np.float64), - ) - prefix.with_suffix(".meta.json").write_text( - json.dumps({"target_names": ["nation/foo", "state/bar"]}) - ) - return SimpleNamespace(returncode=0, stderr="", stdout="") - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_optimization.subprocess.run", - _fake_run, - ) - - result = optimize_policyengine_us_native_loss_dataset( - input_dataset_path=source_path, - output_dataset_path=output_path, - max_iter=100, - ) - - assert isinstance(result, PolicyEngineUSNativeWeightOptimizationResult) - assert result.output_dataset == str(output_path.resolve()) - assert result.optimized_loss <= result.initial_loss - with h5py.File(output_path, "r") as handle: - assert np.allclose( - handle["household_weight"]["2024"][:], - np.asarray([1.0, 2.0]), - ) diff --git a/tests/pipelines/test_pe_native_scores.py b/tests/pipelines/test_pe_native_scores.py deleted file mode 100644 index cc92c83a..00000000 --- a/tests/pipelines/test_pe_native_scores.py +++ /dev/null @@ -1,926 +0,0 @@ -"""Tests for PE-native scoring 
helpers.""" - -from __future__ import annotations - -import json -import os -from types import SimpleNamespace - -from microplex_us.pipelines.pe_native_scores import ( - PolicyEngineUSEnhancedCPSNativeScores, - annotate_pe_native_target_db_matches, - build_policyengine_us_data_pythonpath, - build_policyengine_us_data_subprocess_env, - build_us_pe_native_target_diagnostics_payload, - compare_us_pe_native_target_deltas, - compute_batch_us_pe_native_scores, - compute_batch_us_pe_native_support_audits, - compute_batch_us_pe_native_target_deltas, - compute_us_pe_native_scores, - compute_us_pe_native_support_audit, - parse_pe_native_target_lookup_key, - resolve_policyengine_us_data_python, - write_us_pe_native_scores, - write_us_pe_native_target_diagnostics, -) - -_MICROPLEX_SRC = __import__("microplex_us").__path__[0].rsplit("/microplex_us", 1)[0] - - -def test_compute_us_pe_native_scores_wraps_broad_loss(monkeypatch, tmp_path) -> None: - candidate = tmp_path / "candidate.h5" - baseline = tmp_path / "baseline.h5" - candidate.write_text("candidate") - baseline.write_text("baseline") - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.compute_policyengine_us_enhanced_cps_native_scores", - lambda *_args, **_kwargs: PolicyEngineUSEnhancedCPSNativeScores( - metric="enhanced_cps_native_loss", - period=2024, - candidate_dataset=str(candidate), - baseline_dataset=str(baseline), - candidate_enhanced_cps_native_loss=0.25, - baseline_enhanced_cps_native_loss=0.5, - enhanced_cps_native_loss_delta=-0.25, - candidate_unweighted_msre=0.3, - baseline_unweighted_msre=0.6, - unweighted_msre_delta=-0.3, - n_targets_total=2863, - n_targets_kept=2853, - n_targets_zero_dropped=10, - n_targets_bad_dropped=10, - n_national_targets=2000, - n_state_targets=853, - candidate_weight_sum=100.0, - baseline_weight_sum=200.0, - family_breakdown=( - { - "family": "state_age_distribution", - "n_targets": 900, - "candidate_loss_contribution": 0.1, - "baseline_loss_contribution": 0.05, - 
"loss_contribution_delta": 0.05, - "candidate_mean_weighted_loss": 0.2, - "baseline_mean_weighted_loss": 0.1, - "candidate_mean_unweighted_msre": 0.3, - "baseline_mean_unweighted_msre": 0.2, - "unweighted_msre_delta": 0.1, - }, - ), - ), - ) - - payload = compute_us_pe_native_scores( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - period=2024, - ) - - assert payload["metric"] == "enhanced_cps_native_loss" - assert payload["summary"]["candidate_enhanced_cps_native_loss"] == 0.25 - assert payload["summary"]["baseline_enhanced_cps_native_loss"] == 0.5 - assert payload["summary"]["enhanced_cps_native_loss_delta"] == -0.25 - assert payload["summary"]["candidate_beats_baseline"] is True - assert payload["summary"]["candidate_unweighted_msre"] == 0.3 - assert payload["summary"]["n_targets_kept"] == 2853 - assert payload["family_breakdown"][0]["family"] == "state_age_distribution" - assert payload["broad_loss"]["family_breakdown"][0]["n_targets"] == 900 - - -def test_write_us_pe_native_scores_persists_payload(monkeypatch, tmp_path) -> None: - candidate = tmp_path / "candidate.h5" - baseline = tmp_path / "baseline.h5" - output_path = tmp_path / "native.json" - candidate.write_text("candidate") - baseline.write_text("baseline") - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.compute_us_pe_native_scores", - lambda **_kwargs: { - "metric": "enhanced_cps_native_loss", - "summary": { - "candidate_enhanced_cps_native_loss": 0.2, - "baseline_enhanced_cps_native_loss": 0.4, - "enhanced_cps_native_loss_delta": -0.2, - }, - }, - ) - - written = write_us_pe_native_scores( - output_path, - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - ) - - assert written == output_path - assert output_path.exists() - - -def test_compute_batch_us_pe_native_scores_wraps_multiple_candidates( - monkeypatch, - tmp_path, -) -> None: - candidate_a = tmp_path / "candidate-a.h5" - candidate_b = tmp_path / "candidate-b.h5" - baseline = tmp_path 
/ "baseline.h5" - for path in (candidate_a, candidate_b, baseline): - path.write_text(path.stem) - - payload = [ - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "candidate_dataset": str(candidate_a), - "baseline_dataset": str(baseline), - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": -0.25, - "candidate_beats_baseline": True, - "candidate_unweighted_msre": 0.3, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": -0.3, - "n_targets_total": 2865, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 677, - "n_state_targets": 2176, - "candidate_weight_sum": 100.0, - "baseline_weight_sum": 200.0, - "family_breakdown": [ - { - "family": "state_age_distribution", - "n_targets": 900, - "candidate_loss_contribution": 0.1, - "baseline_loss_contribution": 0.05, - "loss_contribution_delta": 0.05, - "candidate_mean_weighted_loss": 0.2, - "baseline_mean_weighted_loss": 0.1, - "candidate_mean_unweighted_msre": 0.3, - "baseline_mean_unweighted_msre": 0.2, - "unweighted_msre_delta": 0.1, - } - ], - }, - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "candidate_dataset": str(candidate_b), - "baseline_dataset": str(baseline), - "candidate_enhanced_cps_native_loss": 0.75, - "baseline_enhanced_cps_native_loss": 0.5, - "enhanced_cps_native_loss_delta": 0.25, - "candidate_beats_baseline": False, - "candidate_unweighted_msre": 0.8, - "baseline_unweighted_msre": 0.6, - "unweighted_msre_delta": 0.2, - "n_targets_total": 2865, - "n_targets_kept": 2853, - "n_targets_zero_dropped": 10, - "n_targets_bad_dropped": 10, - "n_national_targets": 677, - "n_state_targets": 2176, - "candidate_weight_sum": 120.0, - "baseline_weight_sum": 200.0, - "family_breakdown": [ - { - "family": "state_agi_distribution", - "n_targets": 918, - "candidate_loss_contribution": 0.2, - "baseline_loss_contribution": 0.05, - 
"loss_contribution_delta": 0.15, - "candidate_mean_weighted_loss": 0.4, - "baseline_mean_weighted_loss": 0.1, - "candidate_mean_unweighted_msre": 0.5, - "baseline_mean_unweighted_msre": 0.2, - "unweighted_msre_delta": 0.3, - } - ], - }, - ] - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.subprocess.run", - lambda *_args, **_kwargs: SimpleNamespace( - returncode=0, - stdout=json.dumps(payload), - stderr="", - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.resolve_policyengine_us_data_repo_root", - lambda _repo=None: tmp_path, - ) - - results = compute_batch_us_pe_native_scores( - candidate_dataset_paths=[candidate_a, candidate_b], - baseline_dataset_path=baseline, - period=2024, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python=tmp_path / "python", - ) - - assert len(results) == 2 - assert results[0]["summary"]["candidate_beats_baseline"] is True - assert results[1]["summary"]["candidate_beats_baseline"] is False - assert results[1]["broad_loss"]["enhanced_cps_native_loss_delta"] == 0.25 - assert results[0]["family_breakdown"][0]["family"] == "state_age_distribution" - assert ( - results[1]["broad_loss"]["family_breakdown"][0]["family"] - == "state_agi_distribution" - ) - assert results[0]["timing"]["batch_candidate_count"] == 2 - assert results[0]["timing"]["batch_elapsed_seconds"] >= 0.0 - - -def test_build_policyengine_us_data_pythonpath_includes_sibling_microimpute( - tmp_path, -) -> None: - repo = tmp_path / "policyengine-us-data" - (repo / "policyengine_us_data").mkdir(parents=True) - microimpute = tmp_path / "microimpute" - (microimpute / "microimpute").mkdir(parents=True) - - pythonpath = build_policyengine_us_data_pythonpath( - repo, - existing_pythonpath="/tmp/existing-one:/tmp/existing-two", - ) - - assert pythonpath.split(os.pathsep) == [ - str(repo), - _MICROPLEX_SRC, - str(microimpute), - "/tmp/existing-one", - "/tmp/existing-two", - ] - - -def 
test_build_policyengine_us_data_subprocess_env_strips_outer_uv_markers( - tmp_path, -) -> None: - repo = tmp_path / "policyengine-us-data" - (repo / "policyengine_us_data").mkdir(parents=True) - microimpute = tmp_path / "microimpute" - (microimpute / "microimpute").mkdir(parents=True) - - env = build_policyengine_us_data_subprocess_env( - repo, - base_env={ - "HOME": "/tmp/home", - "PATH": "/usr/bin:/bin", - "VIRTUAL_ENV": "/tmp/outer-venv", - "UV_RUN_RECURSION_DEPTH": "1", - "PYTHONPATH": "/tmp/existing", - "KEEP_ME": "yes", - }, - ) - - assert env["HOME"] == "/tmp/home" - assert env["PATH"] == "/usr/bin:/bin" - assert "KEEP_ME" not in env - assert "VIRTUAL_ENV" not in env - assert "UV_RUN_RECURSION_DEPTH" not in env - assert env["PYTHONPATH"].split(os.pathsep) == [ - str(repo), - _MICROPLEX_SRC, - str(microimpute), - "/tmp/existing", - ] - - -def test_resolve_policyengine_us_data_python_preserves_venv_symlink_path( - tmp_path, -) -> None: - repo = tmp_path / "policyengine-us-data" - (repo / "policyengine_us_data").mkdir(parents=True) - real_python = tmp_path / "real-python" - real_python.write_text("#!/bin/sh\nexit 0\n") - real_python.chmod(0o755) - venv_python = repo / ".venv" / "bin" / "python" - venv_python.parent.mkdir(parents=True) - venv_python.symlink_to(real_python) - - resolved = resolve_policyengine_us_data_python(repo_root=repo) - - assert resolved == venv_python - - -def test_compare_us_pe_native_target_deltas_wraps_subprocess_payload( - monkeypatch, - tmp_path, -) -> None: - before = tmp_path / "before.h5" - after = tmp_path / "after.h5" - for path in (before, after): - path.write_text(path.stem) - - payload = { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": str(before), - "to_dataset": str(after), - "summary": { - "n_targets": 2, - "from_wins": 1, - "to_wins": 1, - "ties": 0, - "from_win_rate": 0.5, - "to_win_rate": 0.5, - "from_loss": 0.5, - "to_loss": 1.0, - "loss_delta": 0.5, - "mean_weighted_term_delta": 
0.5, - }, - "family_summaries": [ - { - "target_family": "national_irs_other", - "n_targets": 1, - "to_win_rate": 0.0, - "loss_delta": 1.5, - } - ], - "scope_summaries": [ - { - "target_scope": "national", - "n_targets": 1, - "to_win_rate": 0.0, - "loss_delta": 1.5, - } - ], - "targets": [ - { - "target_name": "nation/irs/example", - "target_family": "national_irs_other", - "target_scope": "national", - "winner": "from", - "weighted_term_delta": 1.5, - "from_weighted_term": 0.2, - "to_weighted_term": 1.7, - "target_value": 10.0, - "from_estimate": 1.0, - "to_estimate": 0.0, - "from_rel_error": 0.3, - "to_rel_error": 1.0, - "from_abs_pct_error": 90.0, - "to_abs_pct_error": 100.0, - } - ], - "top_regressions": [ - { - "target_name": "nation/irs/example", - "target_family": "national_irs_other", - "target_scope": "national", - "winner": "from", - "weighted_term_delta": 1.5, - "from_weighted_term": 0.2, - "to_weighted_term": 1.7, - "target_value": 10.0, - "from_estimate": 1.0, - "to_estimate": 0.0, - "from_rel_error": 0.3, - "to_rel_error": 1.0, - } - ], - "top_improvements": [ - { - "target_name": "state/example", - "target_family": "other", - "target_scope": "state", - "winner": "to", - "weighted_term_delta": -0.5, - "from_weighted_term": 0.8, - "to_weighted_term": 0.3, - "target_value": 12.0, - "from_estimate": 4.0, - "to_estimate": 8.0, - "from_rel_error": 0.7, - "to_rel_error": 0.2, - } - ], - } - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.subprocess.run", - lambda *_args, **_kwargs: SimpleNamespace( - returncode=0, - stdout=json.dumps(payload), - stderr="", - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.resolve_policyengine_us_data_repo_root", - lambda _repo=None: tmp_path, - ) - - result = compare_us_pe_native_target_deltas( - from_dataset_path=before, - to_dataset_path=after, - period=2024, - top_k=10, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python=tmp_path / "python", - ) - - assert 
result["metric"] == "enhanced_cps_native_loss_target_delta" - assert result["top_regressions"][0]["target_name"] == "nation/irs/example" - assert result["summary"]["to_win_rate"] == 0.5 - assert result["targets"][0]["target_family"] == "national_irs_other" - - -def test_parse_pe_native_target_lookup_key_maps_eitc_agi_child_labels() -> None: - amount_key = parse_pe_native_target_lookup_key("nation/irs/eitc/amount/c3_1_1k") - returns_key = parse_pe_native_target_lookup_key("nation/irs/eitc/returns/c2_1_1k") - - assert amount_key is not None - assert amount_key.variable == "eitc" - assert amount_key.count_children == 3 - assert amount_key.agi_lower == 1.0 - assert amount_key.agi_upper == 1_000.0 - assert amount_key.expected_target()["domain_variable"] == ( - "adjusted_gross_income,eitc,eitc_child_count" - ) - assert returns_key is not None - assert returns_key.variable == "tax_unit_count" - assert returns_key.count_children == 2 - - -def test_annotate_pe_native_target_db_matches_marks_matches_and_gaps( - monkeypatch, - tmp_path, -) -> None: - db_path = tmp_path / "policy_data.db" - db_path.write_text("stub") - matched_name = "nation/irs/eitc/amount/c3_1_1k" - matched_key = parse_pe_native_target_lookup_key(matched_name) - assert matched_key is not None - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores._load_policyengine_target_match_index", - lambda *_args, **_kwargs: { - matched_key.as_tuple(): [ - { - "target_id": 123, - "variable": "eitc", - "period": 2022, - "value": 2_079_000.0, - "source": "IRS SOI", - "notes": "Table 2.5", - "geo_level": "national", - "geographic_id": "US", - "domain_variable": ("adjusted_gross_income,eitc,eitc_child_count"), - "constraints": [ - { - "variable": "eitc_child_count", - "operation": ">", - "value": "2", - } - ], - } - ] - }, - ) - payload = { - "targets": [ - {"target_name": matched_name}, - {"target_name": "nation/irs/eitc/returns/c2_1_1k"}, - {"target_name": "nation/census/infants"}, - ], - "top_improvements": 
[{"target_name": matched_name}], - "top_regressions": [], - } - - annotate_pe_native_target_db_matches( - payload, - target_db_path=db_path, - period=2024, - ) - - assert payload["targets"][0]["policyengine_target_match"] == "matched" - assert payload["targets"][0]["policyengine_target_id"] == 123 - assert payload["targets"][0]["policyengine_target_geo_level"] == "national" - assert payload["targets"][0]["policyengine_target_geographic_id"] == "US" - assert payload["targets"][1]["policyengine_target_match"] == "legacy_only" - assert payload["targets"][1]["policyengine_target_expected"]["variable"] == ( - "tax_unit_count" - ) - assert payload["targets"][2]["policyengine_target_match"] == "unparsed" - assert payload["top_improvements"][0]["policyengine_target_match"] == "matched" - assert payload["target_db_summary"]["matched"] == 1 - assert payload["target_db_summary"]["legacy_only"] == 1 - assert payload["target_db_summary"]["unparsed"] == 1 - assert payload["target_db_summary"]["match_rate"] == 0.5 - - -def test_write_us_pe_native_target_diagnostics_persists_full_payload( - monkeypatch, - tmp_path, -) -> None: - before = tmp_path / "before.h5" - after = tmp_path / "after.h5" - output_path = tmp_path / "diagnostics.json" - for path in (before, after): - path.write_text(path.stem) - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.compare_us_pe_native_target_deltas", - lambda **_kwargs: { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": str(before), - "to_dataset": str(after), - "summary": {"n_targets": 1, "to_win_rate": 1.0}, - "family_summaries": [{"target_family": "national_irs_other"}], - "scope_summaries": [{"target_scope": "national"}], - "targets": [ - { - "target_name": "nation/irs/example", - "target_family": "national_irs_other", - "winner": "to", - "weighted_term_delta": -1.0, - } - ], - "top_regressions": [], - "top_improvements": [], - }, - ) - - written = write_us_pe_native_target_diagnostics( - 
output_path, - from_dataset_path=before, - to_dataset_path=after, - from_label="baseline", - to_label="candidate", - policyengine_targets_db_path=tmp_path / "missing.db", - ) - - payload = json.loads(written.read_text()) - assert written == output_path - assert payload["diagnostic_schema_version"] == 1 - assert payload["dataset_labels"] == {"from": "baseline", "to": "candidate"} - assert payload["targets"][0]["target_name"] == "nation/irs/example" - assert payload["targets"][0]["policyengine_target_match"] == "unparsed" - assert payload["target_db_summary"]["unparsed"] == 1 - - -def test_build_us_pe_native_target_diagnostics_payload_adds_public_aliases( - tmp_path, -) -> None: - payload = build_us_pe_native_target_diagnostics_payload( - target_delta_payload={ - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": "/tmp/enhanced_cps_2024.h5", - "to_dataset": "/tmp/policyengine_us.h5", - "summary": {"n_targets": 1}, - "targets": [ - { - "target_name": "nation/irs/eitc/returns/c2_0_1k", - "target_family": "national_irs_other", - "target_scope": "national", - "winner": "to", - "weighted_term_delta": -1.0, - "from_weighted_term": 2.0, - "to_weighted_term": 1.0, - "target_value": 100.0, - "from_estimate": 90.0, - "to_estimate": 95.0, - "from_rel_error": 0.2, - "to_rel_error": 0.1, - } - ], - "top_regressions": [ - { - "target_name": "nation/irs/eitc/returns/c2_0_1k", - "weighted_term_delta": -1.0, - } - ], - "top_improvements": [], - }, - from_label="policyengine-us-data", - to_label="microplex-us", - policyengine_targets_db_path=tmp_path / "missing.db", - artifact_id="artifact-1", - run_id="run-1", - ) - - row = payload["targets"][0] - assert payload["diagnostic_schema_version"] == 1 - assert payload["baseline_dataset"] == "/tmp/enhanced_cps_2024.h5" - assert payload["candidate_dataset"] == "/tmp/policyengine_us.h5" - assert row["target_id"] == "nation/irs/eitc/returns/c2_0_1k" - assert row["period"] == 2024 - assert row["variable"] == 
"tax_unit_count" - assert row["geo_level"] == "national" - assert row["geography"] == "US" - assert row["state"] is None - assert row["entity"] == "tax_unit" - assert row["artifact_id"] == "artifact-1" - assert row["run_id"] == "run-1" - assert row["us_data_aggregate"] == 90.0 - assert row["microplex_aggregate"] == 95.0 - assert row["us_data_absolute_error"] == 10.0 - assert row["microplex_absolute_error"] == 5.0 - assert row["delta_absolute_error"] == -5.0 - assert round(row["delta_relative_error"], 10) == -0.1 - assert row["loss_contribution"] == 1.0 - assert row["microplex_loss_contribution"] == 1.0 - assert row["candidate_loss_contribution"] == 1.0 - assert row["us_data_loss_contribution"] == 2.0 - assert row["policyengine_us_data_loss_contribution"] == 2.0 - assert row["baseline_loss_contribution"] == 2.0 - assert row["loss_contribution_delta"] == -1.0 - assert row["family"] == "national_irs_other" - assert row["in_loss"] is True - assert row["supported_by_microplex"] is True - top_row = payload["top_regressions"][0] - assert top_row["microplex_aggregate"] == 95.0 - assert top_row["artifact_id"] == "artifact-1" - - -def test_build_us_pe_native_target_diagnostics_payload_infers_legacy_metadata( - tmp_path, -) -> None: - def row(target_name: str, family: str, scope: str = "state") -> dict[str, object]: - return { - "target_name": target_name, - "target_family": family, - "target_scope": scope, - "winner": "to", - "weighted_term_delta": -1.0, - "from_weighted_term": 2.0, - "to_weighted_term": 1.0, - "target_value": 100.0, - "from_estimate": 90.0, - "to_estimate": 95.0, - "from_rel_error": 0.2, - "to_rel_error": 0.1, - } - - payload = build_us_pe_native_target_diagnostics_payload( - target_delta_payload={ - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": "/tmp/enhanced_cps_2024.h5", - "to_dataset": "/tmp/policyengine_us.h5", - "summary": {"n_targets": 3}, - "targets": [ - row("state/irs/aca_spending/ak", "other"), - 
row("US39/snap-hhs", "state_snap_households"), - row("nation/irs/count/count/AGI in 20k-25k/taxable/All", "national_irs_other", "national"), - row("state/census/age/AZ/75-79", "state_age_distribution"), - row("nation/cbo/income_by_source/qualified_dividend_income", "other", "national"), - row("nation/soi/filer_count/agi_1m_2m", "other", "national"), - row("nation/hhs/medicaid_enrollment", "other", "national"), - ], - "top_regressions": [], - "top_improvements": [], - }, - policyengine_targets_db_path=tmp_path / "missing.db", - ) - - aca, snap, irs_count, state_age, cbo_income, soi_count, hhs_medicaid = payload[ - "targets" - ] - assert aca["variable"] == "aca_spending" - assert aca["geography"] == "AK" - assert aca["state"] == "AK" - assert aca["entity"] == "tax_unit" - assert aca["family"] == "state_aca_spending" - assert snap["variable"] == "snap_households" - assert snap["geography"] == "US39" - assert snap["state"] == "US39" - assert snap["entity"] == "household" - assert irs_count["variable"] == "tax_unit_count" - assert irs_count["geography"] == "US" - assert irs_count["entity"] == "tax_unit" - assert state_age["variable"] == "age" - assert state_age["geography"] == "AZ" - assert state_age["entity"] == "person" - assert cbo_income["variable"] == "qualified_dividend_income" - assert cbo_income["entity"] == "tax_unit" - assert soi_count["variable"] == "filer_count" - assert soi_count["entity"] == "tax_unit" - assert hhs_medicaid["variable"] == "medicaid_enrollment" - assert hhs_medicaid["entity"] == "person" - - -def test_compute_batch_us_pe_native_target_deltas_wraps_multiple_candidates( - monkeypatch, - tmp_path, -) -> None: - candidate_a = tmp_path / "candidate-a.h5" - candidate_b = tmp_path / "candidate-b.h5" - baseline = tmp_path / "baseline.h5" - for path in (candidate_a, candidate_b, baseline): - path.write_text(path.stem) - - payload = [ - { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": str(baseline), - 
"to_dataset": str(candidate_a), - "top_regressions": [{"target_name": "nation/irs/example-a"}], - "top_improvements": [{"target_name": "state/example-a"}], - }, - { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": str(baseline), - "to_dataset": str(candidate_b), - "top_regressions": [{"target_name": "nation/irs/example-b"}], - "top_improvements": [{"target_name": "state/example-b"}], - }, - ] - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.subprocess.run", - lambda *_args, **_kwargs: SimpleNamespace( - returncode=0, - stdout=json.dumps(payload), - stderr="", - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.resolve_policyengine_us_data_repo_root", - lambda _repo=None: tmp_path, - ) - - results = compute_batch_us_pe_native_target_deltas( - candidate_dataset_paths=[candidate_a, candidate_b], - baseline_dataset_path=baseline, - period=2024, - top_k=10, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python=tmp_path / "python", - ) - - assert len(results) == 2 - assert results[0]["to_dataset"] == str(candidate_a) - assert results[1]["top_regressions"][0]["target_name"] == "nation/irs/example-b" - - -def test_compute_us_pe_native_support_audit_wraps_subprocess_payload( - monkeypatch, - tmp_path, -) -> None: - candidate = tmp_path / "candidate.h5" - baseline = tmp_path / "baseline.h5" - for path in (candidate, baseline): - path.write_text(path.stem) - - payload = { - "metric": "enhanced_cps_support_audit", - "period": 2024, - "candidate_dataset": str(candidate), - "baseline_dataset": str(baseline), - "candidate": { - "critical_input_support": { - "child_support_expense": { - "stored": False, - "weighted_nonzero": 0.0, - } - } - }, - "baseline": { - "critical_input_support": { - "child_support_expense": { - "stored": True, - "weighted_nonzero": 10.0, - } - } - }, - "comparisons": { - "critical_input_support": [ - { - "variable": "child_support_expense", - "candidate_stored": 
False, - "baseline_stored": True, - "weighted_nonzero_delta": -10.0, - } - ] - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.subprocess.run", - lambda *_args, **_kwargs: SimpleNamespace( - returncode=0, - stdout=json.dumps(payload), - stderr="", - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.resolve_policyengine_us_data_repo_root", - lambda _repo=None: tmp_path, - ) - - result = compute_us_pe_native_support_audit( - candidate_dataset_path=candidate, - baseline_dataset_path=baseline, - period=2024, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python=tmp_path / "python", - ) - - assert result["metric"] == "enhanced_cps_support_audit" - assert ( - result["comparisons"]["critical_input_support"][0]["variable"] - == "child_support_expense" - ) - - -def test_compute_batch_us_pe_native_support_audits_wraps_multiple_candidates( - monkeypatch, - tmp_path, -) -> None: - candidate_a = tmp_path / "candidate-a.h5" - candidate_b = tmp_path / "candidate-b.h5" - baseline = tmp_path / "baseline.h5" - for path in (candidate_a, candidate_b, baseline): - path.write_text(path.stem) - - payload = { - "metric": "enhanced_cps_support_audit_batch", - "period": 2024, - "baseline_dataset": str(baseline), - "baseline": { - "critical_input_support": { - "child_support_expense": {"stored": True, "weighted_nonzero": 10.0} - } - }, - "results": [ - { - "candidate_dataset": str(candidate_a), - "candidate": { - "critical_input_support": { - "child_support_expense": { - "stored": False, - "weighted_nonzero": 0.0, - } - } - }, - "comparisons": { - "critical_input_support": [ - { - "variable": "child_support_expense", - "candidate_stored": False, - "baseline_stored": True, - "weighted_nonzero_delta": -10.0, - } - ] - }, - }, - { - "candidate_dataset": str(candidate_b), - "candidate": { - "critical_input_support": { - "child_support_expense": { - "stored": True, - "weighted_nonzero": 12.0, - } - } - }, - "comparisons": { - 
"critical_input_support": [ - { - "variable": "child_support_expense", - "candidate_stored": True, - "baseline_stored": True, - "weighted_nonzero_delta": 2.0, - } - ] - }, - }, - ], - } - - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.subprocess.run", - lambda *_args, **_kwargs: SimpleNamespace( - returncode=0, - stdout=json.dumps(payload), - stderr="", - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_native_scores.resolve_policyengine_us_data_repo_root", - lambda _repo=None: tmp_path, - ) - - results = compute_batch_us_pe_native_support_audits( - candidate_dataset_paths=[candidate_a, candidate_b], - baseline_dataset_path=baseline, - period=2024, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python=tmp_path / "python", - ) - - assert len(results) == 2 - assert results[0]["baseline_dataset"] == str(baseline) - assert results[0]["candidate_dataset"] == str(candidate_a) - assert ( - results[1]["comparisons"]["critical_input_support"][0]["weighted_nonzero_delta"] - == 2.0 - ) diff --git a/tests/pipelines/test_pe_us_data_rebuild.py b/tests/pipelines/test_pe_us_data_rebuild.py deleted file mode 100644 index f60f4af9..00000000 --- a/tests/pipelines/test_pe_us_data_rebuild.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Tests for the PE-US-data rebuild program spec.""" - -from __future__ import annotations - -from microplex_us.data_sources.cps import CPSASECSourceProvider -from microplex_us.data_sources.donor_surveys import ( - ACSSourceProvider, - SCFSourceProvider, - SIPPSourceProvider, -) -from microplex_us.data_sources.puf import ( - PUF_UPRATING_MODE_PE_SOI, - SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF, - PUFSourceProvider, -) -from microplex_us.pipelines.pe_us_data_rebuild import ( - PEUSDataRebuildStatus, - build_policyengine_us_data_rebuild_markdown, - build_policyengine_us_data_rebuild_pipeline, - default_policyengine_us_data_rebuild_config, - default_policyengine_us_data_rebuild_program, - 
default_policyengine_us_data_rebuild_source_providers, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexPipeline, -) - - -def test_default_policyengine_us_data_rebuild_program_has_expected_core_stages() -> ( - None -): - program = default_policyengine_us_data_rebuild_program() - - assert program.program_id == "pe-us-data-rebuild-v1" - assert "change results only on the margin" in program.principle - - stage_ids = [stage.stage_id for stage in program.stages] - assert len(stage_ids) == len(set(stage_ids)) - assert stage_ids == [ - "source-contracts", - "cps-construction", - "puf-ingestion-uprating", - "extended-cps-qrf", - "family-imputation-parity", - "entity-export-parity", - "weighting-backend", - "targets-and-eval", - ] - - weighting_stage = next( - stage for stage in program.stages if stage.stage_id == "weighting-backend" - ) - assert weighting_stage.current_status is PEUSDataRebuildStatus.CLOSE - assert "policyengine_us_data.calibration.unified_calibration" in ( - weighting_stage.pe_owner_modules - ) - - -def test_build_policyengine_us_data_rebuild_markdown_mentions_parity_rule() -> None: - markdown = build_policyengine_us_data_rebuild_markdown() - - assert "# Rebuild PE-US-data in Microplex" in markdown - assert "Structural improvements are allowed" in markdown - assert "### CPS construction parity" in markdown - assert "`status`: `partial`" in markdown - - -def test_default_policyengine_us_data_rebuild_config_uses_incumbent_defaults() -> None: - config = default_policyengine_us_data_rebuild_config( - random_seed=123, - cps_asec_source_year=2022, - ) - - assert isinstance(config, USMicroplexBuildConfig) - assert config.synthesis_backend == "seed" - assert config.calibration_backend == "entropy" - assert config.policyengine_calibration_min_active_households == 20 - assert config.policyengine_calibration_deferred_stage_min_active_households == ( - 10, - 1, - ) - assert 
config.policyengine_calibration_deferred_stage_max_constraints == 24 - assert ( - config.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - is None - ) - assert config.policyengine_calibration_deferred_stage_top_family_count == 7 - assert config.policyengine_calibration_deferred_stage_top_geography_count == 4 - assert config.donor_imputer_backend == "regime_aware" - assert config.donor_imputer_condition_selection == "pe_prespecified" - assert config.donor_imputer_excluded_variables == () - assert config.puf_support_clone_enabled is True - assert config.puf_support_clone_output_mode == "collapse_to_scaffold" - assert config.policyengine_direct_override_variables == ( - "health_savings_account_ald", - "non_sch_d_capital_gains", - ) - assert config.policyengine_prefer_existing_tax_unit_ids is True - assert config.random_seed == 123 - assert config.cps_asec_source_year == 2022 - - -def test_default_policyengine_us_data_rebuild_config_rejects_legacy_imputer_for_puf_support_clone() -> ( - None -): - try: - default_policyengine_us_data_rebuild_config(donor_imputer_backend="qrf") - except ValueError as exc: - message = str(exc) - assert "PUF support clone rebuilds require" in message - assert "donor_imputer_backend='regime_aware'" in message - else: - raise AssertionError("Expected PUF support clone qrf rebuild to fail") - - -def test_default_policyengine_us_data_rebuild_config_allows_legacy_imputer_when_puf_support_clone_disabled() -> ( - None -): - config = default_policyengine_us_data_rebuild_config( - puf_support_clone_enabled=False, - donor_imputer_backend="qrf", - ) - - assert config.puf_support_clone_enabled is False - assert config.donor_imputer_backend == "qrf" - - -def test_default_policyengine_us_data_rebuild_config_respects_calibration_support_override() -> ( - None -): - config = default_policyengine_us_data_rebuild_config( - policyengine_calibration_min_active_households=5 - ) - - assert 
config.policyengine_calibration_min_active_households == 5 - - -def test_default_policyengine_us_data_rebuild_source_providers_use_pe_style_bundle() -> ( - None -): - providers = default_policyengine_us_data_rebuild_source_providers( - cps_source_year=2022, - puf_target_year=2024, - cps_download=False, - puf_expand_persons=False, - policyengine_us_data_python="/tmp/pe-python", - ) - - assert len(providers) == 6 - cps_provider, puf_provider = providers[:2] - assert isinstance(cps_provider, CPSASECSourceProvider) - assert cps_provider.year == 2022 - assert cps_provider.download is False - assert isinstance(puf_provider, PUFSourceProvider) - assert puf_provider.target_year == 2024 - assert puf_provider.cps_reference_year == 2022 - assert puf_provider.expand_persons is False - assert puf_provider.uprating_mode == PUF_UPRATING_MODE_PE_SOI - assert puf_provider.policyengine_us_data_python == "/tmp/pe-python" - assert puf_provider.impute_pre_tax_contributions is False - assert puf_provider.require_pre_tax_contribution_model is False - assert puf_provider.social_security_split_strategy == ( - SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF - ) - assert isinstance(providers[2], ACSSourceProvider) - assert providers[2].year == 2024 - assert isinstance(providers[3], SIPPSourceProvider) - assert providers[3].block == "tips" - assert providers[3].target_year == 2024 - assert providers[3].policyengine_us_data_python == "/tmp/pe-python" - assert isinstance(providers[4], SIPPSourceProvider) - assert providers[4].block == "assets" - assert providers[4].target_year == 2024 - assert providers[4].policyengine_us_data_python == "/tmp/pe-python" - assert isinstance(providers[5], SCFSourceProvider) - assert providers[5].target_year == 2024 - - -def test_default_policyengine_us_data_rebuild_source_providers_keeps_acs_when_donor_surveys_disabled() -> ( - None -): - # include_donor_surveys=False disables the SIPP/SCF donors, but the ACS donor is - # always enabled (it supplies the rent / 
real_estate_taxes imputation), so it - # remains alongside the CPS spine and PUF. - providers = default_policyengine_us_data_rebuild_source_providers( - include_donor_surveys=False, - cps_download=False, - ) - - assert len(providers) == 3 - assert isinstance(providers[0], CPSASECSourceProvider) - assert isinstance(providers[1], PUFSourceProvider) - assert isinstance(providers[2], ACSSourceProvider) - - -def test_default_policyengine_us_data_rebuild_source_providers_can_include_donor_surveys() -> ( - None -): - providers = default_policyengine_us_data_rebuild_source_providers( - include_donor_surveys=True, - cps_download=False, - ) - - assert len(providers) == 6 - assert isinstance(providers[0], CPSASECSourceProvider) - assert isinstance(providers[1], PUFSourceProvider) - assert isinstance(providers[2], ACSSourceProvider) - assert isinstance(providers[3], SIPPSourceProvider) - assert providers[3].block == "tips" - assert providers[3].target_year == 2024 - assert isinstance(providers[4], SIPPSourceProvider) - assert providers[4].block == "assets" - assert providers[4].target_year == 2024 - assert isinstance(providers[5], SCFSourceProvider) - assert providers[5].target_year == 2024 - - -def test_build_policyengine_us_data_rebuild_pipeline_returns_configured_pipeline() -> ( - None -): - pipeline = build_policyengine_us_data_rebuild_pipeline( - random_seed=321, - calibration_max_iter=77, - ) - - assert isinstance(pipeline, USMicroplexPipeline) - assert pipeline.config.random_seed == 321 - assert pipeline.config.calibration_max_iter == 77 - assert pipeline.config.synthesis_backend == "seed" - assert pipeline.config.calibration_backend == "entropy" diff --git a/tests/pipelines/test_pe_us_data_rebuild_audit.py b/tests/pipelines/test_pe_us_data_rebuild_audit.py deleted file mode 100644 index 24ca0624..00000000 --- a/tests/pipelines/test_pe_us_data_rebuild_audit.py +++ /dev/null @@ -1,261 +0,0 @@ -"""Tests for PE-US-data rebuild native audit sidecars.""" - -import json - 
-from microplex_us.pipelines.pe_us_data_rebuild_audit import ( - build_policyengine_us_data_rebuild_native_audit, - write_policyengine_us_data_rebuild_native_audit, -) - - -def test_build_policyengine_us_data_rebuild_native_audit_summarizes_saved_artifact( - tmp_path, - monkeypatch, -): - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - candidate_dataset = artifact_dir / "policyengine_us.h5" - candidate_dataset.write_text("dataset") - baseline_dataset = tmp_path / "enhanced_cps_2024.h5" - baseline_dataset.write_text("baseline") - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "config": { - "policyengine_baseline_dataset": str(baseline_dataset), - "policyengine_dataset_year": 2024, - "policyengine_target_period": 2024, - }, - "artifacts": { - "policyengine_dataset": candidate_dataset.name, - }, - } - ) - ) - (artifact_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_enhanced_cps_native_loss": 0.98, - "baseline_enhanced_cps_native_loss": 0.02, - "enhanced_cps_native_loss_delta": 0.96, - "candidate_beats_baseline": False, - }, - "family_breakdown": [ - { - "family": "national_irs_other", - "loss_contribution_delta": 0.30, - }, - { - "family": "state_agi_distribution", - "loss_contribution_delta": 0.20, - }, - { - "family": "state_snap_cost", - "loss_contribution_delta": -0.01, - }, - ], - } - ) - ) - (artifact_dir / "imputation_ablation.json").write_text( - json.dumps( - { - "summary": { - "source_count": 5, - "target_count": 94, - "production_variant": "structured_pe_conditioning", - "production_mean_weighted_mae": 34116.09, - "production_mean_support_f1": 0.5375, - "best_mean_weighted_mae_variant": "top_correlated_qrf", - "best_mean_support_f1_variant": "structured_pe_conditioning", - "variant_scorecard": { - "structured_pe_conditioning": { - "mean_weighted_mae": 34116.09, - "mean_support_f1": 0.5375, - }, - "top_correlated_qrf": { 
- "mean_weighted_mae": 32873.70, - "mean_support_f1": 0.5352, - }, - }, - } - } - ) - ) - - def fake_target_delta(**kwargs): - assert kwargs["from_dataset_path"] == baseline_dataset - assert kwargs["to_dataset_path"] == candidate_dataset - return { - "top_regressions": [ - { - "target_name": "state/CA/adjusted_gross_income/amount/100k_200k", - "weighted_term_delta": 0.12, - } - ], - "top_improvements": [ - { - "target_name": "state/NY/snap-cost", - "weighted_term_delta": -0.03, - } - ], - } - - def fake_support_audit(**kwargs): - assert kwargs["candidate_dataset_path"] == candidate_dataset - assert kwargs["baseline_dataset_path"] == baseline_dataset - return { - "comparisons": { - "critical_input_support": [ - { - "variable": "has_esi", - "candidate_stored": False, - "baseline_stored": True, - "weighted_nonzero_delta": -1250.0, - }, - { - "variable": "rental_income", - "candidate_stored": True, - "baseline_stored": True, - "weighted_nonzero_delta": -250.0, - }, - ], - "filing_status_weighted_delta": [ - { - "filing_status": "SEPARATE", - "weighted_count_delta": -400.0, - } - ], - "mfs_high_agi_delta": [ - { - "agi_bin": "200k_to_500k", - "weighted_count_delta": -75.0, - } - ], - "hoh_agi_delta": [ - { - "agi_bin": "500k_to_1m", - "weighted_count_delta": 500.0, - } - ], - "ssi_by_age_delta": [ - { - "age_bucket": "65_plus", - "weighted_recipient_delta": 400.0, - } - ], - "medicare_part_b_premiums_by_age_delta": [ - { - "age_bucket": "age_10_to_19", - "weighted_positive_delta": 300.0, - } - ], - "state_aca_ptc_spending_top_gaps": [ - { - "state": "CA", - "weighted_aca_ptc_delta": -220.0, - } - ], - "state_marketplace_enrollment_top_gaps": [ - { - "state": "CA", - "weighted_marketplace_enrollment_delta": -210.0, - } - ], - "state_age_bucket_top_gaps": [ - { - "state": "TX", - "age_bucket": "18_to_29", - "weight_delta": -180.0, - } - ], - } - } - - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_audit.compare_us_pe_native_target_deltas", - 
fake_target_delta, - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_audit.compute_us_pe_native_support_audit", - fake_support_audit, - ) - - audit = build_policyengine_us_data_rebuild_native_audit(artifact_dir, top_k=2) - - assert audit["artifactId"] == "run-1" - assert audit["nativeBroadLossSummary"]["enhanced_cps_native_loss_delta"] == 0.96 - assert audit["topFamilyRegressions"][0]["family"] == "national_irs_other" - assert audit["topFamilyImprovements"][0]["family"] == "state_snap_cost" - assert audit["topTargetRegressions"][0]["target_name"].startswith("state/CA/") - assert audit["supportAuditSummary"]["missingStoredCriticalInputs"] == ["has_esi"] - assert audit["supportAuditSummary"]["topCriticalInputSupportGaps"][0]["variable"] == "has_esi" - assert audit["supportAuditSummary"]["topHoHAgiGaps"][0]["agi_bin"] == "500k_to_1m" - assert audit["supportAuditSummary"]["topSSIByAgeGaps"][0]["age_bucket"] == "65_plus" - assert audit["supportAuditSummary"]["topMedicarePartBByAgeGaps"][0]["age_bucket"] == ( - "age_10_to_19" - ) - assert audit["supportAuditSummary"]["topAcaPtcSpendingGaps"][0]["state"] == "CA" - assert ( - audit["imputationAblationSummary"]["best_mean_weighted_mae_variant"] - == "top_correlated_qrf" - ) - assert audit["verdictHints"]["largestRegressingFamily"] == "national_irs_other" - assert audit["verdictHints"]["largestRegressingTarget"].startswith("state/CA/") - assert audit["verdictHints"]["productionImputationVariant"] == "structured_pe_conditioning" - assert audit["verdictHints"]["productionImputationVariantIsMaeWinner"] is False - assert audit["verdictHints"]["productionImputationVariantIsSupportWinner"] is True - - -def test_write_policyengine_us_data_rebuild_native_audit_writes_default_sidecar( - tmp_path, - monkeypatch, -): - artifact_dir = tmp_path / "run-2" - artifact_dir.mkdir() - candidate_dataset = artifact_dir / "policyengine_us.h5" - candidate_dataset.write_text("dataset") - baseline_dataset = tmp_path / 
"enhanced_cps_2024.h5" - baseline_dataset.write_text("baseline") - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "config": {"policyengine_baseline_dataset": str(baseline_dataset)}, - "artifacts": {"policyengine_dataset": candidate_dataset.name}, - } - ) - ) - (artifact_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "summary": { - "candidate_enhanced_cps_native_loss": 1.0, - "baseline_enhanced_cps_native_loss": 0.1, - "enhanced_cps_native_loss_delta": 0.9, - }, - "family_breakdown": [], - } - ) - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_audit.compare_us_pe_native_target_deltas", - lambda **_kwargs: {"top_regressions": [], "top_improvements": []}, - ) - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_audit.compute_us_pe_native_support_audit", - lambda **_kwargs: {"comparisons": {}}, - ) - - output_path = write_policyengine_us_data_rebuild_native_audit(artifact_dir) - - assert output_path == artifact_dir / "pe_us_data_rebuild_native_audit.json" - payload = json.loads(output_path.read_text()) - assert payload["candidateDatasetPath"] == str(candidate_dataset) - assert payload["baselineDatasetPath"] == str(baseline_dataset.resolve()) - assert payload["imputationAblationSummary"] is None - assert payload["verdictHints"]["productionImputationVariant"] is None - assert payload["verdictHints"]["productionImputationVariantIsMaeWinner"] is None - assert payload["verdictHints"]["productionImputationVariantIsSupportWinner"] is None diff --git a/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py b/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py deleted file mode 100644 index 49abbc8e..00000000 --- a/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py +++ /dev/null @@ -1,2401 +0,0 @@ -"""Tests for the PE-US-data rebuild checkpoint runner.""" - -from __future__ import annotations - -import json -import logging -from dataclasses import dataclass -from pathlib import Path -from 
types import SimpleNamespace -from typing import Any - -import h5py -import pandas as pd -import pytest -from microplex.core import SourceQuery - -import microplex_us.pipelines.pe_us_data_rebuild_checkpoint_artifacts as checkpoint_artifacts -import microplex_us.pipelines.pe_us_data_rebuild_checkpoint_cli as checkpoint_cli -import microplex_us.pipelines.pe_us_data_rebuild_checkpoint_common as checkpoint_common -import microplex_us.pipelines.pe_us_data_rebuild_checkpoint_resume as checkpoint_resume -import microplex_us.pipelines.pe_us_data_rebuild_checkpoint_runner as checkpoint_runner -from microplex_us.pipelines.artifacts import ( - USMicroplexArtifactPaths, - USMicroplexVersionedBuildArtifacts, -) -from microplex_us.pipelines.pe_us_data_rebuild import ( - default_policyengine_us_data_rebuild_source_providers, -) -from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( - attach_policyengine_us_data_rebuild_checkpoint_evidence, - default_policyengine_us_data_rebuild_checkpoint_config, - default_policyengine_us_data_rebuild_queries, - run_policyengine_us_data_rebuild_checkpoint, -) -from microplex_us.pipelines.registry import load_us_microplex_run_registry -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - US_STAGE_CONTRACT_VERSION, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, -) -from microplex_us.pipelines.stage_resume import preflight_us_stage_resume - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_sets_pe_context() -> ( - None -): - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - target_profile="pe_native_broad", - n_synthetic=500, - random_seed=123, - ) - - assert config.synthesis_backend == "seed" - assert config.calibration_backend == "entropy" - assert 
config.policyengine_calibration_min_active_households == 20 - assert config.policyengine_calibration_deferred_stage_min_active_households == ( - 10, - 1, - ) - assert config.policyengine_calibration_deferred_stage_max_constraints == 24 - assert ( - config.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - is None - ) - assert config.policyengine_calibration_deferred_stage_top_family_count == 7 - assert config.policyengine_calibration_deferred_stage_top_geography_count == 4 - assert config.donor_imputer_backend == "regime_aware" - assert config.donor_imputer_condition_selection == "pe_prespecified" - assert config.donor_imputer_excluded_variables == () - assert config.policyengine_baseline_dataset == "/tmp/enhanced_cps_2024.h5" - assert config.policyengine_targets_db == "/tmp/policy_data.db" - assert config.policyengine_dataset_year == 2024 - assert config.policyengine_target_period == 2024 - assert config.policyengine_target_profile == "pe_native_broad" - assert config.policyengine_calibration_target_profile == "pe_native_broad" - assert config.policyengine_calibration_target_variables == () - assert config.policyengine_oracle_relative_error_cap == 10.0 - assert config.policyengine_direct_override_variables == ( - "health_savings_account_ald", - "non_sch_d_capital_gains", - ) - assert config.policyengine_prefer_existing_tax_unit_ids is True - assert config.n_synthetic == 500 - assert config.random_seed == 123 - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_rejects_legacy_imputer_for_puf_support_clone() -> ( - None -): - try: - default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - donor_imputer_backend="qrf", - ) - except ValueError as exc: - message = str(exc) - assert "PUF support clone rebuilds require" in message - assert "donor_imputer_backend='regime_aware'" in message - else: - raise 
AssertionError("Expected checkpoint qrf rebuild to fail") - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_preserves_explicit_calibration_scope() -> ( - None -): - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - calibration_target_variables=("snap",), - ) - - assert config.policyengine_calibration_target_variables == ("snap",) - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_uses_arch_source_backed_calibration_scope() -> ( - None -): - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - arch_targets_db=( - "/tmp/arch/fixtures/consumer_facts.jsonl", - "/tmp/arch/macro/targets.db", - ), - calibration_target_source="arch", - ) - - assert config.policyengine_target_profile == "pe_native_broad" - assert ( - config.policyengine_calibration_target_profile - == "pe_native_broad_source_backed" - ) - assert config.calibration_target_source == "arch" - assert config.arch_targets_db == ( - "/tmp/arch/fixtures/consumer_facts.jsonl", - "/tmp/arch/macro/targets.db", - ) - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_requires_arch_targets_for_arch_calibration() -> ( - None -): - try: - default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - calibration_target_source="arch", - ) - except ValueError as exc: - assert "arch_targets_db is required" in str(exc) - else: - raise AssertionError("Expected arch calibration without targets DB to fail") - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_infers_total_weight_targets( - monkeypatch, -) -> None: - monkeypatch.setattr( - 
"microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config._infer_policyengine_baseline_household_weight_sum", - lambda dataset, *, target_period: 150_000_000.0, - ) - - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ) - - assert config.policyengine_calibration_target_total_weight == 150_000_000.0 - assert config.policyengine_calibration_rescale_to_target_total_weight is True - assert config.policyengine_selection_target_total_weight == 150_000_000.0 - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_respects_explicit_total_weight_overrides( - monkeypatch, -) -> None: - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config._infer_policyengine_baseline_household_weight_sum", - lambda dataset, *, target_period: 150_000_000.0, - ) - - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - policyengine_calibration_target_total_weight=123.0, - policyengine_selection_target_total_weight=456.0, - ) - - assert config.policyengine_calibration_target_total_weight == 123.0 - assert config.policyengine_selection_target_total_weight == 456.0 - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_skips_calibration_total_weight_when_rescaling_to_input_sum( - monkeypatch, -) -> None: - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config._infer_policyengine_baseline_household_weight_sum", - lambda dataset, *, target_period: 150_000_000.0, - ) - - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - 
policyengine_calibration_rescale_to_input_weight_sum=True, - ) - - assert config.policyengine_calibration_target_total_weight is None - assert config.policyengine_calibration_rescale_to_target_total_weight is False - assert config.policyengine_selection_target_total_weight == 150_000_000.0 - - -def test_default_policyengine_us_data_rebuild_checkpoint_config_skips_inferred_total_weight_targets_for_no_calibration( - monkeypatch, -) -> None: - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config._infer_policyengine_baseline_household_weight_sum", - lambda dataset, *, target_period: 150_000_000.0, - ) - - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - calibration_backend="none", - ) - - assert config.calibration_backend == "none" - assert config.policyengine_calibration_target_total_weight is None - assert config.policyengine_calibration_rescale_to_target_total_weight is False - assert config.policyengine_selection_target_total_weight is None - - -def test_infer_policyengine_baseline_household_weight_sum_returns_none_when_weight_array_missing( - tmp_path, -) -> None: - from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( - _infer_policyengine_baseline_household_weight_sum, - ) - - dataset_path = tmp_path / "baseline.h5" - with h5py.File(dataset_path, "w") as handle: - household_id = handle.create_group("household_id") - household_id.create_dataset("2024", data=[1, 2, 3]) - - inferred = _infer_policyengine_baseline_household_weight_sum( - dataset_path, - target_period=2024, - ) - - assert inferred is None - - -def test_default_policyengine_us_data_rebuild_queries_assign_sample_sizes_by_provider_type() -> ( - None -): - providers = default_policyengine_us_data_rebuild_source_providers( - include_donor_surveys=True, - cps_download=False, - ) - - queries = 
default_policyengine_us_data_rebuild_queries( - providers, - cps_sample_n=11, - puf_sample_n=22, - donor_sample_n=33, - random_seed=7, - ) - - assert queries[providers[0].descriptor.name].provider_filters == { - "sample_n": 11, - "random_seed": 7, - "state_age_floor": 1, - } - assert queries[providers[1].descriptor.name].provider_filters == { - "sample_n": 22, - "random_seed": 7, - } - for provider in providers[2:]: - assert queries[provider.descriptor.name].provider_filters == { - "sample_n": 33, - "random_seed": 7, - "state_age_floor": 1, - } - - -def test_default_policyengine_us_data_rebuild_queries_derive_donor_sample_size_from_sampled_sources() -> ( - None -): - providers = default_policyengine_us_data_rebuild_source_providers( - include_donor_surveys=True, - cps_download=False, - ) - - queries = default_policyengine_us_data_rebuild_queries( - providers, - cps_sample_n=11, - puf_sample_n=22, - random_seed=7, - ) - - assert queries[providers[0].descriptor.name].provider_filters == { - "sample_n": 11, - "random_seed": 7, - "state_age_floor": 1, - } - for provider in providers[2:]: - assert queries[provider.descriptor.name].provider_filters == { - "sample_n": 22, - "random_seed": 7, - "state_age_floor": 1, - } - - -def test_default_policyengine_us_data_rebuild_queries_can_disable_cps_state_age_floor() -> ( - None -): - providers = default_policyengine_us_data_rebuild_source_providers( - include_donor_surveys=False, - cps_download=False, - ) - - queries = default_policyengine_us_data_rebuild_queries( - providers, - cps_sample_n=11, - puf_sample_n=22, - cps_state_age_floor=None, - random_seed=7, - ) - - assert queries[providers[0].descriptor.name].provider_filters == { - "sample_n": 11, - "random_seed": 7, - } - - -@dataclass(frozen=True) -class _FakeProvider: - descriptor: Any - - def load_frame(self, query: Any | None = None) -> SimpleNamespace: - return SimpleNamespace( - source=SimpleNamespace(name=self.descriptor.name), - query=query, - ) - - -def 
_fake_resume_dataframe() -> pd.DataFrame: - return pd.DataFrame( - { - "household_id": [1], - "person_id": [1], - "weight": [1.0], - } - ) - - -def _fake_policyengine_entity_tables(): - return checkpoint_resume.PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "weight": [1.0]}), - persons=pd.DataFrame({"person_id": [1], "household_id": [1]}), - tax_units=pd.DataFrame({"tax_unit_id": [1], "household_id": [1]}), - spm_units=pd.DataFrame({"spm_unit_id": [1], "household_id": [1]}), - families=pd.DataFrame({"family_id": [1], "household_id": [1]}), - marital_units=pd.DataFrame({"marital_unit_id": [1], "household_id": [1]}), - ) - - -def _fake_resume_targets(): - return checkpoint_resume.USMicroplexTargets(marginal={}, continuous={}) - - -def _fake_resume_build_result(config, *, scaffold_seed_data=None): - policyengine_tables = _fake_policyengine_entity_tables() - return checkpoint_resume.USMicroplexBuildResult( - config=config, - seed_data=_fake_resume_dataframe(), - synthetic_data=_fake_resume_dataframe(), - calibrated_data=_fake_resume_dataframe(), - targets=_fake_resume_targets(), - calibration_summary={"converged": True}, - synthesis_metadata={"source_names": ["fake_source"]}, - policyengine_tables=policyengine_tables, - pre_calibration_policyengine_tables=policyengine_tables, - scaffold_seed_data=scaffold_seed_data, - ) - - -def _install_resume_stage_test_doubles(monkeypatch, artifact_root, captured) -> None: - captured.setdefault("finalized", []) - captured.setdefault("pipeline_calls", []) - captured.setdefault("runtime_started", []) - captured.setdefault("loaded_dataframes", []) - captured.setdefault("loaded_json", []) - captured.setdefault("loaded_policyengine_tables", []) - captured.setdefault("written_policyengine_tables", []) - captured.setdefault("pipeline_manifest_payloads", []) - captured.setdefault("resolved_source_queries", []) - - class FakeResumePipeline: - def __init__(self, config=None, *, stage_runtime_writer=None): - 
self.config = ( - config - or default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - ) - ) - self.stage_runtime_writer = stage_runtime_writer - if stage_runtime_writer is not None: - captured["pipeline_manifest_payloads"].append( - dict(stage_runtime_writer.manifest_payload) - ) - - def _resolve_source_query(self, provider, queries): - captured["resolved_source_queries"].append(provider.descriptor.name) - return dict(queries or {}).get(provider.descriptor.name) - - def build_from_source_providers(self, providers, *, queries=None): - captured["pipeline_calls"].append( - ( - "build_from_source_providers", - tuple(provider.descriptor.name for provider in providers), - ) - ) - return _fake_resume_build_result(self.config) - - def build_from_frames( - self, - frames, - *, - resume_from_stage=None, - restored_scaffold_seed_data=None, - ): - captured["pipeline_calls"].append( - ( - "build_from_frames", - resume_from_stage, - restored_scaffold_seed_data is not None, - tuple(frame.source.name for frame in frames), - ) - ) - return _fake_resume_build_result( - self.config, - scaffold_seed_data=restored_scaffold_seed_data, - ) - - def _runtime_start_stage(self, stage_id): - captured["runtime_started"].append(stage_id) - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.start_stage(stage_id) - - def _runtime_fail_stage(self, stage_id, exc): - if self.stage_runtime_writer is not None: - self.stage_runtime_writer.fail_stage(stage_id, exc) - - def build_policyengine_entity_tables(self, synthetic_data): - captured["pipeline_calls"].append( - ("build_policyengine_entity_tables", len(synthetic_data)) - ) - return _fake_policyengine_entity_tables() - - def _check_policyengine_export_column_contract(self, tables, *, stage): - captured["pipeline_calls"].append( - ("check_policyengine_export_column_contract", stage) - ) - - def 
_has_policyengine_calibration_targets(self): - return False - - def calibrate(self, synthetic_data, targets): - captured["pipeline_calls"].append(("calibrate", len(synthetic_data))) - return _fake_resume_dataframe(), {"converged": True} - - def calibrate_policyengine_tables(self, synthetic_tables): - captured["pipeline_calls"].append(("calibrate_policyengine_tables", None)) - return ( - _fake_policyengine_entity_tables(), - _fake_resume_dataframe(), - {"converged": True}, - ) - - def fake_load_resume_dataframe_artifact( - _artifact_root, - _manifest, - artifact_key, - *, - stage_id, - ): - captured["loaded_dataframes"].append((stage_id, artifact_key)) - return _fake_resume_dataframe() - - def fake_load_resume_json_artifact( - _artifact_root, - _manifest, - artifact_key, - *, - stage_id, - ): - captured["loaded_json"].append((stage_id, artifact_key)) - return {"converged": True} - - def fake_load_resume_policyengine_tables( - _artifact_root, - _manifest, - artifact_key, - *, - stage_id, - expected_stage, - ): - captured["loaded_policyengine_tables"].append( - (stage_id, artifact_key, expected_stage) - ) - return _fake_policyengine_entity_tables() - - def fake_load_resume_targets(*_args, **_kwargs): - return _fake_resume_targets() - - def fake_read_parquet(path, *_args, **_kwargs): - captured.setdefault("read_parquet_paths", []).append(Path(path)) - return _fake_resume_dataframe() - - def fake_write_policyengine_tables( - _tables, - artifact_root_arg, - *, - stage_id, - artifact_key, - checkpoint_stage, - ): - captured["written_policyengine_tables"].append( - (stage_id, artifact_key, checkpoint_stage) - ) - path = resolve_us_stage_artifact_contract_path( - Path(artifact_root_arg), - stage_id, - artifact_key, - ) - path.parent.mkdir(parents=True, exist_ok=True) - metadata = { - "format_version": 1, - "stage": checkpoint_stage, - } - for table_name in ( - "households", - "persons", - "tax_units", - "spm_units", - "families", - "marital_units", - ): - 
metadata[table_name] = {"rows": 1, "columns": [f"{table_name}_id"]} - (path.parent / f"{table_name}.parquet").write_text("placeholder") - path.write_text(json.dumps(metadata)) - - def fake_finalize(build_result, **kwargs): - captured["finalized"].append(kwargs["version_id"]) - return _fake_versioned_artifacts(artifact_root, build_result) - - def fake_attach(artifact_dir, **kwargs): - captured["attach_build_result"] = kwargs["build_result"] - return _fake_evidence_result(Path(artifact_dir)) - - def fake_load_artifacts(*, build_result, artifact_root, frontier_metric): - captured["loaded_build_result"] = build_result - return _fake_versioned_artifacts(artifact_root, build_result) - - monkeypatch.setattr(checkpoint_resume, "USMicroplexPipeline", FakeResumePipeline) - monkeypatch.setattr( - checkpoint_resume, - "_load_resume_dataframe_artifact", - fake_load_resume_dataframe_artifact, - ) - monkeypatch.setattr( - checkpoint_resume, - "_load_resume_json_artifact", - fake_load_resume_json_artifact, - ) - monkeypatch.setattr( - checkpoint_resume, - "_load_resume_policyengine_tables", - fake_load_resume_policyengine_tables, - ) - monkeypatch.setattr( - checkpoint_resume, - "_load_resume_targets", - fake_load_resume_targets, - ) - monkeypatch.setattr(checkpoint_resume.pd, "read_parquet", fake_read_parquet) - monkeypatch.setattr( - checkpoint_resume, - "write_us_policyengine_entity_stage_artifact", - fake_write_policyengine_tables, - ) - monkeypatch.setattr( - checkpoint_resume, - "_finalize_versioned_build_artifacts", - fake_finalize, - ) - monkeypatch.setattr( - checkpoint_resume, - "attach_policyengine_us_data_rebuild_checkpoint_evidence", - fake_attach, - ) - monkeypatch.setattr( - checkpoint_resume, - "_load_checkpoint_versioned_artifacts", - fake_load_artifacts, - ) - - -@pytest.mark.parametrize("resume_from_stage", US_CANONICAL_STAGE_IDS) -def test_run_policyengine_us_data_rebuild_checkpoint_can_resume_from_each_stage( - monkeypatch, - tmp_path, - resume_from_stage, -) 
-> None: - artifact_root = _write_complete_resume_artifact_root( - tmp_path / "artifacts" / "run-1" - ) - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - captured: dict[str, Any] = {"finalized": []} - _install_resume_stage_test_doubles(monkeypatch, artifact_root, captured) - - result = run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={}, - version_id="run-1", - resume_from_stage=resume_from_stage, - defer_policyengine_harness=True, - defer_policyengine_native_score=True, - defer_native_audit=True, - defer_imputation_ablation=True, - ) - - calls = captured["pipeline_calls"] - if resume_from_stage in {"01_run_profile", "02_source_loading"}: - assert ("build_from_source_providers", ("fake_source",)) in calls - elif resume_from_stage in { - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - }: - assert captured["resolved_source_queries"] == ["fake_source"] - assert ( - "build_from_frames", - resume_from_stage, - resume_from_stage == "05_donor_integration_synthesis", - ("fake_source",), - ) in calls - if resume_from_stage == "05_donor_integration_synthesis": - assert ( - "04_seed_scaffold", - "scaffold_seed_data", - ) in captured["loaded_dataframes"] - elif resume_from_stage == "06_policyengine_entities": - assert "06_policyengine_entities" in captured["runtime_started"] - assert "07_calibration" in captured["runtime_started"] - assert ( - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - "post_microsim", - ) in captured["written_policyengine_tables"] - assert ( - "07_calibration", - "policyengine_entity_tables", - "post_calibration", - ) in captured["written_policyengine_tables"] - elif resume_from_stage == "07_calibration": - assert "06_policyengine_entities" not in captured["runtime_started"] - assert 
"07_calibration" in captured["runtime_started"] - assert ( - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - "post_microsim", - ) in captured["loaded_policyengine_tables"] - assert ( - "07_calibration", - "policyengine_entity_tables", - "post_calibration", - ) in captured["written_policyengine_tables"] - else: - assert captured["runtime_started"] == [] - assert ( - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - "post_microsim", - ) in captured["loaded_policyengine_tables"] - assert ( - "07_calibration", - "policyengine_entity_tables", - "post_calibration", - ) in captured["loaded_policyengine_tables"] - assert ( - "07_calibration", - "calibration_summary", - ) in captured["loaded_json"] - - if resume_from_stage == "09_validation_benchmarking": - assert captured["finalized"] == [] - else: - assert captured["finalized"] == ["run-1"] - assert captured["attach_build_result"] is result.artifacts.build_result - assert captured["loaded_build_result"] is result.artifacts.build_result - - -def test_artifact_backed_resume_preflights_before_default_provider_setup( - monkeypatch, - tmp_path, -) -> None: - artifact_root = tmp_path / "artifacts" / "run-1" - manifest_dir = artifact_root / "stage_artifacts" / "manifests" - manifest_dir.mkdir(parents=True) - (artifact_root / "manifest.json").write_text( - json.dumps( - { - "artifacts": { - "synthetic_data": "synthetic_data.parquet", - } - } - ) - ) - (manifest_dir / "05_donor_integration_synthesis.json").write_text( - json.dumps( - { - "contractVersion": US_STAGE_CONTRACT_VERSION, - "stageId": "05_donor_integration_synthesis", - "complete": True, - "lifecycleStatus": "complete", - "requiredOutputs": ["seed_data", "synthetic_data"], - "outputs": { - "synthetic_data": { - "path": "synthetic_data.parquet", - "exists": False, - }, - }, - } - ) - ) - - def fail_provider_setup(**_kwargs): - raise AssertionError("default provider setup should not run before preflight") - - 
monkeypatch.setattr( - checkpoint_runner, - "default_policyengine_us_data_rebuild_source_providers", - fail_provider_setup, - ) - - with pytest.raises(ValueError, match="US pipeline resume preflight failed"): - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - version_id="run-1", - resume_from_stage="06_policyengine_entities", - defer_policyengine_harness=True, - defer_policyengine_native_score=True, - defer_native_audit=True, - defer_imputation_ablation=True, - ) - - -def test_stage_resume_preflight_allows_stage1_without_manifest(tmp_path) -> None: - preflight = preflight_us_stage_resume(tmp_path, "01_run_profile") - - assert preflight.ok - - -@pytest.mark.parametrize("use_version_id", [True, False]) -def test_run_policyengine_us_data_rebuild_checkpoint_stage1_resume_allows_missing_manifest( - monkeypatch, - tmp_path, - use_version_id, -) -> None: - artifact_root = tmp_path / "artifacts" / "run-1" - artifact_root.mkdir(parents=True) - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - captured: dict[str, Any] = {} - _install_resume_stage_test_doubles(monkeypatch, artifact_root, captured) - - output_root = tmp_path / "artifacts" if use_version_id else artifact_root - version_id = "run-1" if use_version_id else None - result = run_policyengine_us_data_rebuild_checkpoint( - output_root=output_root, - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={}, - version_id=version_id, - resume_from_stage="01_run_profile", - defer_policyengine_harness=True, - defer_policyengine_native_score=True, - defer_native_audit=True, - defer_imputation_ablation=True, - ) - - manifest_path = artifact_root / "manifest.json" - stage_manifest_path = ( - artifact_root / "stage_artifacts" / "manifests" / "01_run_profile.json" - ) - 
assert manifest_path.exists() - assert stage_manifest_path.exists() - assert captured["pipeline_manifest_payloads"][0] == {} - assert ("build_from_source_providers", ("fake_source",)) in captured[ - "pipeline_calls" - ] - manifest = json.loads(manifest_path.read_text()) - stage_manifest = json.loads(stage_manifest_path.read_text()) - assert manifest["stage_output_manifests"]["01_run_profile"] == ( - "stage_artifacts/manifests/01_run_profile.json" - ) - assert stage_manifest["stageId"] == "01_run_profile" - assert stage_manifest["complete"] is True - assert captured["attach_build_result"] is result.artifacts.build_result - assert captured["loaded_build_result"] is result.artifacts.build_result - - -def test_stage_resume_preflight_reports_missing_policyengine_bundle_member( - tmp_path, -) -> None: - artifact_root = _write_complete_resume_artifact_root(tmp_path / "run-1") - missing_member = ( - artifact_root - / "stage_artifacts" - / "06_policyengine_entities" - / "persons.parquet" - ) - missing_member.unlink() - - preflight = preflight_us_stage_resume( - artifact_root, - "07_calibration", - ) - - assert not preflight.ok - missing = {item.label: item for item in preflight.missing} - assert ( - "06_policyengine_entities.pre_calibration_policyengine_entity_tables" in missing - ) - assert ( - missing[ - "06_policyengine_entities.pre_calibration_policyengine_entity_tables" - ].path - == missing_member - ) - - -def test_run_policyengine_us_data_rebuild_checkpoint_builds_bundle_and_parity( - monkeypatch, - tmp_path, - caplog, -) -> None: - monkeypatch.chdir(tmp_path) - monkeypatch.setattr( - "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_config._infer_policyengine_baseline_household_weight_sum", - lambda dataset, *, target_period: 150_000_000.0, - ) - artifact_dir = tmp_path / "artifacts" / "run-1" - artifact_dir.mkdir(parents=True) - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - query = SourceQuery(provider_filters={"sample_n": 5}) - 
captured: dict[str, Any] = {} - - def fake_build_and_save_versioned_us_microplex_from_source_providers( - *, - providers, - output_root, - config, - queries, - version_id, - frontier_metric, - policyengine_comparison_cache, - policyengine_target_provider, - policyengine_baseline_dataset, - policyengine_harness_slices, - policyengine_harness_metadata, - policyengine_us_data_repo, - defer_policyengine_harness, - require_policyengine_native_score, - defer_policyengine_native_score, - precomputed_policyengine_harness_payload, - precomputed_policyengine_native_scores, - run_registry_path, - run_index_path, - run_registry_metadata, - enable_child_tax_unit_agi_drift, - allow_stage_input_overrides, - stage_input_overrides, - ): - captured.update( - { - "providers": providers, - "output_root": output_root, - "config": config, - "queries": queries, - "version_id": version_id, - "frontier_metric": frontier_metric, - "policyengine_baseline_dataset": policyengine_baseline_dataset, - "policyengine_harness_metadata": policyengine_harness_metadata, - "run_registry_metadata": run_registry_metadata, - "defer_policyengine_harness": defer_policyengine_harness, - "defer_policyengine_native_score": defer_policyengine_native_score, - "enable_child_tax_unit_agi_drift": enable_child_tax_unit_agi_drift, - "allow_stage_input_overrides": allow_stage_input_overrides, - "stage_input_overrides": stage_input_overrides, - } - ) - manifest = { - "created_at": "2026-04-06T00:00:00+00:00", - "config": config.to_dict(), - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 20.0}, - "targets": {"n_marginal_groups": 1, "n_continuous": 0}, - "synthesis": { - "scaffold_source": "fake_source", - "source_names": ["fake_source"], - "backend": "seed", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": { - "converged": True, - "n_loaded_targets": 1, 
- "n_supported_targets": 1, - "full_oracle_capped_mean_abs_relative_error": 0.12, - "full_oracle_mean_abs_relative_error": 0.12, - }, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - (artifact_dir / "policyengine_us.h5").write_text("dataset") - return USMicroplexVersionedBuildArtifacts( - build_result=SimpleNamespace(config=config), - artifact_paths=USMicroplexArtifactPaths( - output_dir=artifact_dir, - version_id="run-1", - seed_data=artifact_dir / "seed_data.parquet", - synthetic_data=artifact_dir / "synthetic_data.parquet", - calibrated_data=artifact_dir / "calibrated_data.parquet", - targets=artifact_dir / "targets.json", - manifest=artifact_dir / "manifest.json", - ), - ) - - def fake_write_policyengine_us_data_rebuild_parity_artifact( - artifact_dir_arg, - output_path=None, - *, - program=None, - manifest_payload=None, - harness_payload=None, - native_scores_payload=None, - ) -> Path: - assert manifest_payload is None - assert harness_payload is None - assert native_scores_payload is None - path = ( - Path(output_path) - if output_path is not None - else Path(artifact_dir_arg) / "pe_us_data_rebuild_parity.json" - ) - path.write_text( - json.dumps( - { - "program": {"programId": program.program_id}, - "verdict": {"hasRealPolicyEngineComparison": False}, - } - ) - ) - return path - - def fake_build_policyengine_us_data_rebuild_parity_artifact( - artifact_dir_arg, - *, - program=None, - manifest_payload=None, - harness_payload=None, - native_scores_payload=None, - ) -> dict[str, Any]: - assert manifest_payload is None - assert harness_payload is None - assert native_scores_payload is None - return { - "artifactId": Path(artifact_dir_arg).name, - "program": {"programId": program.program_id}, - "verdict": 
{"hasRealPolicyEngineComparison": False}, - } - - module_name = "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_runner" - monkeypatch.setattr( - f"{module_name}.build_and_save_versioned_us_microplex_from_source_providers", - fake_build_and_save_versioned_us_microplex_from_source_providers, - ) - - def fake_attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_dir_arg, - **kwargs, - ): - captured["attach_kwargs"] = kwargs - artifact_root = Path(artifact_dir_arg) - registry_path = tmp_path / "artifacts" / "run_registry.jsonl" - run_index_path = tmp_path / "artifacts" / "run_index.duckdb" - manifest_path = artifact_root / "manifest.json" - manifest = json.loads(manifest_path.read_text()) - manifest["artifacts"]["policyengine_harness"] = "policyengine_harness.json" - (artifact_root / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - } - } - ) - ) - manifest["policyengine_harness"] = { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - } - registry_path.write_text( - json.dumps( - { - "created_at": "2026-04-06T00:00:00+00:00", - "artifact_id": "run-1", - "artifact_dir": str(artifact_root.resolve()), - "manifest_path": str(manifest_path.resolve()), - "policyengine_harness_path": str( - (artifact_root / "policyengine_harness.json").resolve() - ), - "full_oracle_capped_mean_abs_relative_error": 0.12, - "full_oracle_mean_abs_relative_error": 0.12, - "enhanced_cps_native_loss_delta": 0.5, - } - ) - + "\n" - ) - run_index_path.write_text("") - manifest["run_registry"] = { - "path": "artifacts/run_registry.jsonl", - "artifact_id": "run-1", - } - manifest["run_index"] = { - "path": "artifacts/run_index.duckdb", - "artifact_id": "run-1", - } - if kwargs.get("precomputed_imputation_ablation_payload") is not None: - 
manifest["artifacts"]["imputation_ablation"] = "imputation_ablation.json" - manifest["imputation_ablation"] = dict( - kwargs["precomputed_imputation_ablation_payload"].get("summary", {}) - ) - (artifact_root / "imputation_ablation.json").write_text( - json.dumps(kwargs["precomputed_imputation_ablation_payload"]) - ) - manifest["artifacts"]["policyengine_native_audit"] = ( - "pe_us_data_rebuild_native_audit.json" - ) - manifest["policyengine_native_audit"] = { - "largestRegressingFamily": None, - } - (artifact_root / "pe_us_data_rebuild_native_audit.json").write_text( - json.dumps({"verdictHints": {"largestRegressingFamily": None}}) - ) - manifest_path.write_text(json.dumps(manifest)) - return SimpleNamespace( - artifact_dir=artifact_root, - manifest_path=manifest_path, - harness_path=artifact_root / "policyengine_harness.json", - native_scores_path=None, - parity_path=fake_write_policyengine_us_data_rebuild_parity_artifact( - artifact_dir_arg, - program=kwargs.get("program"), - ), - parity_payload=fake_build_policyengine_us_data_rebuild_parity_artifact( - artifact_dir_arg, - program=kwargs.get("program"), - ), - native_audit_path=artifact_root / "pe_us_data_rebuild_native_audit.json", - native_audit_payload={"verdictHints": {"largestRegressingFamily": None}}, - imputation_ablation_path=( - artifact_root / "imputation_ablation.json" - if kwargs.get("precomputed_imputation_ablation_payload") is not None - else None - ), - imputation_ablation_payload=kwargs.get( - "precomputed_imputation_ablation_payload" - ), - ) - - monkeypatch.setattr( - f"{module_name}.attach_policyengine_us_data_rebuild_checkpoint_evidence", - fake_attach_policyengine_us_data_rebuild_checkpoint_evidence, - ) - - caplog.set_level( - logging.INFO, - logger="microplex_us.pipelines.pe_us_data_rebuild_checkpoint", - ) - result = run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - 
policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={"fake_source": query}, - version_id="run-1", - ) - - assert result.provider_names == ("fake_source",) - assert result.queries == {"fake_source": query} - assert result.parity_path == artifact_dir / "pe_us_data_rebuild_parity.json" - assert result.parity_payload["program"]["programId"] == "pe-us-data-rebuild-v1" - assert captured["providers"] == [provider] - assert captured["queries"] == {"fake_source": query} - assert captured["version_id"] == "run-1" - assert captured["frontier_metric"] == "full_oracle_capped_mean_abs_relative_error" - assert captured["policyengine_baseline_dataset"] == "/tmp/enhanced_cps_2024.h5" - assert captured["config"].policyengine_targets_db == "/tmp/policy_data.db" - assert ( - captured["config"].policyengine_calibration_target_total_weight == 150_000_000.0 - ) - assert ( - captured["config"].policyengine_calibration_rescale_to_target_total_weight - is True - ) - assert ( - captured["config"].policyengine_selection_target_total_weight == 150_000_000.0 - ) - assert captured["defer_policyengine_harness"] is True - assert captured["defer_policyengine_native_score"] is True - assert captured["enable_child_tax_unit_agi_drift"] is True - assert captured["allow_stage_input_overrides"] is False - assert captured["stage_input_overrides"] == () - assert captured["policyengine_harness_metadata"]["rebuild_checkpoint"] is True - assert captured["policyengine_harness_metadata"]["rebuild_program_id"] == ( - "pe-us-data-rebuild-v1" - ) - assert captured["policyengine_harness_metadata"]["rebuild_provider_names"] == [ - "fake_source" - ] - assert captured["run_registry_metadata"]["rebuild_profile_expected"] is True - assert captured["attach_kwargs"]["build_result"].config == captured["config"] - assert captured["attach_kwargs"]["compute_native_audit"] is True - assert captured["attach_kwargs"]["compute_imputation_ablation"] is True - assert 
captured["attach_kwargs"]["precomputed_imputation_ablation_payload"] is None - assert ( - result.artifacts.artifact_paths.policyengine_harness - == artifact_dir / "policyengine_harness.json" - ) - assert ( - result.artifacts.artifact_paths.run_registry - == tmp_path / "artifacts" / "run_registry.jsonl" - ) - assert ( - result.artifacts.artifact_paths.run_index_db - == tmp_path / "artifacts" / "run_index.duckdb" - ) - assert ( - result.artifacts.artifact_paths.policyengine_native_audit - == artifact_dir / "pe_us_data_rebuild_native_audit.json" - ) - assert result.artifacts.current_entry is not None - assert result.artifacts.current_entry.artifact_id == "run-1" - assert result.artifacts.frontier_entry is not None - assert result.artifacts.frontier_entry.artifact_id == "run-1" - assert result.artifacts.frontier_delta == 0.0 - assert ( - result.native_audit_path - == artifact_dir / "pe_us_data_rebuild_native_audit.json" - ) - assert result.native_audit_payload == { - "verdictHints": {"largestRegressingFamily": None} - } - assert result.imputation_ablation_path is None - assert result.imputation_ablation_payload is None - log_messages = [record.getMessage() for record in caplog.records] - assert any( - "PE-US-data rebuild checkpoint: starting build" in message - and "version_id=run-1" in message - and "providers=fake_source" in message - for message in log_messages - ) - assert any( - "PE-US-data rebuild checkpoint: build complete" in message - and str(artifact_dir) in message - for message in log_messages - ) - assert any( - "PE-US-data rebuild checkpoint: attaching PE evidence" in message - and "compute_native_audit=True" in message - for message in log_messages - ) - assert any( - "PE-US-data rebuild checkpoint: evidence complete" in message - and "pe_us_data_rebuild_parity.json" in message - for message in log_messages - ) - assert any( - "PE-US-data rebuild checkpoint: checkpoint ready" in message - and str(artifact_dir) in message - for message in log_messages - ) 
- - -def test_emit_checkpoint_progress_falls_back_to_stderr_when_no_logger_handlers( - monkeypatch, - capsys, -) -> None: - emitted: list[str] = [] - - class _FakeLogger: - handlers: list[object] = [] - - def info(self, message: str) -> None: - emitted.append(message) - - monkeypatch.setattr(checkpoint_common, "LOGGER", _FakeLogger()) - monkeypatch.setattr(checkpoint_common, "_root_logger_has_handlers", lambda: False) - - checkpoint_common._emit_checkpoint_progress( - "PE-US-data rebuild checkpoint: starting build", - version_id="run-1", - providers="fake_source", - ) - - stderr = capsys.readouterr().err - assert emitted == [ - "PE-US-data rebuild checkpoint: starting build " - "[version_id=run-1, providers=fake_source]" - ] - assert ( - stderr == "PE-US-data rebuild checkpoint: starting build " - "[version_id=run-1, providers=fake_source]\n" - ) - - -def test_main_passes_donor_condition_selection_override(monkeypatch, capsys) -> None: - captured: dict[str, Any] = {} - artifact_dir = Path("/tmp/artifacts/run-1") - parity_path = artifact_dir / "pe_us_data_rebuild_parity.json" - - def fake_run_policyengine_us_data_rebuild_checkpoint(**kwargs): - captured.update(kwargs) - return SimpleNamespace( - artifacts=SimpleNamespace( - artifact_paths=SimpleNamespace(output_dir=artifact_dir) - ), - parity_path=parity_path, - parity_payload={ - "verdict": {"hasRealPolicyEngineComparison": True}, - }, - ) - - monkeypatch.setattr( - checkpoint_cli, - "run_policyengine_us_data_rebuild_checkpoint", - fake_run_policyengine_us_data_rebuild_checkpoint, - ) - - checkpoint_cli.main( - [ - "--output-root", - "/tmp/artifacts", - "--baseline-dataset", - "/tmp/enhanced_cps_2024.h5", - "--targets-db", - "/tmp/policy_data.db", - "--version-id", - "run-1", - "--donor-imputer-condition-selection", - "pe_plus_puf_native_challenger", - "--defer-native-audit", - "--defer-imputation-ablation", - ] - ) - - assert captured["config_overrides"]["donor_imputer_condition_selection"] == ( - 
"pe_plus_puf_native_challenger" - ) - assert captured["config_overrides"]["n_synthetic"] == 100_000 - assert captured["config_overrides"]["random_seed"] == 42 - assert captured["defer_native_audit"] is True - assert captured["defer_imputation_ablation"] is True - stdout = capsys.readouterr().out - assert "/tmp/artifacts/run-1" in stdout - assert "hasRealPolicyEngineComparison" in stdout - - -def test_main_passes_arch_calibration_target_source(monkeypatch, capsys) -> None: - captured: dict[str, Any] = {} - artifact_dir = Path("/tmp/artifacts/run-1") - parity_path = artifact_dir / "pe_us_data_rebuild_parity.json" - - def fake_run_policyengine_us_data_rebuild_checkpoint(**kwargs): - captured.update(kwargs) - return SimpleNamespace( - artifacts=SimpleNamespace( - artifact_paths=SimpleNamespace(output_dir=artifact_dir) - ), - parity_path=parity_path, - parity_payload={ - "verdict": {"hasRealPolicyEngineComparison": True}, - }, - ) - - monkeypatch.setattr( - checkpoint_cli, - "run_policyengine_us_data_rebuild_checkpoint", - fake_run_policyengine_us_data_rebuild_checkpoint, - ) - - checkpoint_cli.main( - [ - "--output-root", - "/tmp/artifacts", - "--baseline-dataset", - "/tmp/enhanced_cps_2024.h5", - "--targets-db", - "/tmp/policy_data.db", - "--version-id", - "run-1", - "--calibration-target-source", - "arch", - "--arch-targets-db", - "/tmp/arch/fixtures/consumer_facts.jsonl", - "--arch-targets-db", - "/tmp/arch/macro/targets.db", - "--defer-native-audit", - "--defer-imputation-ablation", - ] - ) - - assert captured["target_profile"] == "pe_native_broad" - assert captured["calibration_target_profile"] is None - assert captured["calibration_target_source"] == "arch" - assert captured["arch_targets_db"] == ( - "/tmp/arch/fixtures/consumer_facts.jsonl", - "/tmp/arch/macro/targets.db", - ) - stdout = capsys.readouterr().out - assert "/tmp/artifacts/run-1" in stdout - - -def test_main_passes_resume_from_stage(monkeypatch, capsys) -> None: - captured: dict[str, Any] = {} - 
artifact_dir = Path("/tmp/artifacts/run-1") - parity_path = artifact_dir / "pe_us_data_rebuild_parity.json" - - def fake_run_policyengine_us_data_rebuild_checkpoint(**kwargs): - captured.update(kwargs) - return SimpleNamespace( - artifacts=SimpleNamespace( - artifact_paths=SimpleNamespace(output_dir=artifact_dir) - ), - parity_path=parity_path, - parity_payload={ - "verdict": {"hasRealPolicyEngineComparison": True}, - }, - ) - - monkeypatch.setattr( - checkpoint_cli, - "run_policyengine_us_data_rebuild_checkpoint", - fake_run_policyengine_us_data_rebuild_checkpoint, - ) - - checkpoint_cli.main( - [ - "--output-root", - "/tmp/artifacts", - "--baseline-dataset", - "/tmp/enhanced_cps_2024.h5", - "--targets-db", - "/tmp/policy_data.db", - "--version-id", - "run-1", - "--resume-from-stage", - "07_calibration", - "--defer-native-audit", - "--defer-imputation-ablation", - ] - ) - - assert captured["resume_from_stage"] == "07_calibration" - stdout = capsys.readouterr().out - assert "/tmp/artifacts/run-1" in stdout - - -def test_run_resume_preflight_reports_missing_required_artifacts(tmp_path) -> None: - artifact_root = tmp_path / "artifacts" / "run-1" - manifest_dir = artifact_root / "stage_artifacts" / "manifests" - manifest_dir.mkdir(parents=True) - (artifact_root / "manifest.json").write_text( - json.dumps( - { - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - } - } - ) - ) - (manifest_dir / "05_donor_integration_synthesis.json").write_text( - json.dumps( - { - "contractVersion": "us-runtime-stages-v2", - "stageId": "05_donor_integration_synthesis", - "complete": True, - "lifecycleStatus": "complete", - "requiredOutputs": ["seed_data", "synthetic_data"], - "outputs": { - "seed_data": { - "path": "seed_data.parquet", - "exists": False, - }, - "synthetic_data": { - "path": "synthetic_data.parquet", - "exists": False, - }, - }, - } - ) - ) - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - - with 
pytest.raises(ValueError) as exc_info: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={}, - version_id="run-1", - resume_from_stage="06_policyengine_entities", - defer_policyengine_harness=True, - defer_policyengine_native_score=True, - defer_native_audit=True, - defer_imputation_ablation=True, - ) - - message = str(exc_info.value) - assert "US pipeline resume preflight failed for 06_policyengine_entities" in message - assert "05_donor_integration_synthesis.seed_data" in message - assert "05_donor_integration_synthesis.synthetic_data" in message - - -def test_run_policyengine_us_data_rebuild_checkpoint_rejects_empty_provider_sequence( - tmp_path, -) -> None: - try: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - providers=(), - ) - except ValueError as exc: - assert "non-empty provider sequence" in str(exc) - else: - raise AssertionError("Expected empty providers to fail closed") - - -def test_run_policyengine_us_data_rebuild_checkpoint_rejects_unknown_query_keys( - tmp_path, -) -> None: - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - try: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={"typo_source": SourceQuery(provider_filters={"sample_n": 5})}, - ) - except ValueError as exc: - assert "unknown provider keys" in str(exc) - assert "fake_source" in str(exc) - else: - raise AssertionError("Expected unknown query keys to fail") - - -def test_run_policyengine_us_data_rebuild_checkpoint_rejects_mismatched_explicit_config( - tmp_path, 
-) -> None: - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ) - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - - try: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/other_baseline.h5", - policyengine_targets_db="/tmp/policy_data.db", - config=config, - providers=(provider,), - queries={"fake_source": SourceQuery(provider_filters={"sample_n": 5})}, - ) - except ValueError as exc: - assert "does not match the requested PE rebuild context" in str(exc) - assert "policyengine_baseline_dataset" in str(exc) - else: - raise AssertionError("Expected mismatched explicit config to fail") - - -def test_run_policyengine_us_data_rebuild_checkpoint_accepts_matching_explicit_config_default_calibration_scope( - tmp_path, -) -> None: - config = default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ) - - try: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - config=config, - providers=(), - ) - except ValueError as exc: - assert "non-empty provider sequence" in str(exc) - assert "requested PE rebuild context" not in str(exc) - else: - raise AssertionError("Expected empty providers to fail after validation") - - -def test_run_policyengine_us_data_rebuild_checkpoint_rejects_custom_python_without_native_defer( - tmp_path, -) -> None: - provider = _FakeProvider(descriptor=SimpleNamespace(name="fake_source")) - try: - run_policyengine_us_data_rebuild_checkpoint( - output_root=tmp_path / "artifacts", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", 
- policyengine_targets_db="/tmp/policy_data.db", - providers=(provider,), - queries={"fake_source": SourceQuery(provider_filters={"sample_n": 5})}, - policyengine_us_data_python="/tmp/venv/bin/python", - ) - except ValueError as exc: - assert "defer_policyengine_native_score=True" in str(exc) - else: - raise AssertionError("Expected unsupported custom PE Python path to fail") - - -def test_attach_policyengine_us_data_rebuild_checkpoint_evidence_updates_manifest( - monkeypatch, - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - manifest = { - "created_at": "2026-04-06T00:00:00+00:00", - "config": default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ).to_dict(), - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 20.0}, - "targets": {"n_marginal_groups": 1, "n_continuous": 0}, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "source_names": ["cps_asec_2023", "irs_soi_puf"], - "backend": "seed", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": { - "converged": True, - "n_loaded_targets": 1, - "n_supported_targets": 1, - "full_oracle_capped_mean_abs_relative_error": 0.12, - "full_oracle_mean_abs_relative_error": 0.12, - }, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - (artifact_dir / "data_flow_snapshot.json").write_text( - json.dumps( - { - "schemaVersion": 1, - "stages": [ - { - "id": "benchmark", - "status": "missing", - "metrics": [], - "outputs": [], - } - ], - } - ) - 
) - for name in ( - "seed_data.parquet", - "synthetic_data.parquet", - "calibrated_data.parquet", - "targets.json", - "policyengine_us.h5", - ): - (artifact_dir / name).write_text("{}") - - harness_payload = { - "candidate_label": "microplex", - "baseline_label": "policyengine_us_data", - "period": 2024, - "metadata": {"slice_profile": "pe_native_broad"}, - "summary": { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - "candidate_composite_parity_loss": 0.14, - "baseline_composite_parity_loss": 0.15, - "composite_parity_loss_delta": -0.01, - "slice_win_rate": 0.55, - "target_win_rate": 0.58, - "supported_target_rate": 0.98, - "baseline_supported_target_rate": 0.99, - "tag_summaries": {}, - "parity_scorecard": {}, - "attribute_cell_summaries": {}, - }, - } - native_scores_payload = { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_enhanced_cps_native_loss": 0.30, - "baseline_enhanced_cps_native_loss": 0.20, - "enhanced_cps_native_loss_delta": 0.10, - "candidate_beats_baseline": False, - }, - } - imputation_ablation_payload = { - "schema_version": 1, - "artifact_id": "artifact", - "production_variant": "structured_pe_conditioning", - "summary": { - "source_count": 1, - "skipped_source_count": 0, - "target_count": 3, - "production_variant": "structured_pe_conditioning", - "production_mean_weighted_mae": 0.21, - "production_mean_support_f1": 0.88, - "best_mean_weighted_mae_variant": "structured_pe_conditioning", - "best_mean_support_f1_variant": "structured_pe_conditioning", - "variant_scorecard": { - "structured_pe_conditioning": { - "source_count": 1, - "mean_weighted_mae": 0.21, - "mean_support_f1": 0.88, - } - }, - }, - "source_reports": {}, - "skipped_sources": [], - } - - module_name = "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence" - monkeypatch.setattr( - f"{module_name}.write_policyengine_us_data_rebuild_parity_artifact", - 
lambda artifact_dir_arg, **kwargs: ( - Path(artifact_dir_arg) / "pe_us_data_rebuild_parity.json" - ), - ) - monkeypatch.setattr( - f"{module_name}.build_policyengine_us_data_rebuild_parity_artifact", - lambda artifact_dir_arg, **kwargs: { - "artifactId": Path(artifact_dir_arg).name, - "verdict": {"hasRealPolicyEngineComparison": True}, - }, - ) - native_audit_payload = { - "artifactId": "artifact", - "period": 2024, - "targetDelta": { - "metric": "enhanced_cps_native_loss_target_delta", - "period": 2024, - "from_dataset": "/tmp/enhanced_cps_2024.h5", - "to_dataset": "/tmp/policyengine_us.h5", - "summary": {"n_targets": 1, "to_win_rate": 1.0}, - "family_summaries": [{"target_family": "national_irs_other"}], - "scope_summaries": [{"target_scope": "national"}], - "targets": [ - { - "target_name": "nation/irs/example", - "target_family": "national_irs_other", - "target_scope": "national", - "winner": "to", - "weighted_term_delta": -1.0, - "from_weighted_term": 2.0, - "to_weighted_term": 1.0, - "target_value": 100.0, - "from_estimate": 90.0, - "to_estimate": 95.0, - "from_rel_error": 0.2, - "to_rel_error": 0.1, - } - ], - "top_regressions": [], - "top_improvements": [], - }, - "verdictHints": { - "productionImputationVariantIsMaeWinner": True, - "productionImputationVariantIsSupportWinner": True, - }, - } - monkeypatch.setattr( - f"{module_name}.build_policyengine_us_data_rebuild_native_audit", - lambda artifact_dir_arg, **kwargs: native_audit_payload, - ) - - result = attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_dir, - compute_harness=False, - compute_native_scores=False, - precomputed_policyengine_harness_payload=harness_payload, - precomputed_policyengine_native_scores=native_scores_payload, - precomputed_imputation_ablation_payload=imputation_ablation_payload, - run_registry_path=tmp_path / "run_registry.jsonl", - run_index_path=tmp_path, - run_registry_metadata={"checkpoint_test": True}, - ) - - written_manifest = json.loads((artifact_dir / 
"manifest.json").read_text()) - refreshed_snapshot = json.loads( - (artifact_dir / "data_flow_snapshot.json").read_text() - ) - benchmark_stage = next( - stage - for stage in refreshed_snapshot["stages"] - if stage["id"] == "09_validation_benchmarking" - ) - registry_entries = load_us_microplex_run_registry(tmp_path / "run_registry.jsonl") - assert result.harness_path == artifact_dir / "policyengine_harness.json" - assert result.native_scores_path == artifact_dir / "policyengine_native_scores.json" - assert ( - result.native_audit_path - == artifact_dir / "pe_us_data_rebuild_native_audit.json" - ) - assert ( - result.native_target_diagnostics_path - == artifact_dir / "pe_native_target_diagnostics.json" - ) - assert result.native_audit_payload == native_audit_payload - assert result.native_target_diagnostics_payload is not None - assert result.imputation_ablation_path == artifact_dir / "imputation_ablation.json" - written_native_audit = json.loads( - (artifact_dir / "pe_us_data_rebuild_native_audit.json").read_text() - ) - written_target_diagnostics = json.loads( - (artifact_dir / "pe_native_target_diagnostics.json").read_text() - ) - assert written_target_diagnostics["artifact_id"] == "artifact" - assert written_target_diagnostics["run_id"] == "artifact" - assert written_target_diagnostics["targets"][0]["artifact_id"] == "artifact" - assert ( - written_manifest["artifacts"]["policyengine_harness"] - == "policyengine_harness.json" - ) - assert ( - written_manifest["artifacts"]["policyengine_native_scores"] - == "policyengine_native_scores.json" - ) - assert ( - written_manifest["artifacts"]["policyengine_native_audit"] - == "pe_us_data_rebuild_native_audit.json" - ) - assert ( - written_manifest["artifacts"]["policyengine_native_target_diagnostics"] - == "pe_native_target_diagnostics.json" - ) - assert ( - written_manifest["artifacts"]["imputation_ablation"] - == "imputation_ablation.json" - ) - assert ( - 
written_manifest["policyengine_harness"]["mean_abs_relative_error_delta"] - == -0.02 - ) - assert ( - written_manifest["policyengine_native_scores"]["enhanced_cps_native_loss_delta"] - == 0.10 - ) - assert written_manifest["run_registry"]["default_frontier_metric"] == ( - "full_oracle_capped_mean_abs_relative_error" - ) - assert ( - written_manifest["imputation_ablation"]["production_mean_weighted_mae"] == 0.21 - ) - assert ( - written_manifest["policyengine_native_audit"][ - "productionImputationVariantIsMaeWinner" - ] - is True - ) - assert ( - written_native_audit["verdictHints"]["productionImputationVariantIsMaeWinner"] - is True - ) - assert written_target_diagnostics["diagnostic_schema_version"] == 1 - assert written_target_diagnostics["dataset_labels"] == { - "from": "policyengine-us-data", - "to": "microplex-us", - } - first_target = written_target_diagnostics["targets"][0] - assert first_target["target_id"] == "nation/irs/example" - assert first_target["us_data_absolute_error"] == 10.0 - assert first_target["microplex_absolute_error"] == 5.0 - assert first_target["delta_absolute_error"] == -5.0 - assert written_manifest["run_registry"]["artifact_id"] == "artifact" - assert written_manifest["run_index"]["artifact_id"] == "artifact" - assert (tmp_path / "run_index.duckdb").exists() - assert len(registry_entries) == 1 - assert registry_entries[0].artifact_id == "artifact" - assert registry_entries[0].full_oracle_capped_mean_abs_relative_error == 0.12 - assert registry_entries[0].full_oracle_mean_abs_relative_error == 0.12 - assert registry_entries[0].metadata["checkpoint_test"] is True - assert benchmark_stage["status"] == "ready" - assert benchmark_stage["outputs"] == [ - "policyengine_harness.json", - "policyengine_native_scores.json", - "imputation_ablation.json", - "pe_us_data_rebuild_native_audit.json", - "pe_native_target_diagnostics.json", - ] - assert {metric["label"]: metric["value"] for metric in benchmark_stage["metrics"]}[ - "Capped full oracle 
loss" - ] == 0.12 - assert {metric["label"]: metric["value"] for metric in benchmark_stage["metrics"]}[ - "Full oracle loss" - ] == 0.12 - assert {metric["label"]: metric["value"] for metric in benchmark_stage["metrics"]}[ - "Imputation MAE" - ] == 0.21 - assert {metric["label"]: metric["value"] for metric in benchmark_stage["metrics"]}[ - "Imputation F1" - ] == 0.88 - - -def test_attach_policyengine_us_data_rebuild_checkpoint_evidence_registers_calibration_only_runs( - monkeypatch, - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - manifest = { - "created_at": "2026-04-06T00:00:00+00:00", - "config": default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ).to_dict(), - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 20.0}, - "targets": {"n_marginal_groups": 1, "n_continuous": 0}, - "synthesis": {"source_names": ["cps_asec_2023", "irs_soi_puf"]}, - "calibration": { - "converged": True, - "n_loaded_targets": 1, - "n_supported_targets": 1, - "full_oracle_capped_mean_abs_relative_error": 0.12, - "full_oracle_mean_abs_relative_error": 0.12, - }, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - (artifact_dir / "data_flow_snapshot.json").write_text( - json.dumps({"schemaVersion": 1, "stages": []}) - ) - for name in ( - "seed_data.parquet", - "synthetic_data.parquet", - "calibrated_data.parquet", - "targets.json", - "policyengine_us.h5", - ): - (artifact_dir / name).write_text("{}") - - module_name = "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence" - monkeypatch.setattr( - 
f"{module_name}.write_policyengine_us_data_rebuild_parity_artifact", - lambda artifact_dir_arg, **kwargs: ( - Path(artifact_dir_arg) / "pe_us_data_rebuild_parity.json" - ), - ) - monkeypatch.setattr( - f"{module_name}.build_policyengine_us_data_rebuild_parity_artifact", - lambda artifact_dir_arg, **kwargs: { - "artifactId": Path(artifact_dir_arg).name, - "verdict": {"hasRealPolicyEngineComparison": False}, - }, - ) - - attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_dir, - compute_harness=False, - compute_native_scores=False, - compute_native_audit=False, - compute_imputation_ablation=False, - run_registry_path=tmp_path / "run_registry.jsonl", - run_index_path=tmp_path, - ) - - written_manifest = json.loads((artifact_dir / "manifest.json").read_text()) - registry_entries = load_us_microplex_run_registry(tmp_path / "run_registry.jsonl") - - assert written_manifest["run_registry"]["default_frontier_metric"] == ( - "full_oracle_capped_mean_abs_relative_error" - ) - assert registry_entries[0].artifact_id == "artifact" - assert registry_entries[0].full_oracle_capped_mean_abs_relative_error == 0.12 - assert registry_entries[0].full_oracle_mean_abs_relative_error == 0.12 - - -def test_load_checkpoint_versioned_artifacts_hydrates_stage_sidecar_paths( - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - stage_artifacts = artifact_dir / "stage_artifacts" - for path in ( - artifact_dir / "seed_data.parquet", - artifact_dir / "synthetic_data.parquet", - artifact_dir / "calibrated_data.parquet", - artifact_dir / "targets.json", - artifact_dir / "policyengine_us.h5", - artifact_dir / "stage_manifest.json", - artifact_dir / "data_flow_snapshot.json", - stage_artifacts / "03_source_planning" / "source_plan.json", - stage_artifacts / "04_seed_scaffold" / "scaffold_seed_data.parquet", - stage_artifacts / "06_policyengine_entities" / "metadata.json", - stage_artifacts / "07_calibration" / "calibration_summary.json", - 
stage_artifacts - / "07_calibration" - / "policyengine_entity_tables" - / "metadata.json", - stage_artifacts / "09_validation_benchmarking" / "evidence_manifest.json", - stage_artifacts / "artifact_inventory.json", - stage_artifacts / "conditional_readiness.json", - artifact_dir / "policyengine_native_scores.json", - artifact_dir / "source_weight_diagnostics.json", - ): - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("{}") - manifest = { - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - "stage_manifest": "stage_manifest.json", - "data_flow_snapshot": "data_flow_snapshot.json", - "source_plan": "stage_artifacts/03_source_planning/source_plan.json", - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - "pre_calibration_policyengine_entity_tables": ( - "stage_artifacts/06_policyengine_entities/metadata.json" - ), - "policyengine_entity_tables": ( - "stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json" - ), - "calibration_summary": ( - "stage_artifacts/07_calibration/calibration_summary.json" - ), - "validation_evidence": ( - "stage_artifacts/09_validation_benchmarking/evidence_manifest.json" - ), - "artifact_inventory": "stage_artifacts/artifact_inventory.json", - "conditional_readiness": "stage_artifacts/conditional_readiness.json", - "policyengine_native_scores": "policyengine_native_scores.json", - "source_weight_diagnostics": "source_weight_diagnostics.json", - } - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - - loaded = checkpoint_artifacts._load_checkpoint_versioned_artifacts( - build_result=SimpleNamespace(), - artifact_root=artifact_dir, - frontier_metric="full_oracle_mean_abs_relative_error", - ) - paths = loaded.artifact_paths - - assert paths.stage_manifest == artifact_dir / 
"stage_manifest.json" - assert paths.data_flow_snapshot == artifact_dir / "data_flow_snapshot.json" - assert paths.artifact_inventory == stage_artifacts / "artifact_inventory.json" - assert paths.conditional_readiness == stage_artifacts / "conditional_readiness.json" - assert ( - paths.source_plan == stage_artifacts / "03_source_planning" / "source_plan.json" - ) - assert paths.scaffold_seed_data == ( - stage_artifacts / "04_seed_scaffold" / "scaffold_seed_data.parquet" - ) - assert paths.policyengine_entity_tables == ( - stage_artifacts - / "07_calibration" - / "policyengine_entity_tables" - / "metadata.json" - ) - assert paths.calibration_summary == ( - stage_artifacts / "07_calibration" / "calibration_summary.json" - ) - assert paths.validation_evidence == ( - stage_artifacts / "09_validation_benchmarking" / "evidence_manifest.json" - ) - assert paths.policyengine_native_scores == ( - artifact_dir / "policyengine_native_scores.json" - ) - assert paths.source_weight_diagnostics == ( - artifact_dir / "source_weight_diagnostics.json" - ) - - -def test_attach_policyengine_us_data_rebuild_checkpoint_evidence_computes_imputation_ablation_with_build_result( - monkeypatch, - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - manifest = { - "created_at": "2026-04-06T00:00:00+00:00", - "config": default_policyengine_us_data_rebuild_checkpoint_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - target_period=2024, - ).to_dict(), - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 20.0}, - "targets": {"n_marginal_groups": 1, "n_continuous": 0}, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "source_names": ["cps_asec_2023", "irs_soi_puf"], - "backend": "seed", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": {"available": [], "missing": []}, - }, - 
"calibration": { - "converged": True, - "n_loaded_targets": 1, - "n_supported_targets": 1, - "full_oracle_capped_mean_abs_relative_error": 0.12, - "full_oracle_mean_abs_relative_error": 0.12, - }, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - for name in ( - "seed_data.parquet", - "synthetic_data.parquet", - "calibrated_data.parquet", - "targets.json", - "policyengine_us.h5", - ): - (artifact_dir / name).write_text("{}") - - harness_payload = { - "summary": { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - } - } - native_scores_payload = { - "summary": { - "candidate_enhanced_cps_native_loss": 0.30, - "enhanced_cps_native_loss_delta": 0.10, - } - } - imputation_ablation_payload = { - "schema_version": 1, - "artifact_id": "artifact", - "production_variant": "structured_pe_conditioning", - "summary": { - "source_count": 1, - "production_mean_weighted_mae": 0.19, - "production_mean_support_f1": 0.91, - }, - "source_reports": {}, - "skipped_sources": [], - } - captured: dict[str, Any] = {} - build_result = SimpleNamespace( - config=SimpleNamespace(donor_imputer_condition_selection="pe_prespecified") - ) - - module_name = "microplex_us.pipelines.pe_us_data_rebuild_checkpoint_evidence" - monkeypatch.setattr( - f"{module_name}.write_policyengine_us_data_rebuild_parity_artifact", - lambda artifact_dir_arg, **kwargs: ( - Path(artifact_dir_arg) / "pe_us_data_rebuild_parity.json" - ), - ) - monkeypatch.setattr( - f"{module_name}.build_policyengine_us_data_rebuild_parity_artifact", - lambda artifact_dir_arg, **kwargs: { - "artifactId": Path(artifact_dir_arg).name, - "verdict": {"hasRealPolicyEngineComparison": True}, - }, - ) - 
native_audit_payload = { - "artifactId": "artifact", - "verdictHints": { - "productionImputationVariantIsMaeWinner": False, - "productionImputationVariantIsSupportWinner": True, - }, - } - monkeypatch.setattr( - f"{module_name}.build_policyengine_us_data_rebuild_native_audit", - lambda artifact_dir_arg, **kwargs: native_audit_payload, - ) - - def fake_build_checkpoint_imputation_ablation_payload( - build_result_arg, - *, - artifact_id, - manifest, - ): - captured["build_result"] = build_result_arg - captured["artifact_id"] = artifact_id - captured["manifest"] = manifest - return imputation_ablation_payload - - monkeypatch.setattr( - f"{module_name}._build_checkpoint_imputation_ablation_payload", - fake_build_checkpoint_imputation_ablation_payload, - ) - - result = attach_policyengine_us_data_rebuild_checkpoint_evidence( - artifact_dir, - build_result=build_result, - compute_harness=False, - compute_native_scores=False, - compute_imputation_ablation=True, - precomputed_policyengine_harness_payload=harness_payload, - precomputed_policyengine_native_scores=native_scores_payload, - ) - - written_manifest = json.loads((artifact_dir / "manifest.json").read_text()) - assert captured["build_result"] is build_result - assert captured["artifact_id"] == "artifact" - assert ( - captured["manifest"]["policyengine_harness"]["mean_abs_relative_error_delta"] - == -0.02 - ) - assert ( - captured["manifest"]["policyengine_native_scores"][ - "enhanced_cps_native_loss_delta" - ] - == 0.10 - ) - assert result.imputation_ablation_payload == imputation_ablation_payload - assert result.native_audit_payload == native_audit_payload - assert ( - result.native_audit_path - == artifact_dir / "pe_us_data_rebuild_native_audit.json" - ) - assert result.imputation_ablation_path == artifact_dir / "imputation_ablation.json" - assert ( - written_manifest["artifacts"]["policyengine_native_audit"] - == "pe_us_data_rebuild_native_audit.json" - ) - assert ( - 
written_manifest["policyengine_native_audit"][ - "productionImputationVariantIsSupportWinner" - ] - is True - ) - assert ( - written_manifest["artifacts"]["imputation_ablation"] - == "imputation_ablation.json" - ) - - -def test_build_checkpoint_imputation_ablation_payload_returns_none_when_no_donor_reports( - monkeypatch, -) -> None: - from microplex_us.pipelines.pe_us_data_rebuild_checkpoint_ablation import ( - _build_checkpoint_imputation_ablation_payload, - ) - - class FakePipeline: - def __init__(self, config): - self.config = config - - def prepare_source_input(self, frame): - return SimpleNamespace(frame=frame) - - def prepare_seed_data_from_source(self, source_input): - return pd.DataFrame( - {"household_id": [1], "person_id": [1], "hh_weight": [1.0]} - ) - - monkeypatch.setattr( - "microplex_us.pipelines.us.USMicroplexPipeline", - FakePipeline, - ) - scaffold_frame = SimpleNamespace(source=SimpleNamespace(name="scaffold")) - - payload = _build_checkpoint_imputation_ablation_payload( - SimpleNamespace( - config=SimpleNamespace( - donor_imputer_condition_selection="pe_prespecified", - ), - source_frame=scaffold_frame, - source_frames=(scaffold_frame,), - ), - artifact_id="artifact", - manifest={}, - ) - - assert payload is None - - -def _fake_versioned_artifacts( - artifact_root: Path, - build_result: Any, -) -> USMicroplexVersionedBuildArtifacts: - return USMicroplexVersionedBuildArtifacts( - build_result=build_result, - artifact_paths=USMicroplexArtifactPaths( - output_dir=artifact_root, - version_id=artifact_root.name, - seed_data=artifact_root / "seed_data.parquet", - synthetic_data=artifact_root / "synthetic_data.parquet", - calibrated_data=artifact_root / "calibrated_data.parquet", - targets=artifact_root / "targets.json", - manifest=artifact_root / "manifest.json", - policyengine_dataset=artifact_root / "policyengine_us.h5", - ), - ) - - -def _fake_evidence_result(artifact_root: Path) -> SimpleNamespace: - return SimpleNamespace( - 
artifact_dir=artifact_root, - manifest_path=artifact_root / "manifest.json", - harness_path=None, - native_scores_path=None, - parity_path=artifact_root / "pe_us_data_rebuild_parity.json", - parity_payload={"verdict": {"hasRealPolicyEngineComparison": False}}, - native_audit_path=None, - native_audit_payload=None, - imputation_ablation_path=None, - imputation_ablation_payload=None, - ) - - -def _write_complete_resume_artifact_root(artifact_root: Path) -> Path: - artifact_root.mkdir(parents=True) - artifacts: dict[str, str] = {} - stage_output_manifests: dict[str, str] = {} - for stage_id in US_CANONICAL_STAGE_IDS: - contract = get_us_pipeline_stage_contract(stage_id) - outputs: dict[str, Any] = {} - required_outputs: list[str] = [] - for resource in contract.outputs: - if not resource.required: - continue - required_outputs.append(resource.key) - if resource.kind == "artifact": - artifact_key = resource.artifact_key or resource.key - path = _write_resume_artifact_file( - artifact_root, - resource.stage_id or stage_id, - artifact_key, - ) - artifacts[artifact_key] = str(path.relative_to(artifact_root)) - outputs[resource.key] = { - "path": str(path.relative_to(artifact_root)), - "exists": True, - } - else: - outputs[resource.key] = {"value": True} - - stage_manifest_path = ( - artifact_root / "stage_artifacts" / "manifests" / f"{stage_id}.json" - ) - stage_manifest_path.parent.mkdir(parents=True, exist_ok=True) - stage_manifest_path.write_text( - json.dumps( - { - "contractVersion": US_STAGE_CONTRACT_VERSION, - "stageId": stage_id, - "complete": True, - "lifecycleStatus": "complete", - "requiredOutputs": required_outputs, - "missingRequiredOutputs": [], - "outputs": outputs, - } - ) - ) - stage_output_manifests[stage_id] = str( - stage_manifest_path.relative_to(artifact_root) - ) - - manifest = { - "created_at": "2026-04-06T00:00:00+00:00", - "config": default_policyengine_us_data_rebuild_checkpoint_config( - 
policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - ).to_dict(), - "rows": {"seed": 1, "synthetic": 1, "calibrated": 1}, - "weights": {"nonzero": 1, "total": 1.0}, - "targets": {"n_marginal_groups": 0, "n_continuous": 0}, - "synthesis": { - "source_names": ["fake_source"], - "scaffold_source": "fake_source", - "backend": "seed", - }, - "calibration": {"converged": True}, - "artifacts": artifacts, - "stage_output_manifests": stage_output_manifests, - } - (artifact_root / "manifest.json").write_text(json.dumps(manifest)) - return artifact_root - - -def _write_resume_artifact_file( - artifact_root: Path, - stage_id: str, - artifact_key: str, -) -> Path: - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - path = resolve_us_stage_artifact_contract_path( - artifact_root, - stage_id, - artifact_key, - ) - path.parent.mkdir(parents=True, exist_ok=True) - if contract.format == "policyengine_entity_bundle": - stage = ( - "post_microsim" - if artifact_key == "pre_calibration_policyengine_entity_tables" - else "post_calibration" - ) - metadata = {"format_version": 1, "stage": stage} - for table_name in ( - "households", - "persons", - "tax_units", - "spm_units", - "families", - "marital_units", - ): - metadata[table_name] = {"rows": 1, "columns": [f"{table_name}_id"]} - (path.parent / f"{table_name}.parquet").write_text("placeholder") - path.write_text(json.dumps(metadata)) - return path - if contract.format == "json": - path.write_text("{}") - else: - path.write_text("placeholder") - return path diff --git a/tests/pipelines/test_pe_us_data_rebuild_parity.py b/tests/pipelines/test_pe_us_data_rebuild_parity.py deleted file mode 100644 index 55dc0fe4..00000000 --- a/tests/pipelines/test_pe_us_data_rebuild_parity.py +++ /dev/null @@ -1,259 +0,0 @@ -"""Tests for the PE-US-data rebuild parity artifact helpers.""" - -from __future__ import annotations - -import json - -from 
microplex_us.pipelines.pe_us_data_rebuild import ( - default_policyengine_us_data_rebuild_config, -) -from microplex_us.pipelines.pe_us_data_rebuild_parity import ( - build_policyengine_us_data_rebuild_parity_artifact, - write_policyengine_us_data_rebuild_parity_artifact, -) - - -def test_build_policyengine_us_data_rebuild_parity_artifact_summarizes_comparison( - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - config = default_policyengine_us_data_rebuild_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - ).to_dict() - manifest = { - "config": config, - "artifacts": { - "policyengine_harness": "policyengine_harness.json", - "policyengine_native_scores": "policyengine_native_scores.json", - "imputation_ablation": "imputation_ablation.json", - }, - } - harness_payload = { - "candidate_label": "microplex", - "baseline_label": "policyengine_us_data", - "period": 2024, - "metadata": {"slice_profile": "pe_native_broad"}, - "summary": { - "candidate_mean_abs_relative_error": 0.12, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": 0.02, - "candidate_composite_parity_loss": 0.20, - "baseline_composite_parity_loss": 0.15, - "composite_parity_loss_delta": 0.05, - "slice_win_rate": 0.40, - "target_win_rate": 0.35, - "supported_target_rate": 0.97, - "baseline_supported_target_rate": 0.99, - "tag_summaries": {"national": {"target_win_rate": 0.5}}, - }, - } - native_scores_payload = { - "metric": "enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_enhanced_cps_native_loss": 0.30, - "baseline_enhanced_cps_native_loss": 0.20, - "enhanced_cps_native_loss_delta": 0.10, - "candidate_beats_baseline": False, - "candidate_unweighted_msre": 0.31, - "baseline_unweighted_msre": 0.21, - "unweighted_msre_delta": 0.10, - "n_targets_total": 100, - "n_targets_kept": 90, - "n_targets_zero_dropped": 5, - "n_targets_bad_dropped": 5, - 
"n_national_targets": 20, - "n_state_targets": 70, - }, - } - imputation_ablation_payload = { - "schema_version": 1, - "production_variant": "structured_pe_conditioning", - "summary": { - "source_count": 5, - "skipped_source_count": 0, - "target_count": 94, - "production_variant": "structured_pe_conditioning", - "production_mean_weighted_mae": 34116.09, - "production_mean_support_f1": 0.5375, - "best_mean_weighted_mae_variant": "top_correlated_qrf", - "best_mean_support_f1_variant": "structured_pe_conditioning", - "variant_scorecard": { - "structured_pe_conditioning": { - "mean_weighted_mae": 34116.09, - "mean_support_f1": 0.5375, - }, - "top_correlated_qrf": { - "mean_weighted_mae": 32873.70, - "mean_support_f1": 0.5352, - }, - }, - }, - } - - payload = build_policyengine_us_data_rebuild_parity_artifact( - artifact_dir, - manifest_payload=manifest, - harness_payload=harness_payload, - native_scores_payload=native_scores_payload, - imputation_ablation_payload=imputation_ablation_payload, - ) - - assert payload["schemaVersion"] == 1 - assert payload["program"]["programId"] == "pe-us-data-rebuild-v1" - assert payload["profileConformance"]["exactMatch"] is True - assert payload["evidence"]["manifest"]["source"] == "in_memory_override" - assert payload["evidence"]["policyengineHarness"]["source"] == "in_memory_override" - assert payload["evidence"]["policyengineNativeScores"]["source"] == "in_memory_override" - assert payload["evidence"]["imputationAblation"]["source"] == "in_memory_override" - assert payload["baselineSlice"]["baselineDatasetPath"] == "/tmp/enhanced_cps_2024.h5" - assert payload["baselineSlice"]["baselineLabel"] == "policyengine_us_data" - assert payload["comparison"]["policyengineHarness"]["isPolicyEngineComparison"] is True - assert ( - payload["comparison"]["policyengineNativeScores"]["isPolicyEngineComparison"] - is True - ) - assert ( - payload["comparison"]["policyengineHarness"]["mean_abs_relative_error_delta"] - == 0.02 - ) - assert ( - 
payload["comparison"]["policyengineNativeScores"]["enhanced_cps_native_loss_delta"] - == 0.10 - ) - assert payload["comparison"]["imputationAblation"]["available"] is True - assert ( - payload["comparison"]["imputationAblation"]["best_mean_weighted_mae_variant"] - == "top_correlated_qrf" - ) - assert ( - payload["comparison"]["imputationAblation"]["best_mean_support_f1_variant"] - == "structured_pe_conditioning" - ) - assert payload["verdict"]["candidateBeatsHarnessMeanAbsRelativeError"] is False - assert payload["verdict"]["candidateBeatsNativeBroadLoss"] is False - assert payload["verdict"]["productionImputationVariantIsMaeWinner"] is False - assert payload["verdict"]["productionImputationVariantIsSupportWinner"] is True - assert payload["verdict"]["hasRealPolicyEngineComparison"] is True - assert payload["verdict"]["hasImputationAblation"] is True - - -def test_write_policyengine_us_data_rebuild_parity_artifact_records_config_drift( - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - config = { - **default_policyengine_us_data_rebuild_config( - donor_imputer_condition_selection="top_correlated", - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - ).to_dict(), - "experimental_override": 7, - } - manifest = {"config": config} - harness_payload = { - "candidate_label": "microplex", - "baseline_label": "policyengine_us_data", - "period": 2024, - "metadata": {}, - "summary": { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - "candidate_composite_parity_loss": 0.14, - "baseline_composite_parity_loss": 0.15, - "composite_parity_loss_delta": -0.01, - "slice_win_rate": 0.55, - "target_win_rate": 0.58, - "supported_target_rate": 0.98, - "baseline_supported_target_rate": 0.99, - "tag_summaries": {}, - }, - } - - output_path = write_policyengine_us_data_rebuild_parity_artifact( - artifact_dir, - manifest_payload=manifest, - 
harness_payload=harness_payload, - ) - - written = json.loads(output_path.read_text()) - drift = { - item["key"]: item - for item in written["profileConformance"]["differingKeys"] - } - - assert output_path == artifact_dir / "pe_us_data_rebuild_parity.json" - assert written["profileConformance"]["exactMatch"] is False - assert drift["donor_imputer_condition_selection"]["expected"] == "pe_prespecified" - assert drift["donor_imputer_condition_selection"]["observed"] == "top_correlated" - assert drift["experimental_override"]["expected"] is None - assert drift["experimental_override"]["observed"] == 7 - assert written["verdict"]["candidateBeatsHarnessMeanAbsRelativeError"] is True - - -def test_write_policyengine_us_data_rebuild_parity_artifact_uses_bundle_files_and_validates_identity( - tmp_path, -) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - manifest = { - "config": default_policyengine_us_data_rebuild_config( - policyengine_baseline_dataset="/tmp/enhanced_cps_2024.h5", - policyengine_targets_db="/tmp/policy_data.db", - ).to_dict(), - } - harness_payload = { - "candidate_label": "microplex", - "baseline_label": "not_policyengine", - "period": 2024, - "metadata": {}, - "summary": { - "candidate_mean_abs_relative_error": 0.08, - "baseline_mean_abs_relative_error": 0.10, - "mean_abs_relative_error_delta": -0.02, - "candidate_composite_parity_loss": 0.14, - "baseline_composite_parity_loss": 0.15, - "composite_parity_loss_delta": -0.01, - "slice_win_rate": 0.55, - "target_win_rate": 0.58, - "supported_target_rate": 0.98, - "baseline_supported_target_rate": 0.99, - "tag_summaries": {}, - }, - } - native_scores_payload = { - "metric": "not_enhanced_cps_native_loss", - "period": 2024, - "summary": { - "candidate_beats_baseline": True, - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - (artifact_dir / "policyengine_harness.json").write_text(json.dumps(harness_payload)) - (artifact_dir / 
"policyengine_native_scores.json").write_text( - json.dumps(native_scores_payload) - ) - - output_path = write_policyengine_us_data_rebuild_parity_artifact(artifact_dir) - written = json.loads(output_path.read_text()) - - assert written["evidence"]["manifest"]["source"] == "artifact_bundle" - assert written["evidence"]["manifest"]["exists"] is True - assert written["evidence"]["policyengineHarness"]["exists"] is True - assert written["evidence"]["policyengineNativeScores"]["exists"] is True - assert written["evidence"]["imputationAblation"]["exists"] is False - assert written["comparison"]["policyengineHarness"]["isPolicyEngineComparison"] is False - assert ( - written["comparison"]["policyengineNativeScores"]["isPolicyEngineComparison"] - is False - ) - assert written["comparison"]["imputationAblation"]["available"] is False - assert written["verdict"]["candidateBeatsHarnessMeanAbsRelativeError"] is None - assert written["verdict"]["candidateBeatsNativeBroadLoss"] is None - assert written["verdict"]["productionImputationVariantIsMaeWinner"] is None - assert written["verdict"]["productionImputationVariantIsSupportWinner"] is None - assert written["verdict"]["hasRealPolicyEngineComparison"] is False - assert written["verdict"]["hasImputationAblation"] is False diff --git a/tests/pipelines/test_pe_us_dataset_readiness.py b/tests/pipelines/test_pe_us_dataset_readiness.py deleted file mode 100644 index 89436787..00000000 --- a/tests/pipelines/test_pe_us_dataset_readiness.py +++ /dev/null @@ -1,147 +0,0 @@ -"""Tests for lightweight PE-US H5 readiness audits.""" - -from __future__ import annotations - -import json - -import h5py -import numpy as np - -from microplex_us.pipelines.pe_us_dataset_readiness import ( - DEFAULT_EXPECTED_MATERIALIZED_VARIABLES, - build_policyengine_us_dataset_readiness_audit, - write_policyengine_us_dataset_readiness_audit, -) - - -def test_build_policyengine_us_dataset_readiness_audit_passes_complete_artifact( - tmp_path, -): - artifact_dir = 
tmp_path / "run" - artifact_dir.mkdir() - dataset_path = artifact_dir / "policyengine_us.h5" - _write_dataset(dataset_path) - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "rows": {"calibrated": 2}, - "weights": {"total": 3.0}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "source_spine_composition": "source_spine_composition.json", - }, - } - ) - ) - (artifact_dir / "source_spine_composition.json").write_text( - json.dumps( - { - "household_count": 2, - "nonzero_household_count": 2, - "total_active_weight": 3.0, - "effective_sample_size": 1.8, - "groups": [ - { - "spine": "cps_asec", - "household_count": 1, - "nonzero_household_count": 1, - "total_active_weight": 2.0, - "total_source_weight": 2.0, - }, - { - "spine": "acs_pums", - "household_count": 1, - "nonzero_household_count": 1, - "total_active_weight": 1.0, - "total_source_weight": 5.0, - }, - ], - } - ) - ) - - audit = build_policyengine_us_dataset_readiness_audit(artifact_dir, period=2024) - - assert audit["valid"] is True - assert audit["entityCounts"] == { - "household": 2, - "person": 3, - "tax_unit": 2, - "spm_unit": 2, - } - assert audit["variableSummaries"]["state_fips"]["entity"] == "household" - assert audit["variableSummaries"]["spm_unit_spm_threshold"]["positiveShare"] == 1.0 - assert audit["sourceSpineComposition"]["groups"][1]["spine"] == "acs_pums" - assert audit["issues"] == [] - - -def test_build_policyengine_us_dataset_readiness_audit_reports_missing_outputs( - tmp_path, -): - dataset_path = tmp_path / "policyengine_us.h5" - _write_dataset(dataset_path, omit=("snap", "county_fips")) - - audit = build_policyengine_us_dataset_readiness_audit( - dataset_path, - expected_spines=(), - ) - issues_by_variable = { - issue.get("variable"): issue for issue in audit["issues"] if issue.get("variable") - } - - assert audit["valid"] is False - assert issues_by_variable["county_fips"]["severity"] == "error" - assert issues_by_variable["snap"]["severity"] == "error" 
- - -def test_write_policyengine_us_dataset_readiness_audit_writes_sidecar(tmp_path): - dataset_path = tmp_path / "policyengine_us.h5" - _write_dataset(dataset_path) - - output_path = write_policyengine_us_dataset_readiness_audit( - dataset_path, - expected_spines=(), - ) - - assert output_path == tmp_path / "policyengine_us_readiness.json" - payload = json.loads(output_path.read_text()) - assert payload["valid"] is True - assert payload["expectedMaterializedVariables"] == list( - DEFAULT_EXPECTED_MATERIALIZED_VARIABLES - ) - - -def _write_dataset(path, *, omit=()): - omit = set(omit) - arrays = { - "household_id": np.array([1, 2]), - "household_weight": np.array([2.0, 1.0]), - "person_id": np.array([10, 11, 20]), - "person_household_id": np.array([1, 1, 2]), - "tax_unit_id": np.array([100, 200]), - "person_tax_unit_id": np.array([100, 100, 200]), - "spm_unit_id": np.array([500, 600]), - "person_spm_unit_id": np.array([500, 500, 600]), - "state_fips": np.array([6, 36]), - "county_fips": np.array([b"06001", b"36061"]), - "congressional_district_geoid": np.array([605, 3610]), - "spm_unit_spm_threshold": np.array([30_000.0, 36_000.0]), - "spm_unit_tenure_type": np.array([b"OWN_WITH_MORTGAGE", b"RENT"]), - "income_tax": np.array([100.0, 200.0]), - "income_tax_positive": np.array([100.0, 200.0]), - "eitc": np.array([0.0, 50.0]), - "ctc": np.array([1_000.0, 0.0]), - "refundable_ctc": np.array([400.0, 0.0]), - "non_refundable_ctc": np.array([600.0, 0.0]), - "snap": np.array([10.0, 0.0]), - "ssi": np.array([0.0, 100.0, 0.0]), - "tanf": np.array([0.0, 0.0]), - "medicaid": np.array([1.0, 0.0, 1.0]), - "aca_ptc": np.array([0.0, 75.0]), - } - with h5py.File(path, "w") as handle: - for variable, values in arrays.items(): - if variable in omit: - continue - group = handle.create_group(variable) - group.create_dataset("2024", data=values) diff --git a/tests/pipelines/test_performance.py b/tests/pipelines/test_performance.py deleted file mode 100644 index 510c1869..00000000 --- 
a/tests/pipelines/test_performance.py +++ /dev/null @@ -1,1519 +0,0 @@ -"""Tests for the US microplex performance harness.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pandas as pd -from microplex.core import EntityType -from microplex.targets import TargetQuery, TargetSet, TargetSpec - -from microplex_us.pipelines.pe_native_optimization import ( - PolicyEngineUSNativeWeightOptimizationResult, -) -from microplex_us.pipelines.performance import ( - USMicroplexPerformanceHarnessConfig, - USMicroplexPerformanceHarnessRequest, - USMicroplexPerformanceHarnessResult, - USMicroplexPerformanceSession, - _calibration_build_config_key, - _precalibration_build_config_key, - _sample_matched_household_ids, - _write_matched_policyengine_us_baseline_dataset, - default_fast_calibration_target_variables, - run_us_microplex_performance_harness, - warm_us_microplex_parity_cache, -) -from microplex_us.pipelines.us import USMicroplexBuildConfig -from microplex_us.policyengine import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSEntityTableBundle, - load_policyengine_us_entity_tables, - write_policyengine_us_time_period_dataset, -) - - -class _DummyProvider: - def __init__(self, name: str): - self.descriptor = SimpleNamespace(name=name) - - def load_frame(self, query=None): - _ = query - return SimpleNamespace(source=SimpleNamespace(name=self.descriptor.name)) - - -@dataclass -class _FakeHarnessRun: - candidate_composite_parity_loss: float = 0.4 - baseline_composite_parity_loss: float = 0.5 - candidate_mean_abs_relative_error: float = 0.2 - baseline_mean_abs_relative_error: float = 0.25 - target_win_rate: float = 0.75 - slice_win_rate: float = 1.0 - - -class _FakePipeline: - def __init__( - self, - config=None, - stage_log: list[str] | None = None, - stage_log_style: str = "full", - ): - self.config = config - self.stage_log = stage_log - 
self.stage_log_style = stage_log_style - - def _log(self, message: str) -> None: - if self.stage_log is not None: - self.stage_log.append(message) - - def prepare_source_input(self, frame): - if self.stage_log_style == "short": - self._log(f"prepare_source_input:{frame.source.name}") - else: - self._log("prepare_source_input") - return SimpleNamespace(frame=frame) - - def _select_scaffold_source(self, source_inputs): - self._log("select_scaffold") - return source_inputs[0] - - def prepare_seed_data_from_source(self, source_input): - _ = source_input - self._log("prepare_seed" if self.stage_log_style == "short" else "prepare_seed_data") - return pd.DataFrame({"household_id": [1], "income": [1.0], "hh_weight": [1.0]}) - - def _integrate_donor_sources(self, seed_data, *, scaffold_input, donor_inputs): - _ = scaffold_input - _ = donor_inputs - self._log( - "integrate_donors" - if self.stage_log_style == "short" - else "integrate_donor_sources" - ) - if self.stage_log is not None: - return { - "seed_data": seed_data.assign(dividend_income=[2.0]), - "integrated_variables": ["dividend_income"], - } - return {"seed_data": seed_data, "integrated_variables": []} - - def build_targets(self, seed_data): - _ = seed_data - self._log("build_targets") - return SimpleNamespace(marginal={}, continuous={}) - - def _resolve_synthesis_variables( - self, - source_input=None, - *, - fusion_plan=None, - include_all_observed_targets=False, - available_columns=None, - ): - _ = source_input - _ = fusion_plan - _ = include_all_observed_targets - _ = available_columns - self._log("resolve_synthesis_variables") - return SimpleNamespace(condition_vars=("age",), target_vars=("income",)) - - def synthesize(self, seed_data, synthesis_variables=None): - _ = synthesis_variables - self._log("synthesize") - return seed_data.assign(weight=[1.0]), None, {"backend": "bootstrap"} - - def ensure_target_support(self, synthetic_data, seed_data, targets): - _ = seed_data - _ = targets - 
self._log("ensure_target_support") - return synthetic_data - - def build_policyengine_entity_tables(self, population): - _ = population - self._log( - "build_policyengine_entity_tables" - if self.stage_log_style == "short" - else "build_policyengine_tables" - ) - if self.stage_log is not None: - return PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), - persons=None, - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - return SimpleNamespace(households=pd.DataFrame({"household_id": [1]})) - - def calibrate_policyengine_tables(self, tables): - _ = tables - self._log("calibrate_policyengine_tables") - if self.stage_log is not None: - return ( - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), - persons=None, - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ), - pd.DataFrame({"weight": [1.0]}), - {"backend": "policyengine_db_entropy"}, - ) - return ( - SimpleNamespace(households=pd.DataFrame({"household_id": [1]})), - pd.DataFrame({"weight": [1.0]}), - {"backend": "policyengine_db_entropy"}, - ) - - def export_policyengine_dataset( - self, - result, - path, - *, - period=None, - direct_override_variables=None, - ): - _ = result - _ = period - self._log(f"export_policyengine_dataset:{tuple(direct_override_variables or ())}") - path.write_text("stub") - return path - - -def _patch_fake_harness( - monkeypatch, - *, - stage_log: list[str] | None = None, - stage_log_style: str = "full", -) -> None: - monkeypatch.setattr( - "microplex_us.pipelines.performance.USMicroplexPipeline", - lambda config=None: _FakePipeline( - config=config, - stage_log=stage_log, - stage_log_style=stage_log_style, - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.FusionPlan.from_sources", - lambda sources: SimpleNamespace(source_names=tuple(source.name for source in sources)), - ) - - -def 
test_default_fast_calibration_target_variables_prefers_income_tax_over_agi(): - assert default_fast_calibration_target_variables( - ("adjusted_gross_income", "income_tax", "dividend_income") - ) == ("income_tax", "dividend_income") - assert default_fast_calibration_target_variables( - ("adjusted_gross_income", "dividend_income") - ) == ("adjusted_gross_income", "dividend_income") - - -def test_run_us_microplex_performance_harness_returns_stage_timings(monkeypatch): - stage_log: list[str] = [] - cache_refs: list[object] = [] - _patch_fake_harness( - monkeypatch, - stage_log=stage_log, - stage_log_style="short", - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.PolicyEngineUSDBTargetProvider", - lambda path: SimpleNamespace(path=path), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.default_policyengine_us_db_harness_slices", - lambda **kwargs: (SimpleNamespace(name="all_targets", query=SimpleNamespace(period=kwargs["period"])),), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.filter_nonempty_policyengine_us_harness_slices", - lambda provider, slices, cache=None: cache_refs.append(cache) or tuple(slices), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.evaluate_policyengine_us_harness", - lambda *args, **kwargs: cache_refs.append(kwargs.get("cache")) or _FakeHarnessRun(), - ) - comparison_cache = PolicyEngineUSComparisonCache() - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps"), _DummyProvider("puf")], - config=USMicroplexPerformanceHarnessConfig( - targets_db="/tmp/policy_data.db", - baseline_dataset="/tmp/enhanced_cps.h5", - ), - comparison_cache=comparison_cache, - ) - - assert result.source_names == ("cps", "puf") - assert result.candidate_composite_parity_loss == 0.4 - assert result.baseline_composite_parity_loss == 0.5 - assert result.target_win_rate == 0.75 - assert result.slice_win_rate == 1.0 - assert result.total_seconds >= 0.0 - assert cache_refs == 
[comparison_cache, comparison_cache] - assert set(result.stage_timings) >= { - "load_frames", - "prepare_source_inputs", - "prepare_seed_data", - "integrate_donor_sources", - "build_targets", - "resolve_synthesis_variables", - "synthesize", - "ensure_target_support", - "build_policyengine_tables", - "calibrate_policyengine_tables", - "evaluate_parity_harness", - } - assert stage_log == [ - "prepare_source_input:cps", - "prepare_source_input:puf", - "select_scaffold", - "prepare_seed", - "integrate_donors", - "build_targets", - "resolve_synthesis_variables", - "synthesize", - "ensure_target_support", - "build_policyengine_entity_tables", - "calibrate_policyengine_tables", - ] - - -def test_run_us_microplex_performance_harness_requires_targets_db_and_baseline_for_parity(): - try: - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig(), - ) - except ValueError as exc: - assert "requires both targets_db and baseline_dataset" in str(exc) - else: - raise AssertionError("Expected ValueError when parity inputs are missing") - - -def test_run_us_microplex_performance_harness_can_skip_parity(monkeypatch): - _patch_fake_harness(monkeypatch) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig(evaluate_parity=False), - ) - - assert result.parity_run is None - assert "evaluate_parity_harness" not in result.stage_timings - - -def test_run_us_microplex_performance_harness_can_enable_fast_calibration_targets( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - fast_inner_loop_calibration=True, - target_variables=("adjusted_gross_income", "income_tax", "dividend_income"), - ), - ) - - assert result.build_config.policyengine_target_variables == ( - "adjusted_gross_income", - 
"income_tax", - "dividend_income", - ) - assert result.build_config.policyengine_calibration_target_variables == ( - "income_tax", - "dividend_income", - ) - - -def test_run_us_microplex_performance_harness_can_keep_exact_calibration_targets( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - fast_inner_loop_calibration=False, - target_variables=("adjusted_gross_income", "income_tax", "dividend_income"), - ), - ) - - assert result.build_config.policyengine_calibration_target_variables == ( - "adjusted_gross_income", - "income_tax", - "dividend_income", - ) - - -def test_run_us_microplex_performance_harness_preserves_target_profiles(monkeypatch): - _patch_fake_harness(monkeypatch) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - target_profile="pe_native_broad", - calibration_target_profile="pe_native_broad", - ), - ) - - assert result.build_config.policyengine_target_profile == "pe_native_broad" - assert result.build_config.policyengine_calibration_target_profile == "pe_native_broad" - assert result.build_config.policyengine_target_variables == () - assert result.build_config.policyengine_target_geo_levels == () - assert result.build_config.policyengine_calibration_target_variables == () - assert result.build_config.policyengine_calibration_target_geo_levels == () - - -def test_calibration_cache_key_includes_iteration_and_tolerance_settings(): - base = USMicroplexBuildConfig( - calibration_backend="entropy", - calibration_tol=1e-6, - calibration_max_iter=100, - ) - updated = USMicroplexBuildConfig( - calibration_backend="entropy", - calibration_tol=1e-5, - calibration_max_iter=500, - ) - - assert _precalibration_build_config_key(base) == _precalibration_build_config_key( - updated - ) - assert 
_calibration_build_config_key(base) != _calibration_build_config_key( - updated - ) - - -def test_calibration_cache_key_includes_household_budget_selection(): - base = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_selection_household_budget=None, - ) - updated = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_selection_household_budget=29_999, - ) - - assert _precalibration_build_config_key(base) == _precalibration_build_config_key( - updated - ) - assert _calibration_build_config_key(base) != _calibration_build_config_key( - updated - ) - - -def test_calibration_cache_key_includes_pe_native_selection_hyperparameters(): - base = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=29_999, - policyengine_selection_max_iter=200, - policyengine_selection_tol=1e-8, - policyengine_selection_l2_penalty=0.0, - ) - updated = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=29_999, - policyengine_selection_max_iter=1_000, - policyengine_selection_tol=1e-7, - policyengine_selection_l2_penalty=1e-5, - ) - - assert _precalibration_build_config_key(base) == _precalibration_build_config_key( - updated - ) - assert _calibration_build_config_key(base) != _calibration_build_config_key( - updated - ) - - -def test_run_us_microplex_performance_harness_allows_full_source_queries(monkeypatch): - captured_queries: list[dict[str, object]] = [] - _patch_fake_harness(monkeypatch) - - class CapturingProvider(_DummyProvider): - def load_frame(self, query=None): - provider_filters = dict(getattr(query, "provider_filters", {}) or {}) - captured_queries.append(provider_filters) - return super().load_frame(query) - - run_us_microplex_performance_harness( - providers=[CapturingProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - 
sample_n=None, - evaluate_parity=False, - ), - ) - - assert captured_queries == [{"sample_n": None, "random_seed": 42}] - - -def test_run_us_microplex_performance_harness_can_evaluate_native_loss(monkeypatch): - _patch_fake_harness(monkeypatch) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - lambda **kwargs: { - "summary": { - "candidate_enhanced_cps_native_loss": 0.2, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - }, - "kwargs": kwargs, - }, - ) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - ), - ) - - assert result.candidate_enhanced_cps_native_loss == 0.2 - assert result.baseline_enhanced_cps_native_loss == 0.3 - assert result.enhanced_cps_native_loss_delta == -0.1 - assert "evaluate_pe_native_loss" in result.stage_timings - - -def test_run_us_microplex_performance_harness_can_evaluate_matched_native_loss( - monkeypatch, - tmp_path, -): - _patch_fake_harness(monkeypatch) - matched_calls: list[dict[str, object]] = [] - score_calls: list[dict[str, object]] = [] - - def _fake_write_matched_baseline( - baseline_dataset_path, - output_dataset_path, - *, - period, - household_count, - random_seed, - ): - matched_calls.append( - { - "baseline_dataset_path": baseline_dataset_path, - "period": period, - "household_count": household_count, - "random_seed": random_seed, - } - ) - path = Path(output_dataset_path) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("matched") - return str(path.resolve()) - - def _fake_score(**kwargs): - score_calls.append(kwargs) - return { - "summary": { - "candidate_enhanced_cps_native_loss": 0.18, - "baseline_enhanced_cps_native_loss": 0.22, - "enhanced_cps_native_loss_delta": -0.04, - } - } - - 
monkeypatch.setattr( - "microplex_us.pipelines.performance._write_matched_policyengine_us_baseline_dataset", - _fake_write_matched_baseline, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - _fake_score, - ) - - baseline_output = tmp_path / "matched_baseline.h5" - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_matched_pe_native_loss=True, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_matched_baseline_dataset_path=baseline_output, - ), - ) - - assert matched_calls == [ - { - "baseline_dataset_path": "/tmp/enhanced_cps.h5", - "period": 2024, - "household_count": 1, - "random_seed": 42, - } - ] - assert score_calls[0]["baseline_dataset_path"] == str(baseline_output.resolve()) - assert result.matched_pe_native_scores is not None - assert result.matched_baseline_dataset_path == str(baseline_output.resolve()) - assert "build_matched_baseline_dataset" in result.stage_timings - assert "evaluate_matched_pe_native_loss" in result.stage_timings - - -def test_run_us_microplex_performance_harness_can_reweight_matched_native_loss( - monkeypatch, - tmp_path, -): - _patch_fake_harness(monkeypatch) - matched_calls: list[dict[str, object]] = [] - reweight_calls: list[dict[str, object]] = [] - score_calls: list[dict[str, object]] = [] - - def _fake_write_matched_baseline( - baseline_dataset_path, - output_dataset_path, - *, - period, - household_count, - random_seed, - ): - matched_calls.append( - { - "baseline_dataset_path": baseline_dataset_path, - "period": period, - "household_count": household_count, - "random_seed": random_seed, - } - ) - path = Path(output_dataset_path) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("matched") - return str(path.resolve()) - - def _fake_reweight( - input_dataset_path, - output_dataset_path, - *, - 
period, - epochs, - l0_lambda, - seed, - policyengine_us_data_repo, - ): - reweight_calls.append( - { - "input_dataset_path": input_dataset_path, - "output_dataset_path": str(Path(output_dataset_path)), - "period": period, - "epochs": epochs, - "l0_lambda": l0_lambda, - "seed": seed, - "policyengine_us_data_repo": policyengine_us_data_repo, - } - ) - path = Path(output_dataset_path) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("reweighted") - return str(path.resolve()) - - def _fake_score(**kwargs): - score_calls.append(kwargs) - return { - "summary": { - "candidate_enhanced_cps_native_loss": 0.17, - "baseline_enhanced_cps_native_loss": 0.21, - "enhanced_cps_native_loss_delta": -0.04, - } - } - - monkeypatch.setattr( - "microplex_us.pipelines.performance._write_matched_policyengine_us_baseline_dataset", - _fake_write_matched_baseline, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance._reweight_matched_policyengine_us_baseline_dataset", - _fake_reweight, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - _fake_score, - ) - - baseline_output = tmp_path / "matched_baseline_reweighted.h5" - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_matched_pe_native_loss=True, - reweight_matched_pe_native_loss=True, - matched_baseline_reweight_epochs=300, - matched_baseline_reweight_l0_lambda=1e-6, - matched_baseline_reweight_seed=123, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_matched_baseline_dataset_path=baseline_output, - ), - ) - - assert len(matched_calls) == 1 - assert len(reweight_calls) == 1 - assert reweight_calls[0]["epochs"] == 300 - assert reweight_calls[0]["l0_lambda"] == 1e-6 - assert reweight_calls[0]["seed"] == 123 - assert score_calls[0]["baseline_dataset_path"] == str(baseline_output.resolve()) - 
assert result.matched_baseline_dataset_path == str(baseline_output.resolve()) - assert "reweight_matched_baseline_dataset" in result.stage_timings - - -def test_write_matched_policyengine_us_baseline_dataset_preserves_variables( - tmp_path, -): - baseline_path = tmp_path / "baseline.h5" - matched_path = tmp_path / "matched.h5" - full_copy_path = tmp_path / "matched_full.h5" - - write_policyengine_us_time_period_dataset( - { - "household_id": {"2024": [10, 20]}, - "household_weight": {"2024": [1.5, 2.5]}, - "person_id": {"2024": [1, 2, 3]}, - "person_household_id": {"2024": [10, 10, 20]}, - "person_weight": {"2024": [1.5, 1.5, 2.5]}, - "tax_unit_id": {"2024": [100, 200]}, - "person_tax_unit_id": {"2024": [100, 100, 200]}, - "tax_unit_weight": {"2024": [1.5, 2.5]}, - "state_code": {"2024": [1, 2]}, - "age": {"2024": [34, 12, 45]}, - "employment_income": {"2024": [100.0, 0.0, 55.0]}, - }, - baseline_path, - ) - - matched_dataset_path = _write_matched_policyengine_us_baseline_dataset( - baseline_path, - matched_path, - period=2024, - household_count=1, - random_seed=42, - ) - matched_tables = load_policyengine_us_entity_tables(matched_dataset_path, period=2024) - assert "state_code" in matched_tables.households.columns - assert "age" in matched_tables.persons.columns - assert "employment_income" in matched_tables.persons.columns - - copied_dataset_path = _write_matched_policyengine_us_baseline_dataset( - baseline_path, - full_copy_path, - period=2024, - household_count=2, - random_seed=42, - ) - assert Path(copied_dataset_path).read_bytes() == baseline_path.read_bytes() - - -def test_sample_matched_household_ids_supports_weighted_methods(): - household_ids = np.asarray([10, 20, 30]) - weights = np.asarray([0.0, 0.0, 5.0]) - - assert _sample_matched_household_ids( - household_ids, - weights, - household_count=1, - random_seed=42, - sample_method="weight_proportional", - ).tolist() == [30] - assert _sample_matched_household_ids( - household_ids, - np.asarray([1.0, 9.0, 
2.0]), - household_count=2, - random_seed=42, - sample_method="largest_weight", - ).tolist() == [20, 30] - - -def test_run_us_microplex_performance_harness_can_write_output_bundle(monkeypatch, tmp_path): - _patch_fake_harness(monkeypatch) - - result_path = tmp_path / "result.json" - dataset_path = tmp_path / "candidate.h5" - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - output_json_path=result_path, - output_policyengine_dataset_path=dataset_path, - ), - ) - - assert result_path.exists() - assert dataset_path.exists() - assert result.policyengine_dataset_path == str(dataset_path) - - payload = json.loads(result_path.read_text()) - assert payload["policyengine_dataset_path"] == str(dataset_path) - assert payload["source_names"] == ["cps"] - assert payload["calibration_summary"]["backend"] == "policyengine_db_entropy" - - -def test_run_us_microplex_performance_harness_can_write_pe_native_target_delta_output( - monkeypatch, - tmp_path, -): - _patch_fake_harness(monkeypatch) - - delta_path = tmp_path / "target_deltas.json" - compare_calls: list[dict[str, object]] = [] - - def _fake_compare(**kwargs): - compare_calls.append(kwargs) - return { - "metric": "enhanced_cps_native_loss_target_delta", - "top_regressions": [{"target_name": "nation/foo", "weighted_term_delta": 0.5}], - "top_improvements": [{"target_name": "state/bar", "weighted_term_delta": -0.25}], - } - - monkeypatch.setattr( - "microplex_us.pipelines.performance.compare_us_pe_native_target_deltas", - _fake_compare, - ) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_pe_native_target_delta_path=delta_path, - pe_native_target_delta_top_k=7, - ), - ) - - assert 
result.pe_native_target_deltas is not None - assert delta_path.exists() - assert compare_calls[0]["from_dataset_path"] == "/tmp/enhanced_cps.h5" - assert compare_calls[0]["top_k"] == 7 - payload = json.loads(delta_path.read_text()) - assert payload["metric"] == "enhanced_cps_native_loss_target_delta" - assert payload["top_regressions"][0]["target_name"] == "nation/foo" - assert "evaluate_pe_native_target_deltas" in result.stage_timings - assert "write_pe_native_target_delta_json" in result.stage_timings - - -def test_run_us_microplex_performance_harness_can_write_pe_native_support_audit_output( - monkeypatch, - tmp_path, -): - _patch_fake_harness(monkeypatch) - - audit_path = tmp_path / "support_audit.json" - audit_calls: list[dict[str, object]] = [] - - def _fake_audit(**kwargs): - audit_calls.append(kwargs) - return { - "metric": "enhanced_cps_support_audit", - "comparisons": { - "critical_input_support": [ - { - "variable": "child_support_expense", - "candidate_stored": False, - "baseline_stored": True, - } - ] - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_support_audit", - _fake_audit, - ) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_pe_native_support_audit_path=audit_path, - ), - ) - - assert result.pe_native_support_audit is not None - assert audit_path.exists() - assert audit_calls[0]["baseline_dataset_path"] == "/tmp/enhanced_cps.h5" - payload = json.loads(audit_path.read_text()) - assert payload["metric"] == "enhanced_cps_support_audit" - assert ( - payload["comparisons"]["critical_input_support"][0]["variable"] - == "child_support_expense" - ) - assert "evaluate_pe_native_support_audit" in result.stage_timings - assert "write_pe_native_support_audit_json" in result.stage_timings - - -def 
test_run_us_microplex_performance_harness_passes_export_direct_overrides(monkeypatch): - stage_log: list[str] = [] - _patch_fake_harness(monkeypatch, stage_log=stage_log) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - lambda **kwargs: {"summary": {}, "kwargs": kwargs}, - ) - - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - build_config=USMicroplexBuildConfig( - policyengine_direct_override_variables=("filing_status", "snap") - ), - ), - ) - - assert "export_policyengine_dataset:('filing_status', 'snap')" in stage_log - - -def test_run_us_microplex_performance_harness_can_optimize_native_loss(monkeypatch): - stage_log: list[str] = [] - _patch_fake_harness(monkeypatch, stage_log=stage_log) - optimization_calls: list[dict[str, object]] = [] - score_calls: list[dict[str, object]] = [] - - def _fake_optimize(**kwargs): - optimization_calls.append(kwargs) - Path(kwargs["output_dataset_path"]).write_text("optimized") - return PolicyEngineUSNativeWeightOptimizationResult( - metric="enhanced_cps_native_loss_weight_optimization", - period=2024, - input_dataset=str(kwargs["input_dataset_path"]), - output_dataset=str(Path(kwargs["output_dataset_path"]).resolve()), - initial_loss=0.4, - optimized_loss=0.2, - loss_delta=-0.2, - initial_weight_sum=10.0, - optimized_weight_sum=10.0, - household_count=3, - positive_household_count=2, - budget=2, - converged=True, - iterations=12, - target_names=("nation/foo", "state/bar"), - ) - - def _fake_score(**kwargs): - score_calls.append(kwargs) - return { - "summary": { - "candidate_enhanced_cps_native_loss": 0.2, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - } - } - - monkeypatch.setattr( - 
"microplex_us.pipelines.performance.optimize_policyengine_us_native_loss_dataset", - _fake_optimize, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - _fake_score, - ) - - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - optimize_pe_native_loss=True, - pe_native_household_budget=2, - pe_native_optimizer_max_iter=50, - pe_native_optimizer_l2_penalty=0.25, - pe_native_optimizer_tol=1e-6, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - ), - ) - - assert len(optimization_calls) == 1 - assert optimization_calls[0]["budget"] == 2 - assert optimization_calls[0]["max_iter"] == 50 - assert optimization_calls[0]["l2_penalty"] == 0.25 - assert optimization_calls[0]["tol"] == 1e-6 - assert str(score_calls[0]["candidate_dataset_path"]).endswith( - "candidate_policyengine_us_optimized.h5" - ) - assert result.pe_native_scores is not None - assert result.pe_native_scores["optimization"]["optimized_loss"] == 0.2 - assert result.pe_native_scores["optimization"]["rescored_loss_abs_error"] == 0.0 - assert "optimize_pe_native_loss_weights" in result.stage_timings - - -def test_run_us_microplex_performance_harness_writes_optimized_dataset_output( - monkeypatch, - tmp_path, -): - _patch_fake_harness(monkeypatch) - - def _fake_optimize(**kwargs): - Path(kwargs["output_dataset_path"]).write_text("optimized") - return PolicyEngineUSNativeWeightOptimizationResult( - metric="enhanced_cps_native_loss_weight_optimization", - period=2024, - input_dataset=str(kwargs["input_dataset_path"]), - output_dataset=str(Path(kwargs["output_dataset_path"]).resolve()), - initial_loss=0.4, - optimized_loss=0.2, - loss_delta=-0.2, - initial_weight_sum=10.0, - optimized_weight_sum=10.0, - household_count=3, - positive_household_count=2, - budget=2, - converged=True, - 
iterations=12, - target_names=("nation/foo", "state/bar"), - ) - - monkeypatch.setattr( - "microplex_us.pipelines.performance.optimize_policyengine_us_native_loss_dataset", - _fake_optimize, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - lambda **kwargs: { - "summary": { - "candidate_enhanced_cps_native_loss": 0.2, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - } - }, - ) - - dataset_path = tmp_path / "candidate_optimized.h5" - result = run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - optimize_pe_native_loss=True, - pe_native_household_budget=2, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_policyengine_dataset_path=dataset_path, - ), - ) - - assert result.policyengine_dataset_path == str(dataset_path.resolve()) - assert dataset_path.read_text() == "optimized" - - -def test_run_us_microplex_performance_harness_rejects_native_optimization_without_scoring( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - try: - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=False, - optimize_pe_native_loss=True, - ), - ) - except ValueError as exc: - assert "evaluate_pe_native_loss" in str(exc) - else: # pragma: no cover - defensive assertion - raise AssertionError("expected optimize_pe_native_loss validation error") - - -def test_run_us_microplex_performance_harness_rejects_native_loss_mismatch(monkeypatch): - _patch_fake_harness(monkeypatch) - - monkeypatch.setattr( - "microplex_us.pipelines.performance.optimize_policyengine_us_native_loss_dataset", - lambda **kwargs: PolicyEngineUSNativeWeightOptimizationResult( - metric="enhanced_cps_native_loss_weight_optimization", - 
period=2024, - input_dataset=str(kwargs["input_dataset_path"]), - output_dataset=str(Path(kwargs["output_dataset_path"]).resolve()), - initial_loss=0.4, - optimized_loss=0.2, - loss_delta=-0.2, - initial_weight_sum=10.0, - optimized_weight_sum=10.0, - household_count=3, - positive_household_count=2, - budget=None, - converged=True, - iterations=12, - target_names=("nation/foo", "state/bar"), - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_us_pe_native_scores", - lambda **kwargs: { - "summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.05, - } - }, - ) - - try: - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - optimize_pe_native_loss=True, - pe_native_score_consistency_tol=1e-6, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - ), - ) - except ValueError as exc: - assert "does not match rescored loss" in str(exc) - else: # pragma: no cover - defensive assertion - raise AssertionError("expected PE-native loss consistency validation error") - - -def test_run_us_microplex_performance_harness_rejects_nonpositive_target_delta_top_k( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - try: - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - pe_native_target_delta_top_k=0, - ), - ) - except ValueError as exc: - assert "pe_native_target_delta_top_k" in str(exc) - else: # pragma: no cover - defensive assertion - raise AssertionError("expected target delta top-k validation error") - - -def test_run_us_microplex_performance_harness_rejects_nonpositive_matched_baseline_household_count( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - try: - 
run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - matched_baseline_household_count=0, - ), - ) - except ValueError as exc: - assert "matched_baseline_household_count" in str(exc) - else: # pragma: no cover - defensive assertion - raise AssertionError( - "expected matched baseline household count validation error" - ) - - -def test_run_us_microplex_performance_harness_rejects_reweighted_matched_loss_without_matched_loss( - monkeypatch, -): - _patch_fake_harness(monkeypatch) - - try: - run_us_microplex_performance_harness( - providers=[_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - reweight_matched_pe_native_loss=True, - ), - ) - except ValueError as exc: - assert "evaluate_matched_pe_native_loss" in str(exc) - else: # pragma: no cover - defensive assertion - raise AssertionError( - "expected reweighted matched baseline validation error" - ) - - -def test_warm_us_microplex_parity_cache_preloads_baseline(monkeypatch): - cache = PolicyEngineUSComparisonCache() - load_target_set_calls: list[tuple[int, tuple[str, ...] 
| None]] = [] - baseline_calls: list[dict[str, object]] = [] - - class FakeProvider: - def load_target_set(self, query=None): - period = query.period if query is not None else 2024 - names = tuple(query.names) if query is not None else () - load_target_set_calls.append((period, names or None)) - return TargetSet( - [ - TargetSpec( - name="policyengine_us_target_1", - entity=EntityType.HOUSEHOLD, - value=1.0, - period=period, - aggregation="count", - ) - ] - ) - - monkeypatch.setattr( - "microplex_us.pipelines.performance.PolicyEngineUSDBTargetProvider", - lambda path: FakeProvider(), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.default_policyengine_us_db_harness_slices", - lambda **kwargs: ( - SimpleNamespace( - name="all_targets", - query=TargetQuery(period=kwargs["period"]), - ), - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.PolicyEngineUSComparisonCache.load_baseline_report", - lambda self, **kwargs: baseline_calls.append(kwargs) or SimpleNamespace(), - ) - - warmed_cache = warm_us_microplex_parity_cache( - config=USMicroplexPerformanceHarnessConfig( - targets_db="/tmp/policy_data.db", - baseline_dataset="/tmp/enhanced_cps.h5", - ), - comparison_cache=cache, - ) - - assert warmed_cache is cache - assert load_target_set_calls == [(2024, None)] - assert baseline_calls - assert baseline_calls[0]["baseline_dataset"] == "/tmp/enhanced_cps.h5" - - -def test_warm_us_microplex_parity_cache_uses_resolved_scope_for_named_profile(monkeypatch): - cache = PolicyEngineUSComparisonCache() - slice_kwargs: dict[str, object] = {} - baseline_calls: list[dict[str, object]] = [] - - class FakeProvider: - def load_target_set(self, query=None): - period = query.period if query is not None else 2024 - return TargetSet( - [ - TargetSpec( - name="policyengine_us_target_1", - entity=EntityType.HOUSEHOLD, - value=1.0, - period=period, - aggregation="count", - ) - ] - ) - - monkeypatch.setattr( - 
"microplex_us.pipelines.performance.PolicyEngineUSDBTargetProvider", - lambda path: FakeProvider(), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.default_policyengine_us_db_harness_slices", - lambda **kwargs: slice_kwargs.update(kwargs) - or ( - SimpleNamespace( - name="all_targets", - query=TargetQuery(period=kwargs["period"]), - ), - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.PolicyEngineUSComparisonCache.load_baseline_report", - lambda self, **kwargs: baseline_calls.append(kwargs) or SimpleNamespace(), - ) - - warm_us_microplex_parity_cache( - config=USMicroplexPerformanceHarnessConfig( - targets_db="/tmp/policy_data.db", - baseline_dataset="/tmp/enhanced_cps.h5", - target_profile="pe_native_broad", - calibration_target_profile="pe_native_broad", - build_config=USMicroplexBuildConfig( - policyengine_target_profile="pe_native_broad", - policyengine_calibration_target_profile="pe_native_broad", - ), - ), - comparison_cache=cache, - ) - - assert slice_kwargs["variables"] == () - assert slice_kwargs["domain_variables"] == () - assert slice_kwargs["geo_levels"] == () - assert baseline_calls - - -def test_us_microplex_performance_session_reuses_comparison_cache(monkeypatch): - session = USMicroplexPerformanceSession() - run_calls: list[tuple[PolicyEngineUSComparisonCache, object, object, object]] = [] - - def fake_run( - providers, - *, - config, - queries=None, - comparison_cache=None, - frame_cache=None, - precalibration_cache=None, - calibration_cache=None, - ): - _ = providers - _ = config - _ = queries - run_calls.append( - ( - comparison_cache, - frame_cache, - precalibration_cache, - calibration_cache, - ) - ) - return "ok" - - monkeypatch.setattr( - "microplex_us.pipelines.performance.run_us_microplex_performance_harness", - fake_run, - ) - - result = session.run( - [_DummyProvider("cps")], - config=USMicroplexPerformanceHarnessConfig(evaluate_parity=False), - ) - - assert result == "ok" - assert run_calls == [ - ( - 
session.comparison_cache, - session.frame_cache, - session.precalibration_cache, - session.calibration_cache, - ) - ] - - -def test_us_microplex_performance_session_run_batch_uses_native_batch_scorer( - monkeypatch, - tmp_path, -): - session = USMicroplexPerformanceSession() - run_configs: list[USMicroplexPerformanceHarnessConfig] = [] - batch_calls: list[dict[str, object]] = [] - - fake_build_result = SimpleNamespace(calibration_summary={"backend": "entropy"}) - fake_build_config = USMicroplexBuildConfig() - - def fake_run( - providers, - *, - config, - queries=None, - comparison_cache=None, - frame_cache=None, - precalibration_cache=None, - calibration_cache=None, - ): - _ = providers - _ = queries - _ = comparison_cache - _ = frame_cache - _ = precalibration_cache - _ = calibration_cache - run_configs.append(config) - dataset_path = Path(config.output_policyengine_dataset_path) - dataset_path.parent.mkdir(parents=True, exist_ok=True) - dataset_path.write_text("stub") - return USMicroplexPerformanceHarnessResult( - config=config, - build_config=fake_build_config, - build_result=fake_build_result, - source_names=("cps",), - stage_timings={"load_frames": 0.0}, - total_seconds=0.0, - parity_run=None, - pe_native_scores=None, - pe_native_target_deltas=None, - policyengine_dataset_path=str(dataset_path), - ) - - def fake_batch_score(**kwargs): - batch_calls.append(kwargs) - return [ - { - "summary": { - "candidate_enhanced_cps_native_loss": 0.2, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.1, - }, - "timing": { - "batch_elapsed_seconds": 1.25, - "batch_candidate_count": 2, - }, - }, - { - "summary": { - "candidate_enhanced_cps_native_loss": 0.25, - "baseline_enhanced_cps_native_loss": 0.3, - "enhanced_cps_native_loss_delta": -0.05, - }, - "timing": { - "batch_elapsed_seconds": 1.25, - "batch_candidate_count": 2, - }, - }, - ] - - monkeypatch.setattr( - "microplex_us.pipelines.performance.run_us_microplex_performance_harness", - 
fake_run, - ) - monkeypatch.setattr( - "microplex_us.pipelines.performance.compute_batch_us_pe_native_scores", - fake_batch_score, - ) - - requests = ( - USMicroplexPerformanceHarnessRequest( - providers=(_DummyProvider("cps"),), - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_policyengine_dataset_path=tmp_path / "candidate_a.h5", - ), - ), - USMicroplexPerformanceHarnessRequest( - providers=(_DummyProvider("cps"),), - config=USMicroplexPerformanceHarnessConfig( - evaluate_parity=False, - evaluate_pe_native_loss=True, - baseline_dataset="/tmp/enhanced_cps.h5", - policyengine_us_data_repo="/tmp/policyengine-us-data", - output_policyengine_dataset_path=tmp_path / "candidate_b.h5", - ), - ), - ) - - results = session.run_batch(requests) - - assert len(run_configs) == 2 - assert all(config.evaluate_pe_native_loss is False for config in run_configs) - assert len(batch_calls) == 1 - assert batch_calls[0]["baseline_dataset_path"] == "/tmp/enhanced_cps.h5" - assert batch_calls[0]["candidate_dataset_paths"] == [ - str(tmp_path / "candidate_a.h5"), - str(tmp_path / "candidate_b.h5"), - ] - assert results[0].pe_native_scores["summary"]["candidate_enhanced_cps_native_loss"] == 0.2 - assert results[1].pe_native_scores["summary"]["candidate_enhanced_cps_native_loss"] == 0.25 - assert results[0].stage_timings["evaluate_pe_native_loss"] == 1.25 - - -def test_us_microplex_performance_session_reuses_loaded_frames(monkeypatch): - session = USMicroplexPerformanceSession() - load_calls: list[str] = [] - - class CountingProvider(_DummyProvider): - def load_frame(self, query=None): - _ = query - load_calls.append(self.descriptor.name) - return SimpleNamespace(source=SimpleNamespace(name=self.descriptor.name)) - _patch_fake_harness(monkeypatch) - - provider = CountingProvider("cps") - config = 
USMicroplexPerformanceHarnessConfig(evaluate_parity=False) - - first = session.run([provider], config=config) - second = session.run([provider], config=config) - - assert first.source_names == ("cps",) - assert second.source_names == ("cps",) - assert load_calls == ["cps"] - - -def test_us_microplex_performance_session_reuses_precalibration_state(monkeypatch): - stage_calls: list[str] = [] - - class CountingProvider(_DummyProvider): - def load_frame(self, query=None): - _ = query - return SimpleNamespace(source=SimpleNamespace(name=self.descriptor.name)) - _patch_fake_harness(monkeypatch, stage_log=stage_calls) - - provider = CountingProvider("cps") - config = USMicroplexPerformanceHarnessConfig(evaluate_parity=False) - - frame_cache = {} - precalibration_cache = {} - - first = run_us_microplex_performance_harness( - [provider], - config=config, - frame_cache=frame_cache, - precalibration_cache=precalibration_cache, - calibration_cache=None, - ) - second = run_us_microplex_performance_harness( - [provider], - config=config, - frame_cache=frame_cache, - precalibration_cache=precalibration_cache, - calibration_cache=None, - ) - - assert first.source_names == ("cps",) - assert second.source_names == ("cps",) - assert stage_calls.count("prepare_source_input") == 1 - assert stage_calls.count("prepare_seed_data") == 1 - assert stage_calls.count("integrate_donor_sources") == 1 - assert stage_calls.count("build_targets") == 1 - assert stage_calls.count("resolve_synthesis_variables") == 1 - assert stage_calls.count("synthesize") == 1 - assert stage_calls.count("ensure_target_support") == 1 - assert stage_calls.count("build_policyengine_tables") == 1 - assert stage_calls.count("calibrate_policyengine_tables") == 2 - - -def test_us_microplex_performance_session_reuses_calibration_state(monkeypatch): - session = USMicroplexPerformanceSession() - stage_calls: list[str] = [] - - class CountingProvider(_DummyProvider): - def load_frame(self, query=None): - _ = query - return 
SimpleNamespace(source=SimpleNamespace(name=self.descriptor.name)) - _patch_fake_harness(monkeypatch, stage_log=stage_calls) - - provider = CountingProvider("cps") - config = USMicroplexPerformanceHarnessConfig(evaluate_parity=False) - - first = session.run([provider], config=config) - second = session.run([provider], config=config) - - assert first.source_names == ("cps",) - assert second.source_names == ("cps",) - assert stage_calls.count("prepare_source_input") == 1 - assert stage_calls.count("calibrate_policyengine_tables") == 1 - assert len(session.calibration_cache) == 1 - assert first.stage_timings["calibrate_policyengine_tables"] >= 0.0 - assert second.stage_timings["calibrate_policyengine_tables"] == 0.0 diff --git a/tests/pipelines/test_pre_sim_parity.py b/tests/pipelines/test_pre_sim_parity.py deleted file mode 100644 index d0c9c48c..00000000 --- a/tests/pipelines/test_pre_sim_parity.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Tests for PE pre-sim parity auditing.""" - -from __future__ import annotations - -import h5py -import numpy as np -import pytest - -from microplex_us.pipelines.pre_sim_parity import ( - PreSimParityVariableSpec, - build_us_pre_sim_parity_audit, -) - - -def _write_period_dataset(path, data: dict[str, np.ndarray], *, period: int = 2024) -> None: - with h5py.File(path, "w") as handle: - for variable, values in data.items(): - group = handle.create_group(variable) - group.create_dataset(str(period), data=values) - - -def test_build_us_pre_sim_parity_audit_reports_schema_and_support(tmp_path) -> None: - reference_path = tmp_path / "reference.h5" - candidate_path = tmp_path / "candidate.h5" - - _write_period_dataset( - reference_path, - { - "household_id": np.array([1, 2], dtype=int), - "household_weight": np.array([10.0, 20.0], dtype=float), - "person_id": np.array([101, 102, 103], dtype=int), - "person_household_id": np.array([1, 1, 2], dtype=int), - "tax_unit_id": np.array([11, 12], dtype=int), - "person_tax_unit_id": np.array([11, 11, 
12], dtype=int), - "age": np.array([4, 37, 42], dtype=int), - "state_fips": np.array([1, 2], dtype=int), - "county_fips": np.array([1, 3], dtype=int), - "is_household_head": np.array([1, 0, 1], dtype=int), - "has_esi": np.array([True, False, True], dtype=bool), - "employment_income_before_lsr": np.array([0.0, 10.0, 100.0], dtype=float), - }, - ) - _write_period_dataset( - candidate_path, - { - "household_id": np.array([1, 2], dtype=int), - "household_weight": np.array([1.0, 1.0], dtype=float), - "person_id": np.array([201, 202], dtype=int), - "person_household_id": np.array([1, 2], dtype=int), - "tax_unit_id": np.array([21, 22], dtype=int), - "person_tax_unit_id": np.array([21, 22], dtype=int), - "age": np.array([4, 42], dtype=int), - "state_fips": np.array([1, 2], dtype=int), - "has_esi": np.array([1.0, 0.0], dtype=float), - "employment_income_before_lsr": np.array([0.0, 100.0], dtype=float), - }, - ) - - audit = build_us_pre_sim_parity_audit( - candidate_path, - reference_path, - focus_variables=( - PreSimParityVariableSpec("age", "age", value_kind="numeric"), - PreSimParityVariableSpec("state_fips", "state_fips", value_kind="categorical"), - PreSimParityVariableSpec("has_esi", "has_esi", value_kind="categorical"), - "county_fips", - ), - critical_reference_variables=("county_fips",), - ) - - assert audit["schema"]["reference_variable_count"] == 12 - assert audit["schema"]["candidate_variable_count"] == 10 - assert audit["schema"]["missing_in_candidate_count"] == 2 - assert audit["schema"]["missing_critical_reference_variables"] == ["county_fips"] - - reference_structure = audit["entity_structure"]["reference"] - candidate_structure = audit["entity_structure"]["candidate"] - assert reference_structure["share_multi_person_tax_units"] == 0.5 - assert candidate_structure["share_multi_person_tax_units"] == 0.0 - - state_age = audit["state_age_support"] - assert state_age["reference"]["nonempty_cell_count"] == 3 - assert state_age["candidate"]["nonempty_cell_count"] 
== 2 - assert state_age["support_recall"] == 2 / 3 - - county = audit["focus_variables"]["county_fips"] - assert county["reference_present"] is True - assert county["candidate_present"] is False - age = audit["focus_variables"]["age"] - assert age["candidate"]["kind"] == "numeric" - assert age["reference"]["kind"] == "numeric" - assert age["comparison"]["type"] == "numeric" - assert age["comparison"]["weighted_mean_ratio"] == pytest.approx(23.0 / 31.25) - state = audit["focus_variables"]["state_fips"] - assert state["candidate"]["kind"] == "categorical" - assert state["reference"]["kind"] == "categorical" - assert state["comparison"]["type"] == "categorical" - has_esi = audit["focus_variables"]["has_esi"] - assert has_esi["comparison"]["type"] == "categorical" - assert has_esi["comparison"]["support_recall"] == 1.0 - assert has_esi["comparison"]["support_precision"] == 1.0 diff --git a/tests/pipelines/test_r2_artifacts.py b/tests/pipelines/test_r2_artifacts.py deleted file mode 100644 index 5d3f7790..00000000 --- a/tests/pipelines/test_r2_artifacts.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Tests for R2 artifact archiving.""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest - -from microplex_us.pipelines.r2_artifacts import ( - R2_ARCHIVE_MANIFEST_FILENAME, - R2ArchiveConfig, - append_archive_index_entry, - build_archive_manifest, - build_r2_object_key, - upload_artifact_manifest_to_r2, -) - - -class MissingObjectError(Exception): - def __init__(self) -> None: - self.response = {"Error": {"Code": "404"}} - super().__init__("missing") - - -class FakeS3Client: - def __init__(self, *, existing_keys: set[str] | None = None) -> None: - self.existing_keys = existing_keys or set() - self.head_calls: list[tuple[str, str]] = [] - self.upload_calls: list[tuple[str, str, str]] = [] - - def head_object(self, *, Bucket: str, Key: str) -> dict[str, object]: - self.head_calls.append((Bucket, Key)) - if Key not in self.existing_keys: - 
raise MissingObjectError() - return {} - - def upload_file(self, filename: str, bucket: str, key: str) -> None: - self.upload_calls.append((filename, bucket, key)) - self.existing_keys.add(key) - - -def test_build_r2_object_key_normalizes_prefix() -> None: - assert ( - build_r2_object_key("/microplex-us/artifacts/", "run-a", "scores.json") - == "microplex-us/artifacts/run-a/scores.json" - ) - - -def test_build_archive_manifest_hashes_files_and_excludes_r2_sidecar( - tmp_path: Path, -) -> None: - artifact_dir = tmp_path / "run-a" - artifact_dir.mkdir() - (artifact_dir / "scores.json").write_text('{"loss": 0.1}\n') - (artifact_dir / "data").mkdir() - (artifact_dir / "data" / "weights.npy").write_bytes(b"weights") - (artifact_dir / R2_ARCHIVE_MANIFEST_FILENAME).write_text("{}") - config = R2ArchiveConfig( - bucket="microplex-artifacts", - endpoint_url="https://example.r2.cloudflarestorage.com", - prefix="experiments", - ) - - manifest = build_archive_manifest(artifact_dir, config) - - assert manifest["artifact_id"] == "run-a" - assert manifest["file_count"] == 2 - files = {entry["path"]: entry for entry in manifest["files"]} - assert files["scores.json"]["summary"] is True - assert files["scores.json"]["object_key"] == "experiments/run-a/scores.json" - assert len(files["scores.json"]["sha256"]) == 64 - assert "r2_archive_manifest.json" not in files - assert files["data/weights.npy"]["summary"] is False - - -def test_upload_artifact_manifest_to_r2_uploads_files_and_sidecar( - tmp_path: Path, -) -> None: - artifact_dir = tmp_path / "run-a" - artifact_dir.mkdir() - (artifact_dir / "scores.json").write_text('{"loss": 0.1}\n') - (artifact_dir / "summary.md").write_text("# Run\n") - config = R2ArchiveConfig( - bucket="microplex-artifacts", - endpoint_url="https://example.r2.cloudflarestorage.com", - prefix="experiments", - ) - client = FakeS3Client() - - manifest = upload_artifact_manifest_to_r2( - artifact_dir, - config, - client=client, - hash_files=False, - ) - - assert 
manifest["status"] == "uploaded" - assert {entry["status"] for entry in manifest["files"]} == {"uploaded"} - uploaded_keys = [key for _, _, key in client.upload_calls] - assert "experiments/run-a/scores.json" in uploaded_keys - assert "experiments/run-a/summary.md" in uploaded_keys - assert "experiments/run-a/r2_archive_manifest.json" in uploaded_keys - local_manifest = json.loads( - (artifact_dir / R2_ARCHIVE_MANIFEST_FILENAME).read_text() - ) - assert local_manifest["r2"]["bucket"] == "microplex-artifacts" - - -def test_upload_artifact_manifest_to_r2_skips_existing_objects( - tmp_path: Path, -) -> None: - artifact_dir = tmp_path / "run-a" - artifact_dir.mkdir() - (artifact_dir / "scores.json").write_text('{"loss": 0.1}\n') - config = R2ArchiveConfig( - bucket="microplex-artifacts", - endpoint_url="https://example.r2.cloudflarestorage.com", - prefix="experiments", - ) - client = FakeS3Client(existing_keys={"experiments/run-a/scores.json"}) - - manifest = upload_artifact_manifest_to_r2( - artifact_dir, - config, - client=client, - hash_files=False, - ) - - assert manifest["files"][0]["status"] == "already_exists" - uploaded_keys = [key for _, _, key in client.upload_calls] - assert uploaded_keys == ["experiments/run-a/r2_archive_manifest.json"] - - -def test_append_archive_index_entry_records_compact_upload( - tmp_path: Path, -) -> None: - artifact_dir = tmp_path / "run-a" - artifact_dir.mkdir() - (artifact_dir / "scores.json").write_text('{"loss": 0.1}\n') - config = R2ArchiveConfig( - bucket="microplex-artifacts", - endpoint_url="https://example.r2.cloudflarestorage.com", - prefix="experiments", - ) - manifest = build_archive_manifest(artifact_dir, config, hash_files=False) - - index_path = append_archive_index_entry( - tmp_path / "r2_archive_index.jsonl", - manifest, - pruned_local=True, - ) - - rows = [json.loads(line) for line in index_path.read_text().splitlines()] - assert rows == [ - { - "recorded_at": rows[0]["recorded_at"], - "artifact_id": "run-a", - 
"artifact_dir": str(artifact_dir.resolve()), - "bucket": "microplex-artifacts", - "prefix": "experiments", - "manifest_object_key": "experiments/run-a/r2_archive_manifest.json", - "file_count": 1, - "total_bytes": 14, - "status": None, - "pruned_local": True, - } - ] - - -def test_r2_archive_config_from_env_uses_account_endpoint( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("MICROPLEX_R2_BUCKET", "microplex-artifacts") - monkeypatch.setenv("CLOUDFLARE_ACCOUNT_ID", "abc123") - monkeypatch.setenv("R2_ACCESS_KEY_ID", "key") - monkeypatch.setenv("R2_SECRET_ACCESS_KEY", "secret") - - config = R2ArchiveConfig.from_env() - - assert config.endpoint_url == "https://abc123.r2.cloudflarestorage.com" - assert config.access_key_id == "key" - assert config.secret_access_key == "secret" diff --git a/tests/pipelines/test_recalibrate_from_checkpoint.py b/tests/pipelines/test_recalibrate_from_checkpoint.py deleted file mode 100644 index 9a7dfcbf..00000000 --- a/tests/pipelines/test_recalibrate_from_checkpoint.py +++ /dev/null @@ -1,379 +0,0 @@ -"""Recalibrate-from-checkpoint helper. - -Loads a post-imputation bundle previously saved by -``save_us_pipeline_checkpoint`` and calls -``pipeline.calibrate_policyengine_tables`` on it. Used by operators to -iterate on calibration config (backend, lambda schedule, targets) -without paying the ~11 h synthesis + donor-imputation cost that -produced the bundle. - -These tests drive: - -1. The helper loads a post-imputation checkpoint and dispatches the - bundle to a fresh pipeline's calibrate method. -2. The helper also accepts post-microsim checkpoints, where materialized - target columns already exist on the bundle. -3. The helper raises a clear error if the checkpoint directory is - missing. 
-""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.pipelines.us import USMicroplexBuildConfig -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - save_us_pipeline_checkpoint, -) - - -def _make_bundle(n: int = 50) -> PolicyEngineUSEntityTableBundle: - rng = np.random.default_rng(0) - household_ids = np.arange(n) + 1 - return PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": household_ids, - "household_weight": rng.uniform(0.5, 2.0, size=n), - } - ), - persons=pd.DataFrame( - { - "person_id": household_ids * 10, - "household_id": household_ids, - "age": rng.integers(0, 85, size=n), - } - ), - ) - - -class TestRecalibrateFromPipelineCheckpoint: - @pytest.mark.parametrize("stage", ["post_imputation", "post_microsim"]) - def test_checkpoint_dispatches_to_calibrate( - self, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - stage: str, - ) -> None: - """Both supported stages load their bundle and dispatch to calibrate. - - For ``post_microsim``, microsim is skipped inside - ``_resolve_policyengine_calibration_targets`` because all - materialized vars are present as columns; for - ``post_imputation``, microsim runs normally. The helper only - orchestrates the load and hand-off, so the parametrized test - covers both paths. 
- """ - from microplex_us.pipelines.us import ( - recalibrate_policyengine_us_from_checkpoint, - ) - - bundle = _make_bundle(n=40) - save_us_pipeline_checkpoint( - bundle, tmp_path / "checkpoint", stage=stage - ) - - observed_tables: list[PolicyEngineUSEntityTableBundle] = [] - - def _fake_calibrate( - self: Any, - tables: PolicyEngineUSEntityTableBundle, - ) -> tuple[PolicyEngineUSEntityTableBundle, pd.DataFrame, dict[str, Any]]: - observed_tables.append(tables) - return ( - tables, - tables.households.assign(weight=tables.households["household_weight"]), - {"mock": True}, - ) - - monkeypatch.setattr( - "microplex_us.pipelines.us.USMicroplexPipeline.calibrate_policyengine_tables", - _fake_calibrate, - ) - - cfg = USMicroplexBuildConfig( - calibration_backend="pe_l0", - policyengine_targets_db=tmp_path / "targets.db", - ) - result = recalibrate_policyengine_us_from_checkpoint(cfg, tmp_path / "checkpoint") - - assert len(observed_tables) == 1 - pd.testing.assert_frame_equal( - observed_tables[0].households, bundle.households - ) - assert result.calibration_summary == {"mock": True} - assert result.loaded_stage == stage - pd.testing.assert_frame_equal( - result.policyengine_tables.households, bundle.households - ) - - def test_unsupported_stage_raises(self, tmp_path: Path) -> None: - """A metadata.json with an unknown stage is rejected.""" - from microplex_us.pipelines.us import ( - recalibrate_policyengine_us_from_checkpoint, - ) - - (tmp_path / "checkpoint").mkdir() - import json - - (tmp_path / "checkpoint" / "metadata.json").write_text( - json.dumps({"format_version": 1, "stage": "bogus"}) - ) - cfg = USMicroplexBuildConfig(policyengine_targets_db=tmp_path / "targets.db") - with pytest.raises(ValueError, match="Cannot resume"): - recalibrate_policyengine_us_from_checkpoint(cfg, tmp_path / "checkpoint") - - def test_missing_checkpoint_raises(self, tmp_path: Path) -> None: - from microplex_us.pipelines.us import ( - recalibrate_policyengine_us_from_checkpoint, - ) 
- - cfg = USMicroplexBuildConfig(policyengine_targets_db=tmp_path / "targets.db") - with pytest.raises(FileNotFoundError): - recalibrate_policyengine_us_from_checkpoint(cfg, tmp_path / "nope") - - -class TestRecalibrateFromCheckpointCli: - def test_prepare_output_root_accepts_existing_empty_directory( - self, - tmp_path: Path, - ) -> None: - from microplex_us.pipelines.pe_us_recalibrate_from_checkpoint import ( - _prepare_output_root, - ) - - output_root = tmp_path / "output" - output_root.mkdir() - - assert _prepare_output_root(output_root) == output_root - assert output_root.is_dir() - assert list(output_root.iterdir()) == [] - - def test_prepare_output_root_rejects_missing_directory( - self, - tmp_path: Path, - ) -> None: - from microplex_us.pipelines.pe_us_recalibrate_from_checkpoint import ( - _prepare_output_root, - ) - - output_root = tmp_path / "output" - - with pytest.raises(FileNotFoundError, match="--output-root does not exist"): - _prepare_output_root(output_root) - assert not output_root.exists() - - def test_prepare_output_root_rejects_unwritable_directory( - self, - tmp_path: Path, - ) -> None: - from microplex_us.pipelines.pe_us_recalibrate_from_checkpoint import ( - _prepare_output_root, - ) - - output_root = tmp_path / "output" - output_root.mkdir() - original_mode = output_root.stat().st_mode - try: - output_root.chmod(0o500) - if os.access(output_root, os.W_OK | os.X_OK): - pytest.skip("current platform still reports chmod 0500 as writable") - with pytest.raises(PermissionError, match="--output-root is not writable"): - _prepare_output_root(output_root) - finally: - output_root.chmod(original_mode) - - def test_main_rejects_output_file_before_recalibration( - self, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - ) -> None: - import microplex_us.pipelines.pe_us_recalibrate_from_checkpoint as cli - - called = False - - def _fail_if_called(*args: Any, **kwargs: Any) -> None: - nonlocal called - called = True - raise 
AssertionError("recalibration should not start") - - monkeypatch.setattr( - cli, - "recalibrate_policyengine_us_from_checkpoint", - _fail_if_called, - ) - output_root = tmp_path / "output" - output_root.write_text("not a directory") - - with pytest.raises(NotADirectoryError, match="--output-root is not a directory"): - cli.main( - [ - "--checkpoint-path", - str(tmp_path / "checkpoint"), - "--output-root", - str(output_root), - "--targets-db", - str(tmp_path / "targets.db"), - ] - ) - - assert called is False - - def test_main_rejects_missing_output_directory_before_recalibration( - self, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - ) -> None: - import microplex_us.pipelines.pe_us_recalibrate_from_checkpoint as cli - - called = False - - def _fail_if_called(*args: Any, **kwargs: Any) -> None: - nonlocal called - called = True - raise AssertionError("recalibration should not start") - - monkeypatch.setattr( - cli, - "recalibrate_policyengine_us_from_checkpoint", - _fail_if_called, - ) - output_root = tmp_path / "output" - - with pytest.raises(FileNotFoundError, match="--output-root does not exist"): - cli.main( - [ - "--checkpoint-path", - str(tmp_path / "checkpoint"), - "--output-root", - str(output_root), - "--targets-db", - str(tmp_path / "targets.db"), - ] - ) - - assert called is False - assert not output_root.exists() - - def test_main_threads_arch_target_options_into_config( - self, - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - ) -> None: - import microplex_us.pipelines.pe_us_recalibrate_from_checkpoint as cli - - observed: dict[str, Any] = {} - - def _fake_recalibrate( - config: USMicroplexBuildConfig, - checkpoint_path: Path, - ) -> Any: - observed["config"] = config - observed["checkpoint_path"] = checkpoint_path - bundle = _make_bundle(n=3) - return type( - "FakeRecalibrateResult", - (), - { - "loaded_stage": "post_imputation", - "calibrated_data": bundle.households.assign( - weight=bundle.households["household_weight"] - ), - 
"policyengine_tables": bundle, - "calibration_summary": {"mock": True}, - }, - )() - - monkeypatch.setattr( - cli, - "recalibrate_policyengine_us_from_checkpoint", - _fake_recalibrate, - ) - export_calls: list[tuple[Any, Path, int | None]] = [] - - def _fake_export( - self: Any, - result: Any, - path: Path, - period: int | None = None, - ) -> Path: - export_calls.append((result, path, period)) - path.write_text("fake h5") - return path - - monkeypatch.setattr( - cli.USMicroplexPipeline, - "export_policyengine_dataset", - _fake_export, - ) - output_root = tmp_path / "output" - output_root.mkdir() - arch_a = tmp_path / "arch-a.jsonl" - arch_b = tmp_path / "arch-b.db" - dataset_output = tmp_path / "policyengine_us.h5" - - assert ( - cli.main( - [ - "--checkpoint-path", - str(tmp_path / "checkpoint"), - "--output-root", - str(output_root), - "--targets-db", - str(tmp_path / "targets.db"), - "--arch-targets-db", - str(arch_a), - "--arch-targets-db", - str(arch_b), - "--target-period", - "2024", - "--target-profile", - "pe_native_broad", - "--calibration-target-source", - "arch", - "--calibration-target-profile", - "pe_native_broad_source_backed", - "--calibration-backend", - "microcalibrate", - "--calibration-max-iter", - "64", - "--policyengine-materialize-batch-size", - "25000", - "--pipeline-checkpoint-save-post-microsim-path", - str(tmp_path / "post-microsim"), - "--policyengine-dataset-output", - str(dataset_output), - ] - ) - == 0 - ) - - config = observed["config"] - assert observed["checkpoint_path"] == tmp_path / "checkpoint" - assert config.arch_targets_db == (str(arch_a), str(arch_b)) - assert config.policyengine_target_period == 2024 - assert config.policyengine_target_profile == "pe_native_broad" - assert config.calibration_target_source == "arch" - assert ( - config.policyengine_calibration_target_profile - == "pe_native_broad_source_backed" - ) - assert config.calibration_backend == "microcalibrate" - assert config.calibration_max_iter == 64 - assert 
config.policyengine_materialize_batch_size == 25000 - assert ( - config.pipeline_checkpoint_save_post_microsim_path - == tmp_path / "post-microsim" - ) - assert (output_root / "calibration_summary.json").exists() - assert len(export_calls) == 1 - assert export_calls[0][1] == dataset_output - assert export_calls[0][2] == 2024 - assert dataset_output.read_text() == "fake h5" diff --git a/tests/pipelines/test_reduced_benchmark.py b/tests/pipelines/test_reduced_benchmark.py deleted file mode 100644 index 4e943552..00000000 --- a/tests/pipelines/test_reduced_benchmark.py +++ /dev/null @@ -1,876 +0,0 @@ -"""Tests for the staged reduced benchmark harness.""" - -from __future__ import annotations - -import json -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import FilterOperator - -from microplex_us.pipelines.performance import ( - USMicroplexPerformanceHarnessConfig, - USMicroplexPerformanceHarnessResult, -) -from microplex_us.pipelines.reduced_benchmark import ( - USMicroplexReducedBenchmarkHarnessConfig, - USMicroplexReducedBenchmarkSpec, - USMicroplexReducedCalibrationReport, - USMicroplexReducedDimensionSpec, - USMicroplexReducedMeasureSpec, - USMicroplexReducedMultiCalibrationReport, - calibrate_and_evaluate_us_reduced_benchmark_specs, - calibrate_and_evaluate_us_reduced_benchmarks, - default_us_atomic_rung0_benchmarks, - default_us_atomic_rung1_benchmarks, - default_us_atomic_rung2_calibration, - default_us_atomic_rung3_calibration, - default_us_atomic_rung4_calibration, - default_us_atomic_rung5_calibration, - evaluate_us_reduced_benchmark, - reduced_benchmark_specs_to_calibration_targets, - reduced_benchmark_to_calibration_targets, - run_us_microplex_reduced_benchmark_harness, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexTargets, -) -from microplex_us.policyengine 
import ( - PolicyEngineUSEntityTableBundle, - build_policyengine_us_time_period_arrays, - write_policyengine_us_time_period_dataset, -) - - -def _sample_bundle( - *, - household_weights: tuple[float, ...], - state_fips: tuple[int, ...], - ages_by_household: tuple[tuple[float, ...], ...], - female_by_household: tuple[tuple[bool, ...], ...] | None = None, - employment_income_by_household: tuple[tuple[float, ...], ...] | None = None, -) -> PolicyEngineUSEntityTableBundle: - household_ids = list(range(1, len(household_weights) + 1)) - households = pd.DataFrame( - { - "household_id": household_ids, - "household_weight": list(household_weights), - "state_fips": list(state_fips), - } - ) - person_rows: list[dict[str, int | float]] = [] - person_id = 10 - female_groups = female_by_household or tuple( - tuple(False for _ in ages) for ages in ages_by_household - ) - employment_groups = employment_income_by_household or tuple( - tuple(0.0 for _ in ages) for ages in ages_by_household - ) - for household_id, ages, female_flags, incomes in zip( - household_ids, - ages_by_household, - female_groups, - employment_groups, - strict=True, - ): - for age, is_female, employment_income in zip( - ages, - female_flags, - incomes, - strict=True, - ): - person_rows.append( - { - "person_id": person_id, - "household_id": household_id, - "tax_unit_id": household_id * 100, - "spm_unit_id": household_id * 1000, - "family_id": household_id * 5000, - "marital_unit_id": household_id * 7000, - "age": age, - "is_female": is_female, - "employment_income_before_lsr": employment_income, - } - ) - person_id += 1 - persons = pd.DataFrame(person_rows) - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - - -def _write_dataset(bundle: PolicyEngineUSEntityTableBundle, path: Path) -> Path: - person_variable_map = {"age": "age"} - if "is_female" in bundle.persons.columns: - 
person_variable_map["is_female"] = "is_female" - if "employment_income_before_lsr" in bundle.persons.columns: - person_variable_map["employment_income_before_lsr"] = ( - "employment_income_before_lsr" - ) - arrays = build_policyengine_us_time_period_arrays( - bundle, - period=2024, - household_variable_map={"state_fips": "state_fips"}, - person_variable_map=person_variable_map, - ) - return write_policyengine_us_time_period_dataset(arrays, path) - - -def test_evaluate_us_reduced_benchmark_compares_weighted_household_counts(tmp_path): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((40.0,), (70.0,)), - ), - tmp_path / "candidate.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="household_count_by_state", - entity="household", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_household_count"), - ), - ) - - report = evaluate_us_reduced_benchmark( - candidate_path, - baseline_path, - spec, - period=2024, - ) - - summary = report.measure_summaries["weighted_household_count"] - assert summary["candidate_total"] == pytest.approx(2.0) - assert summary["baseline_total"] == pytest.approx(3.0) - assert summary["support_recall"] == pytest.approx(1.0) - assert report.top_cell_gaps["weighted_household_count"][0]["state_fips"] == "06" - assert report.top_cell_gaps["weighted_household_count"][0]["delta"] == pytest.approx( - -1.0 - ) - - -def test_evaluate_us_reduced_benchmark_supports_binned_person_counts(tmp_path): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - ), - tmp_path / "baseline.h5", - ) - 
candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((40.0,), (70.0,)), - ), - tmp_path / "candidate.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="person_count_by_state_age", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - USMicroplexReducedDimensionSpec( - variable="age", - label="age_bucket", - bins=(0.0, 18.0, 65.0, 200.0), - bin_labels=("0_to_17", "18_to_64", "65_plus"), - ), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_person_count"), - ), - ) - - report = evaluate_us_reduced_benchmark( - candidate_path, - baseline_path, - spec, - period=2024, - ) - - top_gap = report.top_cell_gaps["weighted_person_count"][0] - assert top_gap["state_fips"] == "06" - assert top_gap["age_bucket"] == "0_to_17" - assert top_gap["delta"] == pytest.approx(-2.0) - assert report.summary["n_dimensions"] == 2 - - -def test_run_us_microplex_reduced_benchmark_harness_wraps_performance_harness( - monkeypatch, - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((40.0,), (70.0,)), - ), - tmp_path / "candidate.h5", - ) - captured: dict[str, USMicroplexPerformanceHarnessConfig] = {} - - def _fake_run_us_microplex_performance_harness( - providers, - *, - config, - queries=None, - **kwargs, - ): - _ = providers - _ = queries - _ = kwargs - captured["config"] = config - build_config = USMicroplexBuildConfig() - build_result = USMicroplexBuildResult( - config=build_config, - seed_data=pd.DataFrame(), - synthetic_data=pd.DataFrame(), - calibrated_data=pd.DataFrame(), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={"backend": 
"policyengine_db_none"}, - policyengine_tables=_sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((40.0,), (70.0,)), - ), - source_frame=None, - source_frames=(), - fusion_plan=None, - ) - return USMicroplexPerformanceHarnessResult( - config=config, - build_config=build_config, - build_result=build_result, - source_names=("stub",), - stage_timings={"write_policyengine_dataset": 0.1}, - total_seconds=0.2, - policyengine_dataset_path=str(candidate_path), - ) - - monkeypatch.setattr( - "microplex_us.pipelines.reduced_benchmark.run_us_microplex_performance_harness", - _fake_run_us_microplex_performance_harness, - ) - spec = USMicroplexReducedBenchmarkSpec( - name="household_count_by_state", - entity="household", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - ), - measures=( - USMicroplexReducedMeasureSpec(name="weighted_household_count"), - ), - ) - output_json = tmp_path / "reduced_harness.json" - result = run_us_microplex_reduced_benchmark_harness( - [SimpleNamespace(descriptor=SimpleNamespace(name="stub"))], - config=USMicroplexReducedBenchmarkHarnessConfig( - performance_config=USMicroplexPerformanceHarnessConfig( - baseline_dataset=baseline_path, - evaluate_parity=True, - evaluate_pe_native_loss=True, - ), - benchmark_specs=(spec,), - output_json_path=output_json, - ), - ) - - assert captured["config"].evaluate_parity is False - assert captured["config"].evaluate_pe_native_loss is False - assert captured["config"].output_policyengine_dataset_path is not None - assert result.candidate_dataset_path == str(candidate_path) - assert "household_count_by_state" in result.benchmark_reports - payload = json.loads(output_json.read_text()) - assert "benchmark_reports" in payload - assert payload["benchmark_reports"]["household_count_by_state"]["summary"]["n_cells"] == 2 - - -def test_evaluate_us_reduced_benchmark_weighted_sum(tmp_path): - """Weighted sum aggregation correctly sums age * weight 
per state.""" - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((40.0,), (70.0,)), - ), - tmp_path / "candidate.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="person_age_sum_by_state", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - ), - measures=( - USMicroplexReducedMeasureSpec( - name="weighted_age_sum", - aggregation="weighted_sum", - variable="age", - ), - ), - ) - - report = evaluate_us_reduced_benchmark( - candidate_path, - baseline_path, - spec, - period=2024, - ) - - summary = report.measure_summaries["weighted_age_sum"] - # Baseline: state 06 has person age 10 (w=2) + age 40 (w=2) = 100, - # state 36 has person age 70 (w=1) = 70 → total 170 - # Candidate: state 06 has person age 40 (w=1) = 40, - # state 36 has person age 70 (w=1) = 70 → total 110 - assert summary["baseline_total"] == pytest.approx(170.0) - assert summary["candidate_total"] == pytest.approx(110.0) - assert summary["total_delta"] == pytest.approx(-60.0) - - -def test_validate_duplicate_dimension_output_names(): - """Duplicate dimension output names are rejected.""" - spec = USMicroplexReducedBenchmarkSpec( - name="bad_spec", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - USMicroplexReducedDimensionSpec( - variable="state_fips", label="state_fips", zero_pad=2 - ), - ), - ) - with pytest.raises(ValueError, match="duplicate dimension output name"): - from microplex_us.pipelines.reduced_benchmark import ( - _validate_reduced_benchmark_spec, - ) - - _validate_reduced_benchmark_spec(spec) - - -def test_evaluate_us_reduced_benchmark_weighted_mean_by_state_sex(tmp_path): - baseline_path = _write_dataset( - 
_sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((120.0, 60.0), (20.0,)), - ), - tmp_path / "candidate.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="employment_income_mean_by_state_sex", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - USMicroplexReducedDimensionSpec(variable="is_female"), - ), - measures=( - USMicroplexReducedMeasureSpec( - name="weighted_employment_income_mean", - aggregation="weighted_mean", - variable="employment_income_before_lsr", - ), - ), - ) - - report = evaluate_us_reduced_benchmark( - candidate_path, - baseline_path, - spec, - period=2024, - ) - - summary = report.measure_summaries["weighted_employment_income_mean"] - assert summary["baseline_nonzero_cell_count"] == 3 - assert summary["candidate_nonzero_cell_count"] == 3 - assert report.top_cell_gaps["weighted_employment_income_mean"][0]["state_fips"] == "36" - assert report.top_cell_gaps["weighted_employment_income_mean"][0]["is_female"] is True - assert report.top_cell_gaps["weighted_employment_income_mean"][0]["delta"] == pytest.approx( - -20.0 - ) - - -def test_evaluate_us_reduced_benchmark_weighted_mean_asymmetric_cells(tmp_path): - """Weighted mean with asymmetric cell coverage produces valid MARE, not NaN.""" - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - 
tmp_path / "baseline.h5", - ) - # Candidate has only state 06 — state 36 cell will be missing. - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0,), - state_fips=(6,), - ages_by_household=((40.0, 20.0),), - female_by_household=((True, False),), - employment_income_by_household=((120.0, 60.0),), - ), - tmp_path / "candidate.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="income_mean_by_state_sex", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - USMicroplexReducedDimensionSpec(variable="is_female"), - ), - measures=( - USMicroplexReducedMeasureSpec( - name="weighted_employment_income_mean", - aggregation="weighted_mean", - variable="employment_income_before_lsr", - ), - ), - ) - - report = evaluate_us_reduced_benchmark( - candidate_path, - baseline_path, - spec, - period=2024, - ) - - mare = report.measure_summaries["weighted_employment_income_mean"] - assert not np.isnan(mare["mean_abs_relative_error"]) - assert not np.isnan(mare["max_abs_relative_error"]) - assert not np.isnan(report.summary["mean_measure_mare"]) - # The missing (36, True) cell should surface in top gaps with NaN candidate. 
- assert mare["n_cells"] == 3 - assert mare["shared_nonzero_cell_count"] == 2 - - -def test_default_us_atomic_rung1_benchmarks_returns_expected_specs(): - specs = default_us_atomic_rung1_benchmarks() - assert [spec.name for spec in specs] == [ - "person_count_by_state_sex", - "employment_income_sum_by_state", - "employment_income_mean_by_state_sex", - ] - - -def test_reduced_benchmark_to_calibration_targets_emits_state_count_targets(tmp_path): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0,), (70.0,)), - ), - tmp_path / "baseline.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="household_count_by_state", - entity="household", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - ), - measures=(USMicroplexReducedMeasureSpec(name="weighted_household_count"),), - ) - - targets = reduced_benchmark_to_calibration_targets(spec, baseline_path, period=2024) - - assert len(targets) == 2 - assert all(target.entity is EntityType.HOUSEHOLD for target in targets) - assert all(target.aggregation.value == "count" for target in targets) - assert {target.value for target in targets} == {1.0, 2.0} - state_filters = { - target.filters[0].value: target.filters[0].operator for target in targets - } - assert state_filters == {6: FilterOperator.EQ, 36: FilterOperator.EQ} - - -def test_reduced_benchmark_to_calibration_targets_rejects_non_count_measures(tmp_path): - """weighted_sum / weighted_mean specs are rejected for calibration targets.""" - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0,), (70.0,)), - employment_income_by_household=((100.0,), (40.0,)), - ), - tmp_path / "baseline.h5", - ) - spec = USMicroplexReducedBenchmarkSpec( - name="income_sum_by_state", - entity="person", - dimensions=( - USMicroplexReducedDimensionSpec(variable="state_fips", zero_pad=2), - ), - 
measures=( - USMicroplexReducedMeasureSpec( - name="weighted_income_sum", - aggregation="weighted_sum", - variable="employment_income_before_lsr", - ), - ), - ) - with pytest.raises(ValueError, match="weighted_count measures only"): - reduced_benchmark_to_calibration_targets(spec, baseline_path, period=2024) - - -def test_default_us_atomic_rung2_calibration_returns_expected_structure(): - """Rung 2 returns household_count_by_state calibration spec and rung 0+1 evaluation specs.""" - calibration_spec, evaluation_specs = default_us_atomic_rung2_calibration() - assert calibration_spec.name == "household_count_by_state" - assert calibration_spec.entity == "household" - assert len(calibration_spec.measures) == 1 - assert calibration_spec.measures[0].aggregation == "weighted_count" - - rung0_names = {spec.name for spec in default_us_atomic_rung0_benchmarks()} - rung1_names = {spec.name for spec in default_us_atomic_rung1_benchmarks()} - eval_names = {spec.name for spec in evaluation_specs} - assert eval_names == rung0_names | rung1_names - - -def test_default_us_atomic_rung3_calibration_returns_expected_structure(): - """Rung 3 returns person_count_by_state_age calibration spec and rung 0+1 evaluation specs.""" - - calibration_spec, evaluation_specs = default_us_atomic_rung3_calibration() - assert calibration_spec.name == "person_count_by_state_age" - assert calibration_spec.entity == "person" - assert len(calibration_spec.measures) == 1 - assert calibration_spec.measures[0].aggregation == "weighted_count" - - rung0 = default_us_atomic_rung0_benchmarks() - rung0_names = {spec.name for spec in rung0} - rung1_names = {spec.name for spec in default_us_atomic_rung1_benchmarks()} - eval_names = {spec.name for spec in evaluation_specs} - assert eval_names == rung0_names | rung1_names - assert calibration_spec.name in eval_names - - -def test_default_us_atomic_rung4_calibration_returns_expected_structure(): - """Rung 4 returns person_count_by_age_employment_income_bucket and 
rung 0+1 evaluation specs.""" - - calibration_spec, evaluation_specs = default_us_atomic_rung4_calibration() - assert calibration_spec.name == "person_count_by_age_employment_income_bucket" - assert calibration_spec.entity == "person" - assert len(calibration_spec.measures) == 1 - assert calibration_spec.measures[0].aggregation == "weighted_count" - assert [dimension.output_name for dimension in calibration_spec.dimensions] == [ - "age_bucket", - "employment_income_bucket", - ] - - rung0_names = {spec.name for spec in default_us_atomic_rung0_benchmarks()} - rung1_names = {spec.name for spec in default_us_atomic_rung1_benchmarks()} - eval_names = {spec.name for spec in evaluation_specs} - assert eval_names == rung0_names | rung1_names - - -def test_default_us_atomic_rung5_calibration_returns_expected_structure(): - """Rung 5 jointly calibrates age-state and age-income person counts.""" - - calibration_specs, evaluation_specs = default_us_atomic_rung5_calibration() - calibration_names = [spec.name for spec in calibration_specs] - assert calibration_names == [ - "person_count_by_state_age", - "person_count_by_age_employment_income_bucket", - ] - assert all(spec.entity == "person" for spec in calibration_specs) - assert all(spec.measures[0].aggregation == "weighted_count" for spec in calibration_specs) - - rung0_names = {spec.name for spec in default_us_atomic_rung0_benchmarks()} - rung1_names = {spec.name for spec in default_us_atomic_rung1_benchmarks()} - eval_names = {spec.name for spec in evaluation_specs} - assert eval_names == rung0_names | rung1_names | { - "person_count_by_age_employment_income_bucket" - } - - -def test_reduced_benchmark_to_calibration_targets_emits_age_income_bucket_filters( - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "baseline.h5", - ) - 
calibration_spec, _ = default_us_atomic_rung4_calibration() - - targets = reduced_benchmark_to_calibration_targets( - calibration_spec, - baseline_path, - period=2024, - ) - - assert len(targets) == 3 - zero_or_less_target = next( - target - for target in targets - if "age_bucket=0_to_17" in target.name - and "employment_income_bucket=zero_or_less" in target.name - ) - assert zero_or_less_target.entity is EntityType.PERSON - assert zero_or_less_target.value == pytest.approx(2.0) - assert [(item.feature, item.operator, item.value) for item in zero_or_less_target.filters] == [ - ("age", FilterOperator.GTE, 0.0), - ("age", FilterOperator.LT, 18.0), - ("employment_income_before_lsr", FilterOperator.GTE, -1_000_000_000.0), - ("employment_income_before_lsr", FilterOperator.LT, 0.01), - ] - - -def test_reduced_benchmark_specs_to_calibration_targets_tracks_counts_by_spec(tmp_path): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "baseline.h5", - ) - calibration_specs, _ = default_us_atomic_rung5_calibration() - - targets, target_counts = reduced_benchmark_specs_to_calibration_targets( - calibration_specs, - baseline_path, - period=2024, - ) - - assert len(targets) == sum(target_counts.values()) - assert target_counts == { - "person_count_by_state_age": 3, - "person_count_by_age_employment_income_bucket": 3, - } - - -def test_calibrate_and_evaluate_us_reduced_benchmarks_improves_state_count_surface( - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - 
_sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - tmp_path / "candidate.h5", - ) - calibration_spec, evaluation_specs = default_us_atomic_rung2_calibration() - output_path = tmp_path / "reweighted.h5" - - report = calibrate_and_evaluate_us_reduced_benchmarks( - candidate_path, - baseline_path, - calibration_spec, - evaluation_specs=(evaluation_specs[0], evaluation_specs[1]), - period=2024, - output_reweighted_dataset_path=output_path, - ) - - assert isinstance(report, USMicroplexReducedCalibrationReport) - assert report.reweighting_summary["constraint_count"] == 2 - assert report.reweighted_dataset_path == str(output_path.resolve()) - assert output_path.exists() - state_spec_name = evaluation_specs[0].name - age_spec_name = evaluation_specs[1].name - assert ( - report.benchmark_deltas[state_spec_name]["post_mean_measure_mare"] - < report.benchmark_deltas[state_spec_name]["pre_mean_measure_mare"] - ) - assert ( - report.benchmark_deltas[age_spec_name]["post_mean_measure_mare"] - < report.benchmark_deltas[age_spec_name]["pre_mean_measure_mare"] - ) - - -def test_calibrate_and_evaluate_us_reduced_benchmarks_materializes_household_state_for_person_targets( - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((100.0, 80.0), (40.0,)), - ), - tmp_path / "candidate.h5", - ) - calibration_spec, evaluation_specs = 
default_us_atomic_rung3_calibration() - - report = calibrate_and_evaluate_us_reduced_benchmarks( - candidate_path, - baseline_path, - calibration_spec, - evaluation_specs=(evaluation_specs[1],), - period=2024, - ) - - assert report.target_count > 0 - assert report.reweighting_summary["constraint_count"] > 0 - skipped = report.reweighting_summary["skipped_targets"] - assert not any(reason == "missing_features:state_fips" for _, reason in skipped) - - -def test_calibrate_and_evaluate_us_reduced_benchmarks_improves_age_income_bucket_surface( - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "candidate.h5", - ) - calibration_spec, _ = default_us_atomic_rung4_calibration() - - report = calibrate_and_evaluate_us_reduced_benchmarks( - candidate_path, - baseline_path, - calibration_spec, - evaluation_specs=(calibration_spec,), - period=2024, - ) - - assert report.target_count > 0 - assert report.reweighting_summary["constraint_count"] == 3 - spec_name = calibration_spec.name - assert ( - report.benchmark_deltas[spec_name]["post_mean_measure_mare"] - < report.benchmark_deltas[spec_name]["pre_mean_measure_mare"] - ) - - -def test_calibrate_and_evaluate_us_reduced_benchmark_specs_improves_joint_surfaces( - tmp_path, -): - baseline_path = _write_dataset( - _sample_bundle( - household_weights=(2.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - 
employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "baseline.h5", - ) - candidate_path = _write_dataset( - _sample_bundle( - household_weights=(1.0, 1.0), - state_fips=(6, 36), - ages_by_household=((10.0, 40.0), (70.0,)), - female_by_household=((True, False), (True,)), - employment_income_by_household=((-5.0, 8_000.0), (60_000.0,)), - ), - tmp_path / "candidate.h5", - ) - calibration_specs, evaluation_specs = default_us_atomic_rung5_calibration() - - report = calibrate_and_evaluate_us_reduced_benchmark_specs( - candidate_path, - baseline_path, - calibration_specs, - evaluation_specs=(evaluation_specs[1], evaluation_specs[-1]), - period=2024, - ) - - assert isinstance(report, USMicroplexReducedMultiCalibrationReport) - assert report.target_count == 6 - assert report.calibration_target_counts == { - "person_count_by_state_age": 3, - "person_count_by_age_employment_income_bucket": 3, - } - for spec_name in ("person_count_by_state_age", "person_count_by_age_employment_income_bucket"): - assert ( - report.benchmark_deltas[spec_name]["post_mean_measure_mare"] - < report.benchmark_deltas[spec_name]["pre_mean_measure_mare"] - ) diff --git a/tests/pipelines/test_regime_aware_donor_imputer.py b/tests/pipelines/test_regime_aware_donor_imputer.py deleted file mode 100644 index bacdbc47..00000000 --- a/tests/pipelines/test_regime_aware_donor_imputer.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Regime-aware donor imputer integration for v9. - -v7 had a `y > 0` bug that dropped negative training rows — fixed -minimally in v8 (commit 8c88277) by relabelling the gate to `y != 0`. -v8's fix makes the QRF see both signs, but it fits ONE QRF over mixed -positive and negative training rows, which allows predictions to land -in the interior band (``max(train_negatives)``, ``min(train_positives)``) -— a region no real record occupies. 
- -v9 upgrades to canonical `microimpute.Imputer`, which at fit time -auto-detects the three-sign regime per target and routes -predictions through separate positive and negative QRFs. The -interior-band gap becomes a structural guarantee, not a statistical -averaging hope. - -Downstream integration lives under a new `--donor-imputer-backend -regime_aware` option; the existing `qrf` and `zi_qrf` backends stay -unchanged for regression comparison. - -Tests pin: - -1. The new backend value resolves through the factory to a donor - imputer that uses canonical regime-gated microimpute internally. -2. On a three-sign training fixture, predictions preserve negatives - (as v8's `y != 0` fix already does). -3. On the same fixture, predictions NEVER land in the interior band - between the positive and negative training regimes — the upgrade - v9 provides over v8. -""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest - -pytest.importorskip("quantile_forest") -pytest.importorskip("microimpute") - -from microimpute import Imputer as CanonicalMicroImputer -from microimpute.models.regime_gated import REGIME_THREE_SIGN - - -def _three_sign_frame_with_gap(n: int = 1500, seed: int = 0) -> pd.DataFrame: - """Fixture with a hard gap between positive and negative training values. - - Positives live in [100, ∞), negatives in (-∞, -100], zeros exactly - at 0. Any prediction that lands in (-100, 100) excluding zero is - an "interior-band violation" — the test metric for the tripartite - advantage. - """ - rng = np.random.default_rng(seed) - age = rng.integers(18, 80, size=n).astype(float) - is_female = rng.integers(0, 2, size=n).astype(float) - - # Three-way regime assignment driven by (age, is_female). 
- logit_pos = -0.3 + 0.04 * (age - 50) - logit_neg = 0.3 - 0.04 * (age - 50) - logit_zero = 0.2 * (1 - is_female) - logits = np.stack([logit_neg, logit_zero, logit_pos], axis=1) - logits -= logits.max(axis=1, keepdims=True) - probs = np.exp(logits) - probs /= probs.sum(axis=1, keepdims=True) - u = rng.random(n) - cum = np.cumsum(probs, axis=1) - regime_idx = (cum >= u[:, None]).argmax(axis=1) - - y = np.zeros(n) - pos_mask = regime_idx == 2 - neg_mask = regime_idx == 0 - y[pos_mask] = 100.0 + rng.exponential(250, size=pos_mask.sum()) - y[neg_mask] = -(100.0 + rng.exponential(250, size=neg_mask.sum())) - - return pd.DataFrame( - { - "age": age, - "is_female": is_female, - "short_term_capital_gains": y, - } - ) - - -def _count_interior_violations( - predictions: np.ndarray, band: float = 100.0, atol: float = 1e-6 -) -> int: - """Count predictions in the (-band, band) interior, excluding exact zero.""" - interior = (np.abs(predictions) < band) & (np.abs(predictions) > atol) - return int(interior.sum()) - - -class TestRegimeAwareDonorImputerClassExists: - """The new donor imputer must be importable from microplex_us.pipelines.us.""" - - def test_canonical_microimpute_api_is_required(self) -> None: - imputer = CanonicalMicroImputer() - assert imputer.signregime is True - assert REGIME_THREE_SIGN == "THREE_SIGN" - - def test_importable_from_us_module(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - assert RegimeAwareDonorImputer is not None - - -class TestRegimeAwareBackendFactory: - """`_build_donor_imputer(backend='regime_aware')` returns the new class.""" - - def test_factory_dispatches_to_regime_aware(self) -> None: - from microplex_us.pipelines.us import ( - RegimeAwareDonorImputer, - USMicroplexBuildConfig, - USMicroplexPipeline, - ) - - config = USMicroplexBuildConfig( - donor_imputer_backend="regime_aware", - donor_imputer_qrf_n_estimators=25, - ) - pipeline = USMicroplexPipeline(config=config) - imputer = 
pipeline._build_donor_imputer( - condition_vars=["is_female", "cps_race"], - target_vars=("qualified_dividend_income", "age"), - ) - assert isinstance(imputer, RegimeAwareDonorImputer) - - -class TestRegimeAwareFitGenerate: - """Fit/generate contract and tripartite-specific guarantees.""" - - def test_qrf_budget_reaches_microimpute_base(self, monkeypatch) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - captured: dict[str, object] = {} - - class FakeQRF: - def __init__(self, *args, **kwargs): - captured["init_args"] = args - captured["init_kwargs"] = kwargs - - def fit(self, *args, **kwargs): - captured["fit_args"] = args - captured["fit_kwargs"] = kwargs - return self - - monkeypatch.setattr("microimpute.models.qrf.QRF", FakeQRF) - - train = pd.DataFrame( - { - "age": [25.0, 35.0, 45.0, 55.0] * 10, - "income_leaf": [100.0, 200.0, 300.0, 400.0] * 10, - } - ) - imputer = RegimeAwareDonorImputer( - condition_vars=["age"], - target_vars=["income_leaf"], - n_estimators=7, - max_train_samples=17, - ) - imputer.fit(train) - - assert captured["init_kwargs"]["max_train_samples"] == 17 - assert captured["fit_kwargs"]["n_estimators"] == 7 - assert captured["fit_kwargs"]["n_jobs"] == -1 - - def test_multi_target_fit_uses_one_chained_regime_gated_imputer(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - rng = np.random.default_rng(20260606) - n = 300 - age = rng.integers(18, 80, size=n).astype(float) - first = rng.normal(loc=age * 400.0, scale=2_000.0, size=n) - second = 0.75 * first + rng.normal(scale=250.0, size=n) - train = pd.DataFrame( - { - "age": age, - "first_income_leaf": first, - "second_income_leaf": second, - } - ) - - imputer = RegimeAwareDonorImputer( - condition_vars=["age"], - target_vars=["first_income_leaf", "second_income_leaf"], - n_estimators=25, - ) - imputer.fit(train) - - first_fitted = imputer._fitted["first_income_leaf"] - second_fitted = imputer._fitted["second_income_leaf"] - assert 
first_fitted is second_fitted - - second_bundle = second_fitted._per_variable["second_income_leaf"] - assert second_bundle["predictors"] == ["age", "first_income_leaf"] - - def test_target_predictor_overlap_is_owned_by_sequential_chain(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - rng = np.random.default_rng(2026060601) - n = 300 - age = rng.integers(18, 80, size=n).astype(float) - first = rng.normal(loc=age * 300.0, scale=1_000.0, size=n) - second = 0.5 * first + rng.normal(scale=250.0, size=n) - train = pd.DataFrame( - { - "age": age, - "first_income_leaf": first, - "second_income_leaf": second, - } - ) - - imputer = RegimeAwareDonorImputer( - condition_vars=["age", "first_income_leaf"], - target_vars=["first_income_leaf", "second_income_leaf"], - n_estimators=25, - ) - imputer.fit(train) - - fitted = imputer._fitted["first_income_leaf"] - first_bundle = fitted._per_variable["first_income_leaf"] - second_bundle = fitted._per_variable["second_income_leaf"] - assert first_bundle["predictors"] == ["age"] - assert second_bundle["predictors"] == ["age", "first_income_leaf"] - - conditions = pd.DataFrame({"age": [25.0, 45.0, 65.0]}) - synthetic = imputer.generate(conditions, seed=20260606) - assert list(synthetic.columns) == [ - "age", - "first_income_leaf", - "second_income_leaf", - ] - assert ( - synthetic[["first_income_leaf", "second_income_leaf"]].notna().all().all() - ) - - def test_duplicate_input_columns_are_collapsed_before_microimpute(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - rng = np.random.default_rng(2026060602) - n = 300 - age = rng.integers(18, 80, size=n).astype(float) - first = rng.normal(loc=age * 300.0, scale=1_000.0, size=n) - second = 0.5 * first + rng.normal(scale=250.0, size=n) - train = pd.DataFrame( - np.column_stack([age, first, first, second]), - columns=[ - "age", - "first_income_leaf", - "first_income_leaf", - "second_income_leaf", - ], - ) - assert not 
train.columns.is_unique - - imputer = RegimeAwareDonorImputer( - condition_vars=["age", "first_income_leaf"], - target_vars=[ - "first_income_leaf", - "first_income_leaf", - "second_income_leaf", - ], - n_estimators=25, - ) - imputer.fit(train) - - assert imputer._fitted_columns == ( - "first_income_leaf", - "second_income_leaf", - ) - fitted = imputer._fitted["first_income_leaf"] - first_bundle = fitted._per_variable["first_income_leaf"] - second_bundle = fitted._per_variable["second_income_leaf"] - assert first_bundle["predictors"] == ["age"] - assert second_bundle["predictors"] == ["age", "first_income_leaf"] - - conditions = pd.DataFrame( - np.column_stack([[25.0, 45.0, 65.0], [26.0, 46.0, 66.0]]), - columns=["age", "age"], - ) - synthetic = imputer.generate(conditions, seed=20260606) - assert list(synthetic.columns) == [ - "age", - "first_income_leaf", - "second_income_leaf", - ] - assert synthetic.columns.is_unique - assert ( - synthetic[["first_income_leaf", "second_income_leaf"]].notna().all().all() - ) - - def test_nonnumeric_targets_do_not_require_numeric_regimes(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - rng = np.random.default_rng(2026060603) - n = 300 - age = rng.integers(18, 80, size=n).astype(float) - income = rng.normal(loc=age * 250.0, scale=1_000.0, size=n) - train = pd.DataFrame( - { - "age": age, - "self_employment_income": income, - "business_is_sstb": income > np.median(income), - } - ) - - imputer = RegimeAwareDonorImputer( - condition_vars=["age"], - target_vars=["self_employment_income", "business_is_sstb"], - n_estimators=25, - ) - imputer.fit(train) - - assert "self_employment_income" in imputer._regimes - assert "business_is_sstb" not in imputer._regimes - - conditions = pd.DataFrame({"age": [25.0, 45.0, 65.0]}) - synthetic = imputer.generate(conditions, seed=20260606) - assert list(synthetic.columns) == [ - "age", - "self_employment_income", - "business_is_sstb", - ] - assert ( - 
synthetic[["self_employment_income", "business_is_sstb"]] - .notna() - .all() - .all() - ) - - def _fit_generate( - self, n_train: int = 1500, n_gen: int = 2000, seed: int = 0 - ) -> np.ndarray: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - train = _three_sign_frame_with_gap(n=n_train, seed=seed) - # Precondition: fixture genuinely three-sign. - y = train["short_term_capital_gains"].to_numpy() - assert (y > 100).sum() > 100 - assert (y < -100).sum() > 100 - assert (y == 0).sum() > 100 - - imputer = RegimeAwareDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["short_term_capital_gains"], - n_estimators=25, - ) - imputer.fit(train) - - rng = np.random.default_rng(42) - conditions = pd.DataFrame( - { - "age": rng.integers(18, 80, size=n_gen).astype(float), - "is_female": rng.integers(0, 2, size=n_gen).astype(float), - } - ) - synthetic = imputer.generate(conditions, seed=42) - return synthetic["short_term_capital_gains"].to_numpy() - - def test_generates_negative_predictions(self) -> None: - """Drop-negatives bug must not recur under regime-aware path.""" - synth_y = self._fit_generate() - n_neg = int((synth_y < 0).sum()) - assert n_neg > 0, ( - "Regime-aware donor imputer produced no negatives on a " - "three-sign training fixture — regression." - ) - assert n_neg / len(synth_y) > 0.05 - - def test_generates_positive_predictions(self) -> None: - synth_y = self._fit_generate() - n_pos = int((synth_y > 0).sum()) - assert n_pos / len(synth_y) > 0.05 - - def test_generates_zero_predictions(self) -> None: - synth_y = self._fit_generate() - n_zero = int((np.abs(synth_y) < 1e-6).sum()) - assert n_zero > 0, "Gate must emit some exact zeros." - - def test_no_interior_band_violations(self) -> None: - """Core v9 advantage over v8. - - v8's `y != 0` fix keeps negatives but fits ONE QRF over mixed - pos+neg training rows, so predictions can interpolate into the - (-100, 100) interior band. 
v9's regime-aware path fits - separate positive and negative QRFs and routes through a - three-way gate, so the interior is empty by construction. - """ - synth_y = self._fit_generate() - violations = _count_interior_violations(synth_y, band=100.0) - assert violations == 0, ( - f"Regime-aware imputer produced {violations} predictions in " - f"the (-100, 100) interior band, which should be empty by " - f"construction. Sample offenders: " - f"{sorted(synth_y[(np.abs(synth_y) < 100) & (np.abs(synth_y) > 1e-6)][:10])}" - ) - - def test_same_seed_repeats_identically(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - train = _three_sign_frame_with_gap(n=1200, seed=3) - conditions = train[["age", "is_female"]].head(300).reset_index(drop=True) - imputer = RegimeAwareDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["short_term_capital_gains"], - n_estimators=25, - ) - imputer.fit(train) - - first = imputer.generate(conditions, seed=123)[ - "short_term_capital_gains" - ].to_numpy() - second = imputer.generate(conditions, seed=123)[ - "short_term_capital_gains" - ].to_numpy() - third = imputer.generate(conditions, seed=999)[ - "short_term_capital_gains" - ].to_numpy() - - np.testing.assert_array_equal(first, second) - assert not np.array_equal(first, third) - - def test_same_seed_repeats_identically_for_multiple_targets(self) -> None: - from microplex_us.pipelines.us import RegimeAwareDonorImputer - - train = _three_sign_frame_with_gap(n=1200, seed=4) - train["rental_income"] = -0.5 * train["short_term_capital_gains"] - conditions = train[["age", "is_female"]].head(300).reset_index(drop=True) - imputer = RegimeAwareDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["short_term_capital_gains", "rental_income"], - n_estimators=25, - ) - imputer.fit(train) - - first = imputer.generate(conditions, seed=456) - second = imputer.generate(conditions, seed=456) - third = imputer.generate(conditions, seed=654) - - for 
column in ("short_term_capital_gains", "rental_income"): - np.testing.assert_array_equal( - first[column].to_numpy(), second[column].to_numpy() - ) - assert not np.array_equal( - first[column].to_numpy(), third[column].to_numpy() - ) diff --git a/tests/pipelines/test_registry.py b/tests/pipelines/test_registry.py deleted file mode 100644 index 1060dae4..00000000 --- a/tests/pipelines/test_registry.py +++ /dev/null @@ -1,437 +0,0 @@ -"""Tests for the persistent US microplex run registry.""" - -import json - -from microplex_us.pipelines.registry import ( - append_us_microplex_run_registry_entry, - build_us_microplex_run_registry_entry, - load_us_microplex_run_registry, -) - - -def _manifest( - *, - created_at: str, - synthesis_backend: str, - calibration_backend: str, - candidate_error: float, - baseline_error: float, - delta: float, - full_oracle_error: float | None = None, - full_oracle_capped_error: float | None = None, - candidate_composite_loss: float | None = None, - baseline_composite_loss: float | None = None, - composite_delta: float | None = None, - calibration_converged: bool = True, - weight_collapse_suspected: bool = False, - candidate_native_loss: float | None = None, - baseline_native_loss: float | None = None, - native_delta: float | None = None, -) -> dict: - resolved_candidate_composite_loss = ( - candidate_error if candidate_composite_loss is None else candidate_composite_loss - ) - resolved_baseline_composite_loss = ( - baseline_error if baseline_composite_loss is None else baseline_composite_loss - ) - resolved_composite_delta = ( - resolved_candidate_composite_loss - resolved_baseline_composite_loss - if composite_delta is None - else composite_delta - ) - manifest = { - "created_at": created_at, - "config": { - "synthesis_backend": synthesis_backend, - "calibration_backend": calibration_backend, - "random_seed": 42, - }, - "rows": {"seed": 10, "synthetic": 20, "calibrated": 20}, - "weights": {"nonzero": 20, "total": 1000.0}, - "synthesis": 
{"source_names": ["cps", "puf"]}, - "calibration": { - "converged": calibration_converged, - "weight_collapse_suspected": weight_collapse_suspected, - "full_oracle_capped_mean_abs_relative_error": ( - candidate_error - if full_oracle_capped_error is None - else full_oracle_capped_error - ), - "full_oracle_mean_abs_relative_error": ( - candidate_error if full_oracle_error is None else full_oracle_error - ), - }, - "policyengine_harness": { - "candidate_mean_abs_relative_error": candidate_error, - "baseline_mean_abs_relative_error": baseline_error, - "mean_abs_relative_error_delta": delta, - "candidate_composite_parity_loss": resolved_candidate_composite_loss, - "baseline_composite_parity_loss": resolved_baseline_composite_loss, - "composite_parity_loss_delta": resolved_composite_delta, - "slice_win_rate": 1.0 if delta < 0 else 0.0, - "target_win_rate": 1.0 if delta < 0 else 0.0, - "supported_target_rate": 1.0, - "tag_summaries": { - "national": { - "candidate_mean_abs_relative_error": candidate_error, - "baseline_mean_abs_relative_error": baseline_error, - "mean_abs_relative_error_delta": delta, - "candidate_composite_parity_loss": resolved_candidate_composite_loss, - "baseline_composite_parity_loss": resolved_baseline_composite_loss, - "composite_parity_loss_delta": resolved_composite_delta, - "slice_win_rate": 1.0 if delta < 0 else 0.0, - "target_win_rate": 1.0 if delta < 0 else 0.0, - "supported_target_rate": 1.0, - } - }, - "parity_scorecard": { - "overall": { - "candidate_mean_abs_relative_error": candidate_error, - "baseline_mean_abs_relative_error": baseline_error, - "mean_abs_relative_error_delta": delta, - "candidate_composite_parity_loss": resolved_candidate_composite_loss, - "baseline_composite_parity_loss": resolved_baseline_composite_loss, - "composite_parity_loss_delta": resolved_composite_delta, - "slice_win_rate": 1.0 if delta < 0 else 0.0, - "target_win_rate": 1.0 if delta < 0 else 0.0, - "supported_target_rate": 1.0, - "candidate_beats_baseline": 
delta < 0, - }, - "national": { - "candidate_mean_abs_relative_error": candidate_error, - "baseline_mean_abs_relative_error": baseline_error, - "mean_abs_relative_error_delta": delta, - "candidate_composite_parity_loss": resolved_candidate_composite_loss, - "baseline_composite_parity_loss": resolved_baseline_composite_loss, - "composite_parity_loss_delta": resolved_composite_delta, - "slice_win_rate": 1.0 if delta < 0 else 0.0, - "target_win_rate": 1.0 if delta < 0 else 0.0, - "supported_target_rate": 1.0, - "candidate_beats_baseline": delta < 0, - }, - }, - }, - } - if candidate_native_loss is not None: - resolved_baseline_native_loss = ( - baseline_native_loss if baseline_native_loss is not None else baseline_error - ) - resolved_native_delta = ( - native_delta - if native_delta is not None - else candidate_native_loss - resolved_baseline_native_loss - ) - manifest["policyengine_native_scores"] = { - "candidate_enhanced_cps_native_loss": candidate_native_loss, - "baseline_enhanced_cps_native_loss": resolved_baseline_native_loss, - "enhanced_cps_native_loss_delta": resolved_native_delta, - "candidate_beats_baseline": resolved_native_delta < 0, - "candidate_unweighted_msre": candidate_native_loss, - "baseline_unweighted_msre": resolved_baseline_native_loss, - "unweighted_msre_delta": resolved_native_delta, - } - return manifest - - -def _harness_payload() -> dict: - return { - "metadata": { - "baseline_dataset": "enhanced_cps_2024.h5", - "targets_db": "policy_data.db", - "target_period": 2024, - "target_variables": ["snap", "household_count"], - "target_domains": ["snap"], - "target_geo_levels": ["state"], - "target_reform_id": 0, - "policyengine_us_runtime_version": "1.587.0", - } - } - - -def test_append_and_load_us_microplex_run_registry(tmp_path): - registry_path = tmp_path / "runs.jsonl" - - first_entry = build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "v1", - manifest_path=tmp_path / "v1" / "manifest.json", - manifest=_manifest( - 
created_at="2026-03-25T12:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.20, - baseline_error=0.30, - delta=-0.10, - ), - policyengine_harness_path=tmp_path / "v1" / "policyengine_harness.json", - policyengine_harness_payload=_harness_payload(), - metadata={"git_commit": "abc123"}, - ) - recorded_first = append_us_microplex_run_registry_entry(registry_path, first_entry) - - second_entry = build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "v2", - manifest_path=tmp_path / "v2" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T13:00:00+00:00", - synthesis_backend="synthesizer", - calibration_backend="entropy", - candidate_error=0.25, - baseline_error=0.30, - delta=-0.05, - ), - policyengine_harness_path=tmp_path / "v2" / "policyengine_harness.json", - policyengine_harness_payload=_harness_payload(), - metadata={"git_commit": "def456"}, - ) - recorded_second = append_us_microplex_run_registry_entry(registry_path, second_entry) - - entries = load_us_microplex_run_registry(registry_path) - - assert len(entries) == 2 - assert entries[0].artifact_id == "v1" - assert entries[0].source_names == ("cps", "puf") - assert entries[0].calibration_converged is True - assert entries[0].weight_collapse_suspected is False - assert entries[0].target_variables == ("snap", "household_count") - assert entries[0].policyengine_us_runtime_version == "1.587.0" - assert entries[0].supported_target_rate == 1.0 - assert entries[0].tag_summaries["national"]["supported_target_rate"] == 1.0 - assert entries[0].full_oracle_capped_mean_abs_relative_error == 0.20 - assert entries[0].full_oracle_mean_abs_relative_error == 0.20 - assert entries[0].candidate_composite_parity_loss == 0.20 - assert entries[0].parity_scorecard["overall"]["candidate_beats_baseline"] is True - assert entries[0].metadata["git_commit"] == "abc123" - assert entries[0].improved_candidate_frontier is True - assert entries[0].improved_delta_frontier 
is True - assert entries[0].improved_composite_frontier is True - assert entries[1].artifact_id == "v2" - assert entries[1].metadata["git_commit"] == "def456" - assert entries[1].improved_candidate_frontier is False - assert entries[1].improved_delta_frontier is False - assert entries[1].improved_composite_frontier is False - assert recorded_first.config_hash is not None - assert recorded_second.config_hash is not None - assert recorded_second.config_hash != recorded_first.config_hash - - raw_lines = registry_path.read_text().splitlines() - assert len(raw_lines) == 2 - assert json.loads(raw_lines[0])["artifact_id"] == "v1" - - -def test_default_frontier_metric_prefers_composite_parity_loss(tmp_path): - from microplex_us.pipelines.registry import select_us_microplex_frontier_entry - - registry_path = tmp_path / "runs.jsonl" - - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-1", - manifest_path=tmp_path / "run-1" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T12:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.18, - baseline_error=0.30, - delta=-0.12, - candidate_composite_loss=0.45, - baseline_composite_loss=0.50, - composite_delta=-0.05, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-2", - manifest_path=tmp_path / "run-2" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T13:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.20, - baseline_error=0.30, - delta=-0.10, - candidate_composite_loss=0.35, - baseline_composite_loss=0.50, - composite_delta=-0.15, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - - assert select_us_microplex_frontier_entry(registry_path).artifact_id == "run-2" - 
assert ( - select_us_microplex_frontier_entry( - registry_path, - metric="candidate_mean_abs_relative_error", - ).artifact_id - == "run-1" - ) - - -def test_full_oracle_capped_frontier_selection(tmp_path): - from microplex_us.pipelines.registry import select_us_microplex_frontier_entry - - registry_path = tmp_path / "runs.jsonl" - - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-1", - manifest_path=tmp_path / "run-1" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T12:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.10, - baseline_error=0.30, - delta=-0.20, - full_oracle_capped_error=0.22, - full_oracle_error=0.25, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-2", - manifest_path=tmp_path / "run-2" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T13:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.10, - baseline_error=0.30, - delta=-0.20, - full_oracle_capped_error=0.12, - full_oracle_error=0.15, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - - assert ( - select_us_microplex_frontier_entry( - registry_path, - metric="full_oracle_capped_mean_abs_relative_error", - ).artifact_id - == "run-2" - ) - - -def test_native_loss_frontier_selection(tmp_path): - from microplex_us.pipelines.registry import select_us_microplex_frontier_entry - - registry_path = tmp_path / "runs.jsonl" - - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-1", - manifest_path=tmp_path / "run-1" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T12:00:00+00:00", - synthesis_backend="bootstrap", - 
calibration_backend="entropy", - candidate_error=0.18, - baseline_error=0.30, - delta=-0.12, - candidate_native_loss=0.2, - baseline_native_loss=0.5, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "run-2", - manifest_path=tmp_path / "run-2" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T13:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.20, - baseline_error=0.30, - delta=-0.10, - candidate_native_loss=0.1, - baseline_native_loss=0.5, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - - assert ( - select_us_microplex_frontier_entry( - registry_path, - metric="candidate_enhanced_cps_native_loss", - ).artifact_id - == "run-2" - ) - assert ( - select_us_microplex_frontier_entry( - registry_path, - metric="enhanced_cps_native_loss_delta", - ).artifact_id - == "run-2" - ) - - entries = load_us_microplex_run_registry(registry_path) - assert entries[0].candidate_beats_baseline_native_loss is True - assert entries[1].candidate_beats_baseline_native_loss is True - - -def test_frontier_selection_ignores_weight_collapsed_runs(tmp_path): - from microplex_us.pipelines.registry import select_us_microplex_frontier_entry - - registry_path = tmp_path / "runs.jsonl" - - collapsed = append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "collapsed-run", - manifest_path=tmp_path / "collapsed-run" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T11:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.01, - baseline_error=0.30, - delta=-0.29, - candidate_composite_loss=0.02, - baseline_composite_loss=0.50, - composite_delta=-0.48, - weight_collapse_suspected=True, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - 
healthy = append_us_microplex_run_registry_entry( - registry_path, - build_us_microplex_run_registry_entry( - artifact_dir=tmp_path / "healthy-run", - manifest_path=tmp_path / "healthy-run" / "manifest.json", - manifest=_manifest( - created_at="2026-03-25T12:00:00+00:00", - synthesis_backend="bootstrap", - calibration_backend="entropy", - candidate_error=0.20, - baseline_error=0.30, - delta=-0.10, - candidate_composite_loss=0.35, - baseline_composite_loss=0.50, - composite_delta=-0.15, - ), - policyengine_harness_payload=_harness_payload(), - ), - ) - - assert collapsed.weight_collapse_suspected is True - assert collapsed.improved_candidate_frontier is None - assert collapsed.improved_composite_frontier is None - assert healthy.improved_candidate_frontier is True - assert select_us_microplex_frontier_entry(registry_path).artifact_id == "healthy-run" diff --git a/tests/pipelines/test_seed_stage_parity.py b/tests/pipelines/test_seed_stage_parity.py deleted file mode 100644 index e1970b1f..00000000 --- a/tests/pipelines/test_seed_stage_parity.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Tests for seed/source-impute parity auditing.""" - -from __future__ import annotations - -import json -import subprocess -import sys -from pathlib import Path - -import h5py -import pandas as pd -import pytest - -from microplex_us.pipelines.seed_stage_parity import ( - SeedStageBooleanLandingFeatureSpec, - SeedStageCategoricalLandingFeatureSpec, - SeedStageFocusVariableSpec, - _normalize_seed_ids_for_policyengine_support, - _seed_tax_unit_support_payload, - build_us_seed_stage_parity_audit, - write_us_seed_stage_parity_audit, -) - - -def _write_period_dataset(path, data: dict[str, list | tuple], *, period: int = 2024) -> None: - with h5py.File(path, "w") as handle: - for variable, values in data.items(): - group = handle.create_group(variable) - group.create_dataset(str(period), data=values) - - -def test_build_us_seed_stage_parity_audit_projects_reference_and_profiles_positive_rows( - 
tmp_path, -) -> None: - seed_path = tmp_path / "seed.parquet" - reference_path = tmp_path / "reference.h5" - - pd.DataFrame( - { - "person_id": ["1", "2", "3"], - "household_id": ["10", "10", "20"], - "tax_unit_id": ["100", "100", "200"], - "hh_weight": [10.0, 10.0, 20.0], - "self_employment_income": [0.0, 50.0, 100.0], - "employment_income": [10.0, 20.0, 30.0], - "has_esi": [False, True, True], - "has_marketplace_health_coverage": [False, True, False], - "age": [5, 42, 67], - "state_fips": [1, 1, 2], - "employment_status": ["not_working", "self_employed", "self_employed"], - } - ).to_parquet(seed_path, index=False) - - _write_period_dataset( - reference_path, - { - "household_id": [10, 20], - "household_weight": [20.0, 40.0], - "person_id": [101, 102, 103], - "person_household_id": [10, 10, 20], - "tax_unit_id": [100, 200], - "person_tax_unit_id": [100, 100, 200], - "self_employment_income_before_lsr": [0.0, 50.0, 100.0], - "employment_income_before_lsr": [10.0, 20.0, 30.0], - "has_esi": [False, True, True], - "has_marketplace_health_coverage": [False, True, False], - "age": [5, 42, 67], - "state_fips": [1, 2], - }, - ) - - audit = build_us_seed_stage_parity_audit( - seed_path, - reference_path, - focus_variables=( - SeedStageFocusVariableSpec( - "self_employment_income", - "self_employment_income", - "self_employment_income_before_lsr", - value_kind="numeric", - ), - "missing_metric", - ), - boolean_landing_features=( - SeedStageBooleanLandingFeatureSpec("has_esi", "has_esi"), - SeedStageBooleanLandingFeatureSpec( - "has_marketplace_health_coverage", - "has_marketplace_health_coverage", - ), - ), - categorical_landing_features=( - SeedStageCategoricalLandingFeatureSpec( - "age_bin", - "age", - "age", - transform="age_bin", - ), - SeedStageCategoricalLandingFeatureSpec( - "state_fips", - "state_fips", - "state_fips", - ), - ), - candidate_only_landing_features=( - SeedStageCategoricalLandingFeatureSpec( - "employment_status", - "employment_status", - ), - ), - ) - 
- assert audit["comparisonStage"] == "seed_source_impute" - assert audit["weightScale"]["reference_to_seed_weight_scale"] == pytest.approx(2.0) - assert audit["seedStructure"]["tax_unit_id_count"] == 2 - assert audit["seedStructure"]["weighted_mean_rows_per_household"] == pytest.approx( - 4.0 / 3.0 - ) - assert audit["referenceStructure"]["weighted_mean_rows_per_household"] == pytest.approx( - 4.0 / 3.0 - ) - - self_employment = audit["focusVariables"]["self_employment_income"] - assert self_employment["comparison"]["type"] == "numeric" - assert self_employment["comparison"]["weighted_sum_ratio"] == pytest.approx(0.5) - assert self_employment["comparison"]["reference_scaled_weighted_sum_ratio"] == pytest.approx( - 1.0 - ) - assert self_employment["positiveSupport"]["seed_positive_weight_share"] == pytest.approx( - 0.75 - ) - - marketplace = self_employment["positiveBooleanProfiles"][ - "has_marketplace_health_coverage" - ] - assert marketplace["seed_positive_share"] == pytest.approx(1.0 / 3.0) - assert marketplace["reference_positive_share"] == pytest.approx(1.0 / 3.0) - assert marketplace["share_delta"] == pytest.approx(0.0) - - age_bin = self_employment["positiveCategoricalProfiles"]["age_bin"] - assert age_bin["seed"]["top_values"][0]["value"] == "65-69" - assert age_bin["seed"]["top_values"][0]["weighted_share"] == pytest.approx(2.0 / 3.0) - - state_fips = self_employment["positiveCategoricalProfiles"]["state_fips"] - assert state_fips["reference_present"] is True - assert state_fips["reference"]["top_values"][0]["value"] == "2" - - employment_status = self_employment["positiveCandidateOnlyProfiles"][ - "employment_status" - ] - assert employment_status["seed"]["top_values"][0]["value"] == "self_employed" - assert employment_status["seed"]["top_values"][0]["weighted_share"] == pytest.approx(1.0) - - missing = audit["focusVariables"]["missing_metric"] - assert missing["seed_present"] is False - assert missing["reference_present"] is False - - -def 
test_write_us_seed_stage_parity_audit_persists_json(tmp_path) -> None: - seed_path = tmp_path / "seed.parquet" - reference_path = tmp_path / "reference.h5" - output_path = tmp_path / "audit.json" - - pd.DataFrame( - { - "person_id": ["1"], - "household_id": ["10"], - "hh_weight": [10.0], - "health_savings_account_ald": [25.0], - } - ).to_parquet(seed_path, index=False) - _write_period_dataset( - reference_path, - { - "household_id": [10], - "household_weight": [10.0], - "person_id": [101], - "person_household_id": [10], - "tax_unit_id": [100], - "person_tax_unit_id": [100], - "health_savings_account_ald": [25.0], - }, - ) - - written = write_us_seed_stage_parity_audit( - seed_path, - reference_path, - output_path, - focus_variables=("health_savings_account_ald",), - boolean_landing_features=(), - categorical_landing_features=(), - candidate_only_landing_features=(), - ) - - payload = json.loads(written.read_text()) - assert written == output_path.resolve() - assert payload["focusVariables"]["health_savings_account_ald"]["comparison"]["type"] == ( - "numeric" - ) - - -def test_build_us_seed_stage_parity_audit_uses_household_weights_not_row_order(tmp_path) -> None: - reference_path = tmp_path / "reference.h5" - seed_a_path = tmp_path / "seed_a.parquet" - seed_b_path = tmp_path / "seed_b.parquet" - - rows = pd.DataFrame( - { - "person_id": ["1", "2", "3"], - "household_id": ["10", "10", "20"], - "weight": [100.0, 1.0, 200.0], - "taxable_interest_income": [1.0, 0.0, 2.0], - } - ) - rows.to_parquet(seed_a_path, index=False) - rows.iloc[[1, 0, 2]].reset_index(drop=True).to_parquet(seed_b_path, index=False) - - _write_period_dataset( - reference_path, - { - "household_id": [10, 20], - "household_weight": [5.0, 10.0], - "person_id": [101, 102, 103], - "person_household_id": [10, 10, 20], - "taxable_interest_income": [1.0, 0.0, 2.0], - }, - ) - - audit_a = build_us_seed_stage_parity_audit( - seed_a_path, - reference_path, - focus_variables=("taxable_interest_income",), - 
boolean_landing_features=(), - categorical_landing_features=(), - candidate_only_landing_features=(), - ) - audit_b = build_us_seed_stage_parity_audit( - seed_b_path, - reference_path, - focus_variables=("taxable_interest_income",), - boolean_landing_features=(), - categorical_landing_features=(), - candidate_only_landing_features=(), - ) - - assert audit_a["seedStructure"]["weighted_mean_rows_per_household"] == pytest.approx( - audit_b["seedStructure"]["weighted_mean_rows_per_household"] - ) - - -def test_build_us_seed_stage_parity_audit_marks_zero_reference_numeric_ratios(tmp_path) -> None: - seed_path = tmp_path / "seed.parquet" - reference_path = tmp_path / "reference.h5" - - pd.DataFrame( - { - "person_id": ["1"], - "household_id": ["10"], - "hh_weight": [10.0], - "taxable_interest_income": [25.0], - } - ).to_parquet(seed_path, index=False) - _write_period_dataset( - reference_path, - { - "household_id": [10], - "household_weight": [10.0], - "person_id": [101], - "person_household_id": [10], - "taxable_interest_income": [0.0], - }, - ) - - audit = build_us_seed_stage_parity_audit( - seed_path, - reference_path, - focus_variables=("taxable_interest_income",), - boolean_landing_features=(), - categorical_landing_features=(), - candidate_only_landing_features=(), - ) - - comparison = audit["focusVariables"]["taxable_interest_income"]["comparison"] - assert comparison["weighted_sum_ratio_defined"] is False - assert comparison["weighted_sum_ratio_case"] == "candidate_nonzero_reference_zero" - assert comparison["reference_scaled_weighted_sum_ratio_defined"] is False - assert comparison["weighted_positive_share_ratio_defined"] is False - - -def test_seed_stage_module_imports_without_duckdb(tmp_path) -> None: - repo_root = Path(__file__).resolve().parents[2] - code = """ -import builtins -import sys - -real_import = builtins.__import__ - -def hooked(name, globals=None, locals=None, fromlist=(), level=0): - if name == "duckdb": - raise ModuleNotFoundError("No module 
named 'duckdb'") - return real_import(name, globals, locals, fromlist, level) - -builtins.__import__ = hooked - -import microplex_us.pipelines.seed_stage_parity as mod -assert callable(mod.build_us_seed_stage_parity_audit) -""" - result = subprocess.run( - [sys.executable, "-c", code], - cwd=repo_root, - capture_output=True, - text=True, - check=False, - ) - assert result.returncode == 0, result.stderr - - -def test_seed_tax_unit_support_payload_sorts_largest_gaps_first() -> None: - payload = _seed_tax_unit_support_payload( - seed_path=Path("/tmp/seed.parquet"), - reference_path=Path("/tmp/reference.h5"), - period=2024, - support_audit={ - "candidate": { - "filing_status_weighted_counts": { - "SINGLE": {"weighted_count": 120.0}, - "JOINT": {"weighted_count": 80.0}, - "SEPARATE": {"weighted_count": 5.0}, - "HEAD_OF_HOUSEHOLD": {"weighted_count": 30.0}, - "SURVIVING_SPOUSE": {"weighted_count": 2.0}, - }, - "mfs_high_agi_support": [ - { - "agi_bin": "75k_to_100k", - "weighted_count": 10.0, - "weighted_agi": 850000.0, - }, - { - "agi_bin": "500k_plus", - "weighted_count": 0.0, - "weighted_agi": 0.0, - }, - ], - }, - "baseline": { - "filing_status_weighted_counts": { - "SINGLE": {"weighted_count": 100.0}, - "JOINT": {"weighted_count": 60.0}, - "SEPARATE": {"weighted_count": 15.0}, - "HEAD_OF_HOUSEHOLD": {"weighted_count": 20.0}, - "SURVIVING_SPOUSE": {"weighted_count": 3.0}, - }, - "mfs_high_agi_support": [ - { - "agi_bin": "75k_to_100k", - "weighted_count": 20.0, - "weighted_agi": 1700000.0, - }, - { - "agi_bin": "500k_plus", - "weighted_count": 4.0, - "weighted_agi": 4000000.0, - }, - ], - }, - "comparisons": { - "filing_status_weighted_delta": [ - {"filing_status": "SINGLE", "weighted_count_delta": 12.0}, - {"filing_status": "JOINT", "weighted_count_delta": 20.0}, - {"filing_status": "SEPARATE", "weighted_count_delta": -10.0}, - {"filing_status": "HEAD_OF_HOUSEHOLD", "weighted_count_delta": 10.0}, - {"filing_status": "SURVIVING_SPOUSE", "weighted_count_delta": -1.0}, 
- ], - "mfs_high_agi_delta": [ - { - "agi_bin": "75k_to_100k", - "weighted_count_delta": -10.0, - "weighted_agi_delta": -850000.0, - }, - { - "agi_bin": "500k_plus", - "weighted_count_delta": -4.0, - "weighted_agi_delta": -4000000.0, - }, - ], - }, - }, - ) - - assert payload["comparisonStage"] == "seed_tax_unit_support" - filing_rows = payload["comparisons"]["filing_status_weighted_delta"] - mfs_rows = payload["comparisons"]["mfs_high_agi_delta"] - - assert abs(filing_rows[0]["weighted_count_delta"]) >= abs( - filing_rows[1]["weighted_count_delta"] - ) - assert mfs_rows[0]["agi_bin"] == "500k_plus" - assert payload["verdictHints"]["largestFilingStatusGap"] == filing_rows[0]["filing_status"] - assert payload["verdictHints"]["largestMFSAgiGap"] == "500k_plus" - - -def test_normalize_seed_ids_for_policyengine_support_factorizes_string_ids() -> None: - normalized = _normalize_seed_ids_for_policyengine_support( - pd.DataFrame( - { - "person_id": ["14:1", "14:2", "22:1"], - "household_id": ["14", "14", "22"], - "tax_unit_id": ["14", "14", "22"], - } - ) - ) - - assert normalized["person_id"].dtype.kind in {"i", "u"} - assert normalized["household_id"].dtype.kind in {"i", "u"} - assert normalized["tax_unit_id"].dtype.kind in {"i", "u"} - assert normalized["household_id"].tolist() == [0, 0, 1] diff --git a/tests/pipelines/test_site_snapshot.py b/tests/pipelines/test_site_snapshot.py deleted file mode 100644 index 021a5ee7..00000000 --- a/tests/pipelines/test_site_snapshot.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Tests for canonical US site snapshot generation.""" - -import json - -import pytest - -from microplex_us.pipelines.data_flow_snapshot import ( - write_us_microplex_data_flow_snapshot, -) -from microplex_us.pipelines.site_snapshot import build_us_microplex_site_snapshot - - -def test_build_us_microplex_site_snapshot_reads_manifest_and_harness(tmp_path): - artifact_dir = tmp_path / "run-1" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - 
json.dumps( - { - "created_at": "2026-03-29T00:00:00+00:00", - "config": {"n_synthetic": 2000}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_harness": "policyengine_harness.json", - }, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "state_program_support_proxies": { - "available": ["ssi"], - "missing": ["snap"], - }, - }, - "calibration": { - "n_loaded_targets": 100, - "n_supported_targets": 90, - "converged": False, - "weight_collapse_suspected": False, - "household_weight_diagnostics": { - "effective_sample_size": 40.0, - "tiny_share": 0.01, - }, - "person_weight_diagnostics": { - "effective_sample_size": 80.0, - "tiny_share": 0.02, - }, - }, - "policyengine_harness": { - "candidate_mean_abs_relative_error": 0.9, - "baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - }, - } - ) - ) - (artifact_dir / "seed_data.parquet").write_text("") - (artifact_dir / "synthetic_data.parquet").write_text("") - (artifact_dir / "calibrated_data.parquet").write_text("") - (artifact_dir / "targets.json").write_text("{}") - (artifact_dir / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.9, - "baseline_mean_abs_relative_error": 1.1, - "mean_abs_relative_error_delta": -0.2, - "candidate_composite_parity_loss": 0.8, - "baseline_composite_parity_loss": 1.2, - "target_win_rate": 0.2, - "slice_win_rate": 0.5, - "supported_target_rate": 0.9, - "tag_summaries": { - "state": { - "candidate_mean_abs_relative_error": 0.7, - "baseline_mean_abs_relative_error": 0.8, - "mean_abs_relative_error_delta": -0.1, - "candidate_composite_parity_loss": 0.6, - "baseline_composite_parity_loss": 0.9, - "target_win_rate": 0.3, - "slice_win_rate": 1.0, - "supported_target_rate": 0.85, - } - }, - "parity_scorecard": {"overall": {"candidate_beats_baseline": 
True}}, - "attribute_cell_summaries": { - "geo=state|feature=snap": {"candidate_target_count": 10} - }, - } - } - ) - ) - write_us_microplex_data_flow_snapshot( - artifact_dir, - artifact_dir / "data_flow_snapshot.json", - ) - - snapshot = build_us_microplex_site_snapshot(artifact_dir) - - assert snapshot["currentRun"]["benchmarkTag"] == "state" - assert snapshot["currentRun"]["candidateMeanAbsRelativeError"] == 0.7 - assert snapshot["currentRun"]["nSynthetic"] == 2000 - assert snapshot["currentRun"]["supportProxies"]["available"] == ["ssi"] - assert snapshot["summary"]["supported_target_rate"] == 0.9 - assert snapshot["dataFlow"]["runtime"]["scaffoldSource"] == "cps_asec_2023" - assert snapshot["sourceArtifact"]["artifactRef"] == "run-1" - assert snapshot["sourceArtifact"]["manifestFile"] == "manifest.json" - assert "artifactDir" not in snapshot["sourceArtifact"] - assert str(tmp_path) not in json.dumps(snapshot) - - -def test_build_us_microplex_site_snapshot_uses_frozen_data_flow_sidecar( - tmp_path, -): - artifact_dir = tmp_path / "run-2" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-03-29T00:00:00+00:00", - "config": {"n_synthetic": 1200}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_harness": "policyengine_harness.json", - }, - "rows": {"seed": 100, "synthetic": 1200, "calibrated": 1200}, - "synthesis": { - "backend": "seed", - "source_names": ["cps_asec_parquet"], - "scaffold_source": "cps_asec_parquet", - "condition_vars": [], - "target_vars": [], - "donor_integrated_variables": [], - "state_program_support_proxies": { - "available": [], - "missing": ["snap"], - }, - }, - "calibration": {"n_loaded_targets": 10, "n_supported_targets": 8}, - "policyengine_harness": { - "candidate_mean_abs_relative_error": 0.8, - "baseline_mean_abs_relative_error": 1.0, 
- "mean_abs_relative_error_delta": -0.2, - }, - } - ) - ) - (artifact_dir / "seed_data.parquet").write_text("") - (artifact_dir / "synthetic_data.parquet").write_text("") - (artifact_dir / "calibrated_data.parquet").write_text("") - (artifact_dir / "targets.json").write_text("{}") - (artifact_dir / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.8, - "baseline_mean_abs_relative_error": 1.0, - "mean_abs_relative_error_delta": -0.2, - "tag_summaries": {}, - "parity_scorecard": {}, - "attribute_cell_summaries": {}, - } - } - ) - ) - (artifact_dir / "data_flow_snapshot.json").write_text( - json.dumps( - { - "schemaVersion": 1, - "generatedAt": "2000-01-01T00:00:00Z", - "coverageMode": "stale", - "runtime": {"scaffoldSource": "stale_source"}, - } - ) - ) - - snapshot = build_us_microplex_site_snapshot(artifact_dir) - - assert snapshot["dataFlow"]["coverageMode"] == "stale" - assert snapshot["dataFlow"]["runtime"]["scaffoldSource"] == "stale_source" - - -def test_build_us_microplex_site_snapshot_requires_saved_data_flow_sidecar( - tmp_path, -): - artifact_dir = tmp_path / "run-3" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "created_at": "2026-03-29T00:00:00+00:00", - "config": {"n_synthetic": 100}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_harness": "policyengine_harness.json", - }, - "synthesis": { - "scaffold_source": "cps_asec_2023", - "state_program_support_proxies": {"available": [], "missing": []}, - }, - "calibration": { - "n_loaded_targets": 1, - "n_supported_targets": 1, - }, - "policyengine_harness": { - "candidate_mean_abs_relative_error": 0.1, - "baseline_mean_abs_relative_error": 0.2, - "mean_abs_relative_error_delta": -0.1, - }, - } - ) - ) - (artifact_dir / "seed_data.parquet").write_text("") - 
(artifact_dir / "synthetic_data.parquet").write_text("") - (artifact_dir / "calibrated_data.parquet").write_text("") - (artifact_dir / "targets.json").write_text("{}") - (artifact_dir / "policyengine_harness.json").write_text( - json.dumps( - { - "summary": { - "candidate_mean_abs_relative_error": 0.1, - "baseline_mean_abs_relative_error": 0.2, - "mean_abs_relative_error_delta": -0.1, - "tag_summaries": {}, - "parity_scorecard": {}, - "attribute_cell_summaries": {}, - } - } - ) - ) - - with pytest.raises(FileNotFoundError, match="data_flow_snapshot.json"): - build_us_microplex_site_snapshot(artifact_dir) diff --git a/tests/pipelines/test_source_stage_parity.py b/tests/pipelines/test_source_stage_parity.py deleted file mode 100644 index 65f5c12c..00000000 --- a/tests/pipelines/test_source_stage_parity.py +++ /dev/null @@ -1,296 +0,0 @@ -"""Tests for raw source-stage parity auditing.""" - -from __future__ import annotations - -import h5py -import pandas as pd -import pytest -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceArchetype, - SourceDescriptor, - TimeStructure, -) - -from microplex_us.pipelines.source_stage_parity import ( - SourceStageParityVariableSpec, - build_us_cps_source_stage_parity_audit, - build_us_puf_source_stage_parity_audit, - build_us_source_stage_parity_audit, - observation_frame_to_policyengine_entity_bundle, -) - - -def _write_period_dataset(path, data: dict[str, list | tuple], *, period: int = 2023) -> None: - with h5py.File(path, "w") as handle: - for variable, values in data.items(): - group = handle.create_group(variable) - group.create_dataset(str(period), data=values) - - -def _write_flat_dataset(path, data: dict[str, list | tuple]) -> None: - with h5py.File(path, "w") as handle: - for variable, values in data.items(): - handle.create_dataset(variable, data=values) - - -def _build_test_frame(*, source_name: str = "test_source") 
-> ObservationFrame: - households = pd.DataFrame( - { - "household_id": ["1", "2"], - "household_weight": [10.0, 20.0], - "state_fips": [1, 2], - "county_fips": [11, 22], - } - ) - persons = pd.DataFrame( - { - "person_id": ["101", "102", "103"], - "household_id": ["1", "1", "2"], - "tax_unit_id": ["11", "11", "12"], - "spm_unit_id": ["21", "21", "22"], - "family_id": ["31", "31", "32"], - "weight": [10.0, 10.0, 20.0], - "age": [30, 10, 50], - "wage_income": [100.0, 0.0, 200.0], - "employment_income": [100.0, 0.0, 200.0], - "is_hispanic": [1, 0, 0], - "filing_status": ["SINGLE", "SINGLE", "JOINT"], - } - ) - descriptor = SourceDescriptor( - name=source_name, - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - archetype=SourceArchetype.HOUSEHOLD_INCOME, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=("age",), - weight_column="weight", - ), - ), - ) - frame = ObservationFrame( - source=descriptor, - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - frame.validate() - return frame - - -def test_observation_frame_to_policyengine_entity_bundle_derives_group_tables() -> None: - frame = _build_test_frame() - bundle = observation_frame_to_policyengine_entity_bundle(frame) - - assert bundle.tax_units is not None - assert bundle.spm_units is not None - assert bundle.families is not None - assert bundle.tax_units["tax_unit_id"].tolist() == ["11", "12"] - assert bundle.tax_units["household_id"].tolist() == ["1", "2"] - assert 
bundle.spm_units["spm_unit_id"].tolist() == ["21", "22"] - assert bundle.families["family_id"].tolist() == ["31", "32"] - - -def test_build_us_source_stage_parity_audit_reports_weighted_alias_and_structure( - tmp_path, -) -> None: - reference_path = tmp_path / "reference.h5" - _write_period_dataset( - reference_path, - { - "household_id": [1, 2], - "household_weight": [10.0, 20.0], - "person_id": [101, 102, 103], - "person_household_id": [1, 1, 2], - "tax_unit_id": [11, 12], - "person_tax_unit_id": [11, 11, 12], - "age": [30, 10, 50], - "state_fips": [1, 2], - "county_fips": [11, 22], - "employment_income": [100.0, 0.0, 200.0], - "is_hispanic": [1, 0, 0], - }, - ) - - bundle = observation_frame_to_policyengine_entity_bundle(_build_test_frame()) - audit = build_us_source_stage_parity_audit( - bundle, - reference_path, - source_id="cps_asec", - period=2023, - focus_variables=( - SourceStageParityVariableSpec( - "employment_income", - "wage_income", - "employment_income", - ), - SourceStageParityVariableSpec("state_fips", "state_fips"), - SourceStageParityVariableSpec("age", "age", value_kind="numeric"), - ), - ) - - assert audit["entityStructure"]["candidate"]["weighted_mean_household_size"] == pytest.approx( - 4.0 / 3.0 - ) - assert audit["householdSizeDistribution"]["candidate"]["shares"]["1"] == pytest.approx( - 2.0 / 3.0 - ) - assert audit["householdSizeDistribution"]["candidate"]["shares"]["2"] == pytest.approx( - 1.0 / 3.0 - ) - employment = audit["focusVariables"]["employment_income"] - assert employment["candidate_variable"] == "wage_income" - assert employment["reference_variable"] == "employment_income" - assert employment["comparison"]["weighted_sum_ratio"] == pytest.approx(1.0) - age = audit["focusVariables"]["age"] - assert age["candidate"]["kind"] == "numeric" - assert age["reference"]["kind"] == "numeric" - assert age["comparison"]["type"] == "numeric" - assert age["comparison"]["weighted_mean_ratio"] == pytest.approx(1.0) - assert 
audit["focusVariables"]["state_fips"]["candidate_entity"] == "household" - assert ( - audit["schema"]["entities"]["person"]["extra_in_candidate_count"] >= 1 - ) - - -def test_build_us_source_stage_parity_audit_reads_flat_reference_h5(tmp_path) -> None: - reference_path = tmp_path / "reference_flat.h5" - _write_flat_dataset( - reference_path, - { - "household_id": [1, 2], - "household_weight": [10.0, 20.0], - "person_id": [101, 102, 103], - "person_household_id": [1, 1, 2], - "tax_unit_id": [11, 12], - "person_tax_unit_id": [11, 11, 12], - "age": [30, 10, 50], - "employment_income": [100.0, 0.0, 200.0], - "weird_metric": [1.0, 2.0, 3.0, 4.0], - }, - ) - - bundle = observation_frame_to_policyengine_entity_bundle(_build_test_frame()) - audit = build_us_source_stage_parity_audit( - bundle, - reference_path, - source_id="cps_asec", - period=2023, - focus_variables=( - SourceStageParityVariableSpec( - "employment_income", - "wage_income", - "employment_income", - ), - ), - ) - - assert audit["entityStructure"]["reference"]["weighted_mean_household_size"] == pytest.approx( - 4.0 / 3.0 - ) - assert ( - audit["focusVariables"]["employment_income"]["comparison"]["weighted_sum_ratio"] - == pytest.approx(1.0) - ) - - -def test_build_us_cps_source_stage_parity_audit_uses_provider_frame( - monkeypatch, - tmp_path, -) -> None: - reference_path = tmp_path / "reference.h5" - _write_period_dataset( - reference_path, - { - "household_id": [1, 2], - "household_weight": [10.0, 20.0], - "person_id": [101, 102, 103], - "person_household_id": [1, 1, 2], - "tax_unit_id": [11, 12], - "person_tax_unit_id": [11, 11, 12], - "age": [30, 10, 50], - }, - ) - - monkeypatch.setattr( - "microplex_us.pipelines.source_stage_parity.CPSASECSourceProvider.load_frame", - lambda self, query=None: _build_test_frame(source_name="mock_cps"), - ) - audit = build_us_cps_source_stage_parity_audit( - reference_path, - year=2023, - download=False, - sample_n=5, - random_seed=7, - 
focus_variables=(SourceStageParityVariableSpec("age", "age"),), - ) - - assert audit["sourceId"] == "cps_asec" - assert audit["candidate"]["metadata"]["candidateSourceName"] == "mock_cps" - assert audit["candidate"]["metadata"]["providerFilters"]["sample_n"] == 5 - - -def test_build_us_puf_source_stage_parity_audit_uses_provider_frame( - monkeypatch, - tmp_path, -) -> None: - reference_path = tmp_path / "reference.h5" - _write_period_dataset( - reference_path, - { - "household_id": [1, 2], - "household_weight": [10.0, 20.0], - "person_id": [101, 102, 103], - "person_household_id": [1, 1, 2], - "tax_unit_id": [11, 12], - "person_tax_unit_id": [11, 11, 12], - "employment_income": [100.0, 0.0, 200.0], - }, - period=2024, - ) - - monkeypatch.setattr( - "microplex_us.pipelines.source_stage_parity.PUFSourceProvider.load_frame", - lambda self, query=None: _build_test_frame(source_name="mock_puf"), - ) - audit = build_us_puf_source_stage_parity_audit( - reference_path, - target_year=2024, - sample_n=8, - random_seed=3, - focus_variables=( - SourceStageParityVariableSpec( - "employment_income", - "employment_income", - ), - ), - ) - - assert audit["sourceId"] == "irs_soi_puf" - assert audit["candidate"]["metadata"]["candidateSourceName"] == "mock_puf" - assert audit["candidate"]["metadata"]["providerFilters"]["sample_n"] == 8 diff --git a/tests/pipelines/test_stage9_replay.py b/tests/pipelines/test_stage9_replay.py deleted file mode 100644 index a82cf365..00000000 --- a/tests/pipelines/test_stage9_replay.py +++ /dev/null @@ -1,119 +0,0 @@ -"""Tests for safe Stage 9 validation replay.""" - -import json - -import pytest - -from microplex_us.pipelines.stage9_replay import ( - main, - replay_us_stage9_validation_benchmarking, -) - - -def _write_stage8_bundle(tmp_path, *, stage8_status: str = "complete"): - artifact_dir = tmp_path / "bundle" - manifest_dir = artifact_dir / "stage_artifacts" / "manifests" - manifest_dir.mkdir(parents=True) - dataset_path = artifact_dir / 
"policyengine_us.h5" - dataset_path.write_bytes(b"h5-placeholder") - stage8_manifest = { - "stageId": "08_dataset_assembly", - "lifecycleStatus": stage8_status, - "outputs": { - "policyengine_dataset": { - "path": "policyengine_us.h5", - "exists": True, - } - }, - } - (manifest_dir / "08_dataset_assembly.json").write_text(json.dumps(stage8_manifest)) - manifest = { - "config": {"policyengine_dataset_year": 2024}, - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - "stage_output_manifests": { - "08_dataset_assembly": ( - "stage_artifacts/manifests/08_dataset_assembly.json" - ) - }, - } - (artifact_dir / "manifest.json").write_text(json.dumps(manifest)) - return artifact_dir - - -def test_stage9_replay_writes_new_evidence_without_mutating_source_bundle(tmp_path): - artifact_dir = _write_stage8_bundle(tmp_path) - original_manifest = (artifact_dir / "manifest.json").read_text() - - result = replay_us_stage9_validation_benchmarking( - artifact_dir, - run_id="unit-replay", - precomputed_policyengine_native_scores={ - "summary": {"enhanced_cps_native_loss_delta": -0.1} - }, - ) - - assert result.output_dir == ( - artifact_dir - / "stage_artifacts" - / "09_validation_benchmarking" - / "replays" - / "unit-replay" - ) - assert result.validation_evidence.exists() - assert result.policyengine_native_scores is not None - assert result.policyengine_native_scores.exists() - assert (artifact_dir / "manifest.json").read_text() == original_manifest - - evidence = json.loads(result.validation_evidence.read_text()) - assert evidence["stageId"] == "09_validation_benchmarking" - assert evidence["evidence"][0]["key"] == "policyengine_native_scores" - - -def test_stage9_replay_rejects_incomplete_stage8(tmp_path): - artifact_dir = _write_stage8_bundle(tmp_path, stage8_status="running") - - with pytest.raises(ValueError, match="Stage 8 must be complete"): - replay_us_stage9_validation_benchmarking( - artifact_dir, - precomputed_policyengine_native_scores={"summary": {"loss": 
1.0}}, - ) - - -def test_stage9_replay_rejects_stage8_dataset_path_mismatch(tmp_path): - artifact_dir = _write_stage8_bundle(tmp_path) - stage8_manifest_path = ( - artifact_dir / "stage_artifacts" / "manifests" / "08_dataset_assembly.json" - ) - stage8_manifest = json.loads(stage8_manifest_path.read_text()) - stage8_manifest["outputs"]["policyengine_dataset"]["path"] = ( - "other/policyengine_us.h5" - ) - stage8_manifest_path.write_text(json.dumps(stage8_manifest)) - - with pytest.raises(ValueError, match="does not match"): - replay_us_stage9_validation_benchmarking( - artifact_dir, - precomputed_policyengine_native_scores={"summary": {"loss": 1.0}}, - ) - - -def test_stage9_replay_cli_smoke(tmp_path, capsys): - artifact_dir = _write_stage8_bundle(tmp_path) - payload_path = tmp_path / "native_scores.json" - payload_path.write_text(json.dumps({"summary": {"loss": 1.0}})) - - assert ( - main( - [ - str(artifact_dir), - "--run-id", - "cli-replay", - "--precomputed-policyengine-native-scores", - str(payload_path), - ] - ) - == 0 - ) - - output = capsys.readouterr().out.strip() - assert output.endswith("evidence_manifest.json") diff --git a/tests/pipelines/test_stage_artifacts.py b/tests/pipelines/test_stage_artifacts.py deleted file mode 100644 index fba96ed9..00000000 --- a/tests/pipelines/test_stage_artifacts.py +++ /dev/null @@ -1,390 +0,0 @@ -"""Tests for US stage artifact inventory helpers.""" - -import json - -import pandas as pd -import pytest - -from microplex_us.pipelines.stage_artifacts import ( - build_us_stage_artifact_inventory, - load_us_calibrated_stage_artifacts, - load_us_candidate_calibration_replay_artifacts, - load_us_candidate_stage_artifacts, - load_us_dataset_assembly_artifacts, - load_us_policyengine_entity_stage_artifacts, - load_us_seed_scaffold_stage_artifacts, - load_us_stage_artifact_inventory, - load_us_stage_json_artifact, - resolve_us_stage_artifact_from_inventory, - resolve_us_stage_artifact_path_checked, - 
write_us_stage_artifact_inventory, -) -from microplex_us.pipelines.stage_manifest import ( - write_us_policyengine_entity_stage_artifact, -) -from microplex_us.policyengine import PolicyEngineUSEntityTableBundle - - -def test_build_us_stage_artifact_inventory_hashes_files_and_directories(tmp_path): - (tmp_path / "seed_data.parquet").write_text("seed") - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - source_plan = ( - tmp_path / "stage_artifacts" / "03_source_planning" / "source_plan.json" - ) - source_plan.parent.mkdir(parents=True) - source_plan.write_text("{}") - entity_dir = tmp_path / "stage_artifacts" / "06_policyengine_entities" - entity_dir.mkdir(parents=True) - (entity_dir / "metadata.json").write_text("{}") - (entity_dir / "households.parquet").write_text("households") - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"seed": 1, "synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "source_plan": "stage_artifacts/03_source_planning/source_plan.json", - "pre_calibration_policyengine_entity_tables": ( - "stage_artifacts/06_policyengine_entities/metadata.json" - ), - }, - } - - inventory = build_us_stage_artifact_inventory( - tmp_path, - manifest_payload=manifest, - max_hash_bytes=None, - ) - - records = { - (record["stageId"], record["key"]): record for record in inventory["artifacts"] - } - assert ( - records[("05_donor_integration_synthesis", "synthetic_data")]["classification"] - == "manual_replay" - ) - assert ( - records[("05_donor_integration_synthesis", "synthetic_data")]["hashStatus"] - == "hashed" - ) - assert records[("05_donor_integration_synthesis", "synthetic_data")]["contentHash"] - assert records[("03_source_planning", "source_plan")]["classification"] == ( - "diagnostic_only" - ) - entity_record = records[ - ("06_policyengine_entities", 
"pre_calibration_policyengine_entity_tables") - ] - assert entity_record["classification"] == "manual_resume" - assert entity_record["fileCount"] == 2 - assert entity_record["hashStatus"] == "hashed" - - -def test_build_us_stage_artifact_inventory_classifies_missing_and_contract_only( - tmp_path, -): - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"seed": 1, "synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - }, - } - - inventory = build_us_stage_artifact_inventory(tmp_path, manifest_payload=manifest) - - records = { - (record["stageId"], record["key"]): record for record in inventory["artifacts"] - } - assert ( - records[("05_donor_integration_synthesis", "synthetic_data")]["classification"] - == "missing_required" - ) - assert ( - records[("05_donor_integration_synthesis", "synthesizer")]["classification"] - == "contract_only" - ) - - -def test_build_us_stage_artifact_inventory_skips_large_file_hashes(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": "synthetic_data.parquet"}, - } - - inventory = build_us_stage_artifact_inventory( - tmp_path, - manifest_payload=manifest, - max_hash_bytes=3, - ) - - record = next( - record for record in inventory["artifacts"] if record["key"] == "synthetic_data" - ) - assert record["hashStatus"] == "too_large" - assert record["contentHash"] is None - - -def test_write_load_and_resolve_us_stage_artifact_inventory(tmp_path): - (tmp_path / "policyengine_us.h5").write_text("dataset") - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - 
"calibration": {}, - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - } - - path = write_us_stage_artifact_inventory( - tmp_path, - tmp_path / "stage_artifacts" / "artifact_inventory.json", - manifest_payload=manifest, - ) - loaded = load_us_stage_artifact_inventory(path) - dataset_path = resolve_us_stage_artifact_from_inventory( - tmp_path, - loaded, - "08_dataset_assembly", - "policyengine_dataset", - ) - - assert loaded["schemaVersion"] == 1 - assert dataset_path == tmp_path / "policyengine_us.h5" - - -def test_load_us_stage_artifact_inventory_rejects_unknown_schema(tmp_path): - path = tmp_path / "artifact_inventory.json" - path.write_text(json.dumps({"schemaVersion": 99})) - - with pytest.raises(RuntimeError, match="Unsupported US stage artifact inventory"): - load_us_stage_artifact_inventory(path) - - -def test_load_us_candidate_stage_artifacts_reads_stage5_boundary(tmp_path): - pytest.importorskip("pyarrow") - seed = pd.DataFrame({"person_id": [1], "income": [20]}) - synthetic = pd.DataFrame({"person_id": [1, 2], "income": [20, 30]}) - seed.to_parquet(tmp_path / "seed_data.parquet", index=False) - synthetic.to_parquet(tmp_path / "synthetic_data.parquet", index=False) - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"seed": 1, "synthetic": 2}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - }, - } - - loaded = load_us_candidate_stage_artifacts(tmp_path, manifest_payload=manifest) - - pd.testing.assert_frame_equal(loaded.seed_data, seed) - pd.testing.assert_frame_equal(loaded.synthetic_data, synthetic) - assert ( - loaded.artifact_paths["synthetic_data"] == tmp_path / "synthetic_data.parquet" - ) - - -def test_load_us_seed_scaffold_stage_artifacts_reads_stage4_boundary(tmp_path): - pytest.importorskip("pyarrow") - scaffold = pd.DataFrame({"person_id": [1], "income": [10]}) - 
scaffold_path = ( - tmp_path / "stage_artifacts" / "04_seed_scaffold" / "scaffold_seed_data.parquet" - ) - scaffold_path.parent.mkdir(parents=True) - scaffold.to_parquet(scaffold_path, index=False) - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - }, - } - - loaded = load_us_seed_scaffold_stage_artifacts(tmp_path, manifest_payload=manifest) - - pd.testing.assert_frame_equal(loaded.scaffold_seed_data, scaffold) - assert loaded.artifact_paths["scaffold_seed_data"] == scaffold_path - - -def test_load_us_candidate_calibration_replay_artifacts_combines_boundaries( - tmp_path, -): - pytest.importorskip("pyarrow") - scaffold = pd.DataFrame({"person_id": [1], "income": [10]}) - seed = pd.DataFrame({"person_id": [1], "income": [20]}) - synthetic = pd.DataFrame({"person_id": [1, 2], "income": [20, 30]}) - scaffold_path = ( - tmp_path / "stage_artifacts" / "04_seed_scaffold" / "scaffold_seed_data.parquet" - ) - scaffold_path.parent.mkdir(parents=True) - scaffold.to_parquet(scaffold_path, index=False) - seed.to_parquet(tmp_path / "seed_data.parquet", index=False) - synthetic.to_parquet(tmp_path / "synthetic_data.parquet", index=False) - (tmp_path / "targets.json").write_text( - json.dumps({"marginal": {"age": {"20": 1.0}}, "continuous": {"income": 1.0}}) - ) - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"seed": 1, "synthetic": 2}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "targets": "targets.json", - }, - } - - loaded = load_us_candidate_calibration_replay_artifacts( - tmp_path, - 
manifest_payload=manifest, - ) - - pd.testing.assert_frame_equal(loaded.candidate.synthetic_data, synthetic) - assert loaded.seed_scaffold is not None - pd.testing.assert_frame_equal(loaded.seed_scaffold.scaffold_seed_data, scaffold) - assert loaded.targets.continuous == {"income": 1.0} - assert loaded.artifact_paths["targets"] == tmp_path / "targets.json" - - -def test_load_us_policyengine_entity_stage_artifacts_reads_checkpoint(tmp_path): - pytest.importorskip("pyarrow") - bundle = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), - persons=pd.DataFrame({"person_id": [10], "household_id": [1]}), - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - write_us_policyengine_entity_stage_artifact(bundle, tmp_path) - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "pre_calibration_policyengine_entity_tables": ( - "stage_artifacts/06_policyengine_entities/metadata.json" - ), - }, - } - - loaded = load_us_policyengine_entity_stage_artifacts( - tmp_path, - manifest_payload=manifest, - ) - - assert loaded.metadata["stageId"] == "06_policyengine_entities" - pd.testing.assert_frame_equal(loaded.bundle.households, bundle.households) - - -def test_load_us_calibrated_stage_artifacts_reads_stage7_outputs(tmp_path): - pytest.importorskip("pyarrow") - calibrated = pd.DataFrame({"person_id": [1], "weight": [2.0]}) - calibrated.to_parquet(tmp_path / "calibrated_data.parquet", index=False) - (tmp_path / "targets.json").write_text( - json.dumps({"marginal": {}, "continuous": {"income": 1.0}}) - ) - summary_path = tmp_path / "stage_artifacts" / "07_calibration" - summary_path.mkdir(parents=True) - (summary_path / "calibration_summary.json").write_text( - json.dumps({"backend": "none", "converged": True}) - ) - manifest = { - "config": {"calibration_backend": "none"}, - "rows": 
{"calibrated": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {"backend": "none"}, - "artifacts": { - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "calibration_summary": ( - "stage_artifacts/07_calibration/calibration_summary.json" - ), - }, - } - - loaded = load_us_calibrated_stage_artifacts(tmp_path, manifest_payload=manifest) - - pd.testing.assert_frame_equal(loaded.calibrated_data, calibrated) - assert loaded.targets.continuous == {"income": 1.0} - assert loaded.calibration_summary["converged"] is True - - -def test_load_us_dataset_assembly_artifacts_resolves_stage8_paths(tmp_path): - (tmp_path / "manifest.json").write_text("{}") - (tmp_path / "stage_manifest.json").write_text("{}") - (tmp_path / "data_flow_snapshot.json").write_text("{}") - (tmp_path / "policyengine_us.h5").write_text("dataset") - stage_artifacts = tmp_path / "stage_artifacts" - stage_artifacts.mkdir() - (stage_artifacts / "artifact_inventory.json").write_text("{}") - (stage_artifacts / "conditional_readiness.json").write_text("{}") - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "stage_manifest": "stage_manifest.json", - "data_flow_snapshot": "data_flow_snapshot.json", - "artifact_inventory": "stage_artifacts/artifact_inventory.json", - "conditional_readiness": "stage_artifacts/conditional_readiness.json", - }, - } - - loaded = load_us_dataset_assembly_artifacts(tmp_path, manifest_payload=manifest) - - assert loaded.policyengine_dataset == tmp_path / "policyengine_us.h5" - assert loaded.stage_manifest == tmp_path / "stage_manifest.json" - assert loaded.data_flow_snapshot == tmp_path / "data_flow_snapshot.json" - assert loaded.artifact_inventory == stage_artifacts / "artifact_inventory.json" - assert ( - loaded.conditional_readiness == 
stage_artifacts / "conditional_readiness.json" - ) - - -def test_stage_artifact_checked_resolver_enforces_format_and_existence(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - manifest = { - "config": {"calibration_backend": "none"}, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": "synthetic_data.parquet"}, - } - - with pytest.raises(ValueError, match="expected 'json'"): - resolve_us_stage_artifact_path_checked( - tmp_path, - "05_donor_integration_synthesis", - "synthetic_data", - manifest_payload=manifest, - expected_format="json", - ) - - with pytest.raises(FileNotFoundError, match="Stage artifact not found"): - load_us_stage_json_artifact( - tmp_path, - "03_source_planning", - "source_plan", - manifest_payload={ - **manifest, - "artifacts": {"source_plan": "missing.json"}, - }, - ) diff --git a/tests/pipelines/test_stage_contracts.py b/tests/pipelines/test_stage_contracts.py deleted file mode 100644 index a530c520..00000000 --- a/tests/pipelines/test_stage_contracts.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Tests for canonical US pipeline stage contracts.""" - -import pytest - -from microplex_us.pipelines.stage_contracts import ( - canonicalize_us_pipeline_stage_id, - config_keys_for_us_pipeline_stage, - default_us_pipeline_stage_contracts, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, - resolve_us_stage_artifact_contract_path, - serialize_us_pipeline_stage_contracts, -) - - -def test_default_us_pipeline_stage_contracts_are_stable_and_complete(): - contracts = default_us_pipeline_stage_contracts() - - assert [contract.step for contract in contracts] == [ - f"{index:02d}" for index in range(1, 10) - ] - assert [contract.id for contract in contracts] == [ - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - "06_policyengine_entities", - 
"07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", - ] - assert len({contract.id for contract in contracts}) == 9 - for contract in contracts: - assert contract.title - assert contract.purpose - assert contract.consumes - assert contract.produces - assert contract.inputs - assert contract.outputs - assert contract.diagnostics - assert contract.validations - assert contract.resume_mode - for artifact in contract.artifacts: - assert artifact.format - assert artifact.hash_mode - if artifact.resume_role is not None: - assert artifact.resume_role in { - "diagnostic", - "manual_replay", - "manual_resume", - "post_artifact_evidence", - } - - -def test_get_us_pipeline_stage_contract_returns_one_stage(): - contract = get_us_pipeline_stage_contract("08_dataset_assembly") - - assert contract.step == "08" - assert contract.title == "Dataset assembly and publication" - - -def test_get_us_pipeline_stage_contract_rejects_unknown_stage(): - with pytest.raises(KeyError, match="Unknown US pipeline stage"): - get_us_pipeline_stage_contract("bogus") - - -def test_serialize_us_pipeline_stage_contracts_is_json_ready(): - payload = serialize_us_pipeline_stage_contracts() - - assert payload["schemaVersion"] == 1 - assert payload["contractVersion"] == "us-runtime-stages-v2" - assert len(payload["stages"]) == 9 - assert payload["stages"][5]["id"] == "06_policyengine_entities" - assert payload["stages"][5]["inputs"][0]["artifact_key"] == "synthetic_data" - assert payload["stages"][7]["artifacts"][-1]["key"] == "conditional_readiness" - assert payload["stages"][7]["artifacts"][-1]["format"] == "json" - - -def test_canonicalize_us_pipeline_stage_id_maps_legacy_runtime_ids(): - assert ( - canonicalize_us_pipeline_stage_id("policyengine_materialization") - == "06_policyengine_entities" - ) - assert canonicalize_us_pipeline_stage_id("target_build") == "07_calibration" - assert canonicalize_us_pipeline_stage_id("finalization") == "08_dataset_assembly" - assert 
canonicalize_us_pipeline_stage_id("benchmark") == "09_validation_benchmarking" - assert canonicalize_us_pipeline_stage_id("08_dataset_assembly") == "08_dataset_assembly" - assert canonicalize_us_pipeline_stage_id("custom-stage") == "custom-stage" - - -def test_stage_contracts_expose_config_scope_and_canonical_paths(tmp_path): - assert "n_synthetic" in config_keys_for_us_pipeline_stage( - "05_donor_integration_synthesis" - ) - assert resolve_us_stage_artifact_contract_path( - tmp_path, - "08_dataset_assembly", - "artifact_inventory", - ) == (tmp_path / "stage_artifacts" / "artifact_inventory.json") - - -def test_required_stage_inputs_reference_prior_outputs_and_artifacts(): - contracts = default_us_pipeline_stage_contracts() - contracts_by_id = {contract.id: contract for contract in contracts} - - for contract in contracts: - for resource in contract.inputs: - if not resource.required: - continue - if resource.kind == "stage_output": - assert resource.stage_id is not None - upstream = contracts_by_id[resource.stage_id] - assert any( - output.key == resource.key - and output.kind == "stage_output" - and output.stage_id == resource.stage_id - for output in upstream.outputs - ) - if resource.kind == "artifact": - assert resource.stage_id is not None - artifact = get_us_stage_artifact_contract( - resource.stage_id, - resource.artifact_key or resource.key, - ) - assert artifact.required - - -def test_source_planning_seam_exposes_descriptors_for_stage3(): - stage2 = get_us_pipeline_stage_contract("02_source_loading") - stage3 = get_us_pipeline_stage_contract("03_source_planning") - - stage2_outputs = {resource.key for resource in stage2.outputs} - stage3_inputs = {resource.key for resource in stage3.inputs} - - assert "source_descriptors" in stage2_outputs - assert "source_descriptors" in stage3_inputs - - -def test_stage_config_scopes_use_real_build_config_keys(): - stage5_keys = set(config_keys_for_us_pipeline_stage("05_donor_integration_synthesis")) - stage7_keys = 
set(config_keys_for_us_pipeline_stage("07_calibration")) - - assert { - "n_synthetic", - "random_seed", - "synthesis_backend", - "donor_imputer_backend", - "donor_imputer_condition_selection", - } <= stage5_keys - assert "calibration_epochs" not in stage7_keys - assert "calibration_l0_lambda" not in stage7_keys - assert { - "calibration_backend", - "calibration_tol", - "calibration_max_iter", - "target_sparsity", - "policyengine_targets_db", - } <= stage7_keys diff --git a/tests/pipelines/test_stage_manifest.py b/tests/pipelines/test_stage_manifest.py deleted file mode 100644 index a8beea37..00000000 --- a/tests/pipelines/test_stage_manifest.py +++ /dev/null @@ -1,359 +0,0 @@ -"""Tests for US stage manifests and reusable stage artifacts.""" - -import json - -import pandas as pd -import pytest - -from microplex_us.pipelines.stage_manifest import ( - build_us_stage_manifest, - load_us_policyengine_entity_stage_artifact, - load_us_stage_manifest, - resolve_us_stage_artifact_path, - stage_summary_for_data_flow_snapshot, - write_us_policyengine_entity_stage_artifact, - write_us_stage_manifest, -) -from microplex_us.policyengine import PolicyEngineUSEntityTableBundle - - -def test_build_us_stage_manifest_reports_nine_stage_statuses(tmp_path): - (tmp_path / "manifest.json").write_text("{}") - scaffold_seed_path = ( - tmp_path / "stage_artifacts" / "04_seed_scaffold" / "scaffold_seed_data.parquet" - ) - scaffold_seed_path.parent.mkdir(parents=True) - scaffold_seed_path.write_text("scaffold") - (tmp_path / "seed_data.parquet").write_text("seed") - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - (tmp_path / "calibrated_data.parquet").write_text("calibrated") - (tmp_path / "targets.json").write_text("{}") - (tmp_path / "policyengine_us.h5").write_text("dataset") - source_plan_path = tmp_path / "stage_artifacts" / "03_source_planning" - source_plan_path.mkdir(parents=True) - (source_plan_path / "source_plan.json").write_text("{}") - entity_path = tmp_path / 
"stage_artifacts" / "06_policyengine_entities" - entity_path.mkdir(parents=True) - (entity_path / "metadata.json").write_text("{}") - calibration_path = tmp_path / "stage_artifacts" / "07_calibration" - calibration_path.mkdir(parents=True) - (calibration_path / "calibration_summary.json").write_text("{}") - final_entity_path = calibration_path / "policyengine_entity_tables" - final_entity_path.mkdir(parents=True) - (final_entity_path / "metadata.json").write_text("{}") - (tmp_path / "stage_manifest.json").write_text("{}") - (tmp_path / "data_flow_snapshot.json").write_text("{}") - (tmp_path / "stage_artifacts" / "artifact_inventory.json").write_text("{}") - (tmp_path / "stage_artifacts" / "conditional_readiness.json").write_text("{}") - manifest = { - "created_at": "2026-05-28T00:00:00+00:00", - "config": {"calibration_backend": "entropy"}, - "rows": {"seed": 1, "synthetic": 1, "calibrated": 1}, - "synthesis": { - "source_names": ["cps_asec_2023"], - "scaffold_source": "cps_asec_2023", - "backend": "seed", - "donor_integrated_variables": [], - }, - "calibration": {"backend": "policyengine_db_entropy"}, - "artifacts": { - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "source_plan": "stage_artifacts/03_source_planning/source_plan.json", - "pre_calibration_policyengine_entity_tables": ( - "stage_artifacts/06_policyengine_entities/metadata.json" - ), - "calibration_summary": ( - "stage_artifacts/07_calibration/calibration_summary.json" - ), - "policyengine_entity_tables": ( - "stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json" - ), - "policyengine_dataset": "policyengine_us.h5", - "stage_manifest": "stage_manifest.json", - "data_flow_snapshot": "data_flow_snapshot.json", - "artifact_inventory": "stage_artifacts/artifact_inventory.json", - 
"conditional_readiness": "stage_artifacts/conditional_readiness.json", - }, - } - - payload = build_us_stage_manifest(tmp_path, manifest_payload=manifest) - - assert payload["schemaVersion"] == 3 - assert payload["generatedAt"] == "2026-05-28T00:00:00+00:00" - assert [stage["id"] for stage in payload["stages"]] == [ - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - "06_policyengine_entities", - "07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", - ] - statuses = {stage["id"]: stage["status"] for stage in payload["stages"]} - assert statuses["01_run_profile"] == "ready" - assert statuses["02_source_loading"] == "metadata_only" - assert statuses["03_source_planning"] == "ready" - assert statuses["04_seed_scaffold"] == "ready" - assert statuses["05_donor_integration_synthesis"] == "ready" - assert statuses["06_policyengine_entities"] == "ready" - assert statuses["07_calibration"] == "ready" - assert statuses["08_dataset_assembly"] == "ready" - assert statuses["09_validation_benchmarking"] == "deferred" - stage5_artifacts = { - artifact["key"]: artifact - for stage in payload["stages"] - if stage["id"] == "05_donor_integration_synthesis" - for artifact in stage["artifacts"] - } - assert stage5_artifacts["synthetic_data"]["format"] == "parquet_dataframe" - assert stage5_artifacts["synthetic_data"]["hash_mode"] == "file_sha256" - - -def test_load_us_stage_manifest_accepts_v1_v2_and_v3(tmp_path): - v1_path = tmp_path / "stage_manifest_v1.json" - v1_path.write_text( - json.dumps( - { - "schemaVersion": 1, - "contractVersion": "us-runtime-stages-v1", - "generatedAt": None, - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": "manifest.json", - "stages": [], - } - ) - ) - v2_path = tmp_path / "stage_manifest_v2.json" - v2_path.write_text( - json.dumps( - { - "schemaVersion": 2, - "contractVersion": "us-runtime-stages-v2", - "generatedAt": None, - "pipeline": 
"us_microplex", - "artifactRoot": ".", - "manifest": "manifest.json", - "stages": [], - } - ) - ) - v3_path = tmp_path / "stage_manifest_v3.json" - v3_path.write_text( - json.dumps( - { - "schemaVersion": 3, - "contractVersion": "us-runtime-stages-v2", - "generatedAt": None, - "pipeline": "us_microplex", - "artifactRoot": ".", - "manifest": "manifest.json", - "stages": [], - } - ) - ) - - assert load_us_stage_manifest(v1_path)["schemaVersion"] == 1 - assert load_us_stage_manifest(v2_path)["schemaVersion"] == 2 - assert load_us_stage_manifest(v3_path)["schemaVersion"] == 3 - - -def test_build_us_stage_manifest_keeps_empty_validation_index_deferred(tmp_path): - (tmp_path / "policyengine_us.h5").write_text("dataset") - (tmp_path / "stage_manifest.json").write_text("{}") - (tmp_path / "data_flow_snapshot.json").write_text("{}") - (tmp_path / "stage_artifacts" / "artifact_inventory.json").parent.mkdir( - parents=True, - exist_ok=True, - ) - (tmp_path / "stage_artifacts" / "artifact_inventory.json").write_text("{}") - (tmp_path / "stage_artifacts" / "conditional_readiness.json").write_text("{}") - evidence_path = ( - tmp_path - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ) - evidence_path.parent.mkdir(parents=True) - evidence_path.write_text( - json.dumps( - { - "formatVersion": 1, - "stageId": "09_validation_benchmarking", - "evidence": [], - "summaries": {}, - } - ) - ) - manifest = { - "config": {"calibration_backend": "entropy"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "stage_manifest": "stage_manifest.json", - "data_flow_snapshot": "data_flow_snapshot.json", - "artifact_inventory": "stage_artifacts/artifact_inventory.json", - "conditional_readiness": "stage_artifacts/conditional_readiness.json", - "validation_evidence": ( - "stage_artifacts/09_validation_benchmarking/evidence_manifest.json" - ), - }, - } - 
- payload = build_us_stage_manifest(tmp_path, manifest_payload=manifest) - - statuses = {stage["id"]: stage["status"] for stage in payload["stages"]} - assert statuses["09_validation_benchmarking"] == "deferred" - - -def test_build_us_stage_manifest_requires_validation_evidence_for_stage9_ready( - tmp_path, -): - (tmp_path / "policyengine_us.h5").write_text("dataset") - (tmp_path / "policyengine_native_scores.json").write_text("{}") - manifest = { - "config": {"calibration_backend": "entropy"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": { - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - }, - } - - payload = build_us_stage_manifest(tmp_path, manifest_payload=manifest) - - statuses = {stage["id"]: stage["status"] for stage in payload["stages"]} - assert statuses["09_validation_benchmarking"] == "incomplete" - - -def test_stage_summary_omits_unreferenced_path_hints(tmp_path): - manifest = { - "config": {"calibration_backend": "entropy"}, - "rows": {"seed": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {}, - } - - payload = build_us_stage_manifest(tmp_path, manifest_payload=manifest) - summaries = stage_summary_for_data_flow_snapshot(payload) - - outputs = {stage["id"]: stage["outputs"] for stage in summaries} - assert outputs["04_seed_scaffold"] == [] - assert outputs["05_donor_integration_synthesis"] == [] - - -def test_build_us_stage_manifest_reports_incomplete_referenced_artifacts(tmp_path): - manifest = { - "created_at": "2026-05-28T00:00:00+00:00", - "config": {"calibration_backend": "entropy"}, - "rows": {"seed": 1, "synthetic": 1}, - "synthesis": { - "source_names": ["cps_asec_2023"], - "scaffold_source": "cps_asec_2023", - "backend": "seed", - }, - "artifacts": { - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - 
"seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "policyengine_harness": "policyengine_harness.json", - }, - } - - payload = build_us_stage_manifest(tmp_path, manifest_payload=manifest) - - statuses = {stage["id"]: stage["status"] for stage in payload["stages"]} - assert statuses["04_seed_scaffold"] == "incomplete" - assert statuses["05_donor_integration_synthesis"] == "incomplete" - assert statuses["09_validation_benchmarking"] == "incomplete" - - -def test_write_us_stage_manifest_and_resolve_artifact_path(tmp_path): - (tmp_path / "policyengine_us.h5").write_text("dataset") - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - } - - manifest_path = write_us_stage_manifest( - tmp_path, - tmp_path / "stage_manifest.json", - manifest_payload=manifest, - ) - loaded = json.loads(manifest_path.read_text()) - dataset_path = resolve_us_stage_artifact_path( - tmp_path, - loaded, - "08_dataset_assembly", - "policyengine_dataset", - ) - - assert dataset_path == tmp_path / "policyengine_us.h5" - assert ( - stage_summary_for_data_flow_snapshot(loaded)[7]["id"] == "08_dataset_assembly" - ) - - -def test_policyengine_entity_stage_artifact_round_trips_partial_bundle(tmp_path): - pytest.importorskip("pyarrow") - - bundle = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - {"household_id": [1, 2], "household_weight": [1.0, 2.0]} - ), - persons=pd.DataFrame( - {"person_id": [10, 20], "household_id": [1, 2], "age": [40, 50]} - ), - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - - manifest_path = write_us_policyengine_entity_stage_artifact(bundle, tmp_path) - loaded, metadata = load_us_policyengine_entity_stage_artifact(manifest_path) - - assert manifest_path == ( - tmp_path / "stage_artifacts" / "06_policyengine_entities" / "metadata.json" - ) - 
assert metadata["stageId"] == "06_policyengine_entities" - assert metadata["stage"] == "post_microsim" - pd.testing.assert_frame_equal(loaded.households, bundle.households) - pd.testing.assert_frame_equal(loaded.persons, bundle.persons) - assert loaded.tax_units is None - - -def test_policyengine_entity_stage_artifact_does_not_replace_run_root(tmp_path): - pytest.importorskip("pyarrow") - - (tmp_path / "manifest.json").write_text("{}") - bundle = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), - persons=pd.DataFrame({"person_id": [10], "household_id": [1]}), - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - - write_us_policyengine_entity_stage_artifact(bundle, tmp_path) - - assert (tmp_path / "manifest.json").exists() - assert ( - tmp_path / "stage_artifacts" / "06_policyengine_entities" / "metadata.json" - ).exists() diff --git a/tests/pipelines/test_stage_readiness.py b/tests/pipelines/test_stage_readiness.py deleted file mode 100644 index 2a72fa45..00000000 --- a/tests/pipelines/test_stage_readiness.py +++ /dev/null @@ -1,222 +0,0 @@ -"""Tests for US conditional-readiness reports.""" - -import json - -import pytest - -from microplex_us.pipelines.stage_artifacts import build_us_stage_artifact_inventory -from microplex_us.pipelines.stage_readiness import ( - build_us_conditional_readiness_report, - build_us_stage_reuse_key, - load_us_conditional_readiness_report, - write_us_conditional_readiness_report, -) - - -def test_build_us_stage_reuse_key_ignores_checkpoint_output_paths(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - base_manifest = { - "config": { - "n_synthetic": 10, - "calibration_backend": "none", - "pipeline_checkpoint_save_post_microsim_path": "/tmp/a", - }, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": 
"synthetic_data.parquet"}, - } - changed_output_path_manifest = { - **base_manifest, - "config": { - **base_manifest["config"], - "pipeline_checkpoint_save_post_microsim_path": "/tmp/b", - }, - } - - inventory = build_us_stage_artifact_inventory( - tmp_path, - manifest_payload=base_manifest, - max_hash_bytes=None, - ) - - assert build_us_stage_reuse_key( - "05_donor_integration_synthesis", - base_manifest, - inventory, - ) == build_us_stage_reuse_key( - "05_donor_integration_synthesis", - changed_output_path_manifest, - inventory, - ) - - -def test_build_us_stage_reuse_key_uses_stage_scoped_config(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - base_manifest = { - "config": { - "n_synthetic": 10, - "synthesis_backend": "bootstrap", - "policyengine_dataset_year": 2024, - }, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": "synthetic_data.parquet"}, - } - changed_stage8_config = { - **base_manifest, - "config": { - **base_manifest["config"], - "policyengine_dataset_year": 2025, - }, - } - changed_stage5_config = { - **base_manifest, - "config": { - **base_manifest["config"], - "n_synthetic": 20, - }, - } - inventory = build_us_stage_artifact_inventory( - tmp_path, - manifest_payload=base_manifest, - max_hash_bytes=None, - ) - - base_key = build_us_stage_reuse_key( - "05_donor_integration_synthesis", - base_manifest, - inventory, - ) - assert base_key == build_us_stage_reuse_key( - "05_donor_integration_synthesis", - changed_stage8_config, - inventory, - ) - assert base_key != build_us_stage_reuse_key( - "05_donor_integration_synthesis", - changed_stage5_config, - inventory, - ) - - -def test_conditional_readiness_reports_config_mismatch_as_rerun(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - manifest = { - "config": {"n_synthetic": 10, "calibration_backend": "none"}, - "rows": {"synthetic": 1}, - 
"synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": "synthetic_data.parquet"}, - } - - report = build_us_conditional_readiness_report( - tmp_path, - manifest_payload=manifest, - requested_config={"n_synthetic": 20, "calibration_backend": "none"}, - ) - - stages = {stage["stageId"]: stage for stage in report["stages"]} - assert stages["05_donor_integration_synthesis"]["compatibility"] == "mismatch" - assert stages["05_donor_integration_synthesis"]["readiness"] == "must_rerun" - assert stages["05_donor_integration_synthesis"]["reason"] == ( - "Requested configuration does not match this stage's saved run inputs." - ) - assert stages["08_dataset_assembly"]["compatibility"] == "match" - - -def test_conditional_readiness_reports_manual_replay_without_requested_config(tmp_path): - (tmp_path / "synthetic_data.parquet").write_text("synthetic") - manifest = { - "config": {"n_synthetic": 10, "calibration_backend": "none"}, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": "synthetic_data.parquet"}, - } - - report = build_us_conditional_readiness_report( - tmp_path, - manifest_payload=manifest, - ) - - stages = {stage["stageId"]: stage for stage in report["stages"]} - assert stages["05_donor_integration_synthesis"]["compatibility"] == ( - "not_evaluated" - ) - assert stages["05_donor_integration_synthesis"]["readiness"] == "manual_replay" - assert stages["05_donor_integration_synthesis"]["reloadableArtifacts"] == [ - "05_donor_integration_synthesis.synthetic_data" - ] - - -def test_conditional_readiness_reports_missing_required_artifacts_as_rerun(tmp_path): - manifest = { - "config": {"n_synthetic": 10, "calibration_backend": "none"}, - "rows": {"synthetic": 1}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"synthetic_data": 
"synthetic_data.parquet"}, - } - - report = build_us_conditional_readiness_report( - tmp_path, - manifest_payload=manifest, - ) - - stages = {stage["stageId"]: stage for stage in report["stages"]} - assert stages["05_donor_integration_synthesis"]["readiness"] == "must_rerun" - assert "05_donor_integration_synthesis.synthetic_data" in stages[ - "05_donor_integration_synthesis" - ]["missingArtifacts"] - - -def test_conditional_readiness_reports_stage9_from_stage8_dataset(tmp_path): - (tmp_path / "policyengine_us.h5").write_text("dataset") - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - } - - report = build_us_conditional_readiness_report( - tmp_path, - manifest_payload=manifest, - ) - - stages = {stage["stageId"]: stage for stage in report["stages"]} - assert stages["09_validation_benchmarking"]["status"] == "deferred" - assert stages["09_validation_benchmarking"]["readiness"] == ( - "post_artifact_evidence" - ) - - -def test_write_and_load_us_conditional_readiness_report(tmp_path): - (tmp_path / "policyengine_us.h5").write_text("dataset") - manifest = { - "config": {"calibration_backend": "none"}, - "synthesis": {"source_names": ["source"], "scaffold_source": "source"}, - "calibration": {}, - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - } - - path = write_us_conditional_readiness_report( - tmp_path, - tmp_path / "stage_artifacts" / "conditional_readiness.json", - manifest_payload=manifest, - ) - loaded = load_us_conditional_readiness_report(path) - - assert loaded["schemaVersion"] == 1 - assert loaded["generatedAt"] is None - assert loaded["stages"][0]["stageId"] == "01_run_profile" - - -def test_load_us_conditional_readiness_report_rejects_unknown_schema(tmp_path): - path = tmp_path / "conditional_readiness.json" - path.write_text(json.dumps({"schemaVersion": 99})) - - with 
pytest.raises(RuntimeError, match="Unsupported US conditional-readiness"): - load_us_conditional_readiness_report(path) diff --git a/tests/pipelines/test_stage_run.py b/tests/pipelines/test_stage_run.py deleted file mode 100644 index f224b8ca..00000000 --- a/tests/pipelines/test_stage_run.py +++ /dev/null @@ -1,902 +0,0 @@ -"""Tests for typed US stage-run output manifests.""" - -import json -from dataclasses import fields - -import pytest - -from microplex_us.pipelines.stage_contracts import ( - US_CANONICAL_STAGE_IDS, - get_us_pipeline_stage_contract, - get_us_stage_artifact_contract, -) -from microplex_us.pipelines.stage_resume import ( - preflight_us_stage_resume, -) -from microplex_us.pipelines.stage_run import ( - US_STAGE_OUTPUT_MANIFEST_TYPES, - USArtifactRef, - USAuxiliaryArtifact, - USCalibrationOutputs, - USDatasetAssemblyOutputs, - USDiagnosticOutput, - USDonorSynthesisOutputs, - USPolicyEngineEntityOutputs, - USRunProfileOutputs, - USSeedScaffoldOutputs, - USSourceLoadingOutputs, - USSourcePlanningOutputs, - USStageInputOverride, - USStageRunWriter, - USValidationBenchmarkingOutputs, - build_us_stage_output_manifests_from_artifact_manifest, - parse_us_stage_input_override, - write_us_stage_run_manifests_from_artifact_manifest, -) - -_BASE_STAGE_MANIFEST_FIELDS = { - "schema_version", - "contract_version", - "input_stage_manifest", - "diagnostics", - "auxiliary_artifacts", - "metadata", - "complete", - "lifecycle_status", - "started_at", - "updated_at", - "completed_at", - "failed_at", - "deferred_reason", - "failure", - "events", - "stage_id", -} - - -def test_every_canonical_stage_has_typed_output_manifest(): - assert tuple(US_STAGE_OUTPUT_MANIFEST_TYPES) == US_CANONICAL_STAGE_IDS - - -def test_stage_output_manifests_use_contract_outputs_as_required_source(): - for stage_id, manifest_type in US_STAGE_OUTPUT_MANIFEST_TYPES.items(): - contract = get_us_pipeline_stage_contract(stage_id) - expected = tuple( - resource.key for resource in contract.outputs 
if resource.required - ) - output = manifest_type() - - assert output.required_output_keys() == expected - assert set(expected) <= {item.name for item in fields(manifest_type)} - - -def test_stage_output_manifest_fields_are_declared_by_contracts(): - for stage_id, manifest_type in US_STAGE_OUTPUT_MANIFEST_TYPES.items(): - contract = get_us_pipeline_stage_contract(stage_id) - contract_output_keys = {resource.key for resource in contract.outputs} - contract_artifact_keys = {artifact.key for artifact in contract.artifacts} - typed_output_fields = { - item.name - for item in fields(manifest_type) - if item.name not in _BASE_STAGE_MANIFEST_FIELDS - } - - assert contract_output_keys <= typed_output_fields - assert typed_output_fields <= contract_output_keys | contract_artifact_keys - - -def test_stage_run_writer_records_typed_stage_manifests(tmp_path): - _write_artifact_bundle_files(tmp_path) - manifest = _artifact_manifest() - - updated_manifest = write_us_stage_run_manifests_from_artifact_manifest( - tmp_path, - manifest, - ) - - assert (tmp_path / "manifest.json").exists() - assert ( - tmp_path - / "stage_artifacts" - / "manifests" - / "05_donor_integration_synthesis.json" - ).exists() - assert ( - tmp_path / "stage_artifacts" / "manifests" / "09_validation_benchmarking.json" - ).exists() - assert ( - updated_manifest["stage_output_manifests"]["07_calibration"] - == "stage_artifacts/manifests/07_calibration.json" - ) - stage5_manifest = json.loads( - ( - tmp_path - / "stage_artifacts" - / "manifests" - / "05_donor_integration_synthesis.json" - ).read_text() - ) - assert stage5_manifest["stageId"] == "05_donor_integration_synthesis" - assert stage5_manifest["diagnostics"] - assert stage5_manifest["inputStageManifest"] == ( - "stage_artifacts/manifests/04_seed_scaffold.json" - ) - _assert_stage_manifests_write_required_outputs(tmp_path, updated_manifest) - - -def test_stage_run_writer_writes_required_outputs_for_each_stage(tmp_path): - _write_mock_stage_prefix(tmp_path, 
US_CANONICAL_STAGE_IDS[-1]) - - manifest = json.loads((tmp_path / "manifest.json").read_text()) - _assert_stage_manifests_write_required_outputs(tmp_path, manifest) - - -@pytest.mark.parametrize( - ("previous_stage_id", "stage_id"), - zip(US_CANONICAL_STAGE_IDS, US_CANONICAL_STAGE_IDS[1:]), -) -def test_adjacent_stage_serialized_outputs_satisfy_next_stage_inputs( - tmp_path, - previous_stage_id, - stage_id, -): - _write_mock_stage_prefix(tmp_path, previous_stage_id) - - current_output = _mock_stage_output( - stage_id, - input_stage_manifest=_stage_manifest_ref(previous_stage_id), - ) - current_writer = USStageRunWriter(tmp_path) - - current_writer.record_stage(current_output) - - assert current_writer.recorded_stages == (current_output,) - - -def test_adjacent_stage_serialized_output_schema_breaks_next_stage_input(tmp_path): - seams = [ - (previous_stage_id, stage_id, resource.key) - for previous_stage_id, stage_id in zip( - US_CANONICAL_STAGE_IDS, - US_CANONICAL_STAGE_IDS[1:], - ) - for resource in get_us_pipeline_stage_contract(stage_id).inputs - if resource.required and resource.stage_id == previous_stage_id - ] - assert seams - - for previous_stage_id, stage_id, missing_key in seams: - seam_root = tmp_path / f"{previous_stage_id}-to-{stage_id}-{missing_key}" - _write_mock_stage_prefix( - seam_root, - previous_stage_id, - missing_stage_id=previous_stage_id, - missing_output_key=missing_key, - ) - - current_output = _mock_stage_output( - stage_id, - input_stage_manifest=_stage_manifest_ref(previous_stage_id), - ) - - with pytest.raises(ValueError, match=missing_key): - USStageRunWriter(seam_root).record_stage(current_output) - - -def test_stage_resume_preflight_reports_missing_required_artifact_paths(tmp_path): - manifest_dir = tmp_path / "stage_artifacts" / "manifests" - manifest_dir.mkdir(parents=True) - (tmp_path / "manifest.json").write_text( - json.dumps( - { - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - 
} - } - ) - ) - (manifest_dir / "05_donor_integration_synthesis.json").write_text( - json.dumps( - { - "contractVersion": "us-runtime-stages-v2", - "stageId": "05_donor_integration_synthesis", - "complete": True, - "lifecycleStatus": "complete", - "requiredOutputs": ["seed_data", "synthetic_data"], - "outputs": { - "seed_data": { - "path": "seed_data.parquet", - "exists": True, - }, - "synthetic_data": { - "path": "synthetic_data.parquet", - "exists": True, - }, - }, - } - ) - ) - - preflight = preflight_us_stage_resume( - tmp_path, - "06_policyengine_entities", - ) - - assert not preflight.ok - missing = {item.label for item in preflight.missing} - assert "05_donor_integration_synthesis.seed_data" in missing - assert "05_donor_integration_synthesis.synthetic_data" in missing - - -def test_stage_run_writer_rejects_missing_diagnostics(tmp_path): - writer = USStageRunWriter(tmp_path) - output = USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config={"n_synthetic": 10}, - provider_query_plan={"source_names": ["source"]}, - ) - - with pytest.raises(ValueError, match="does not expose diagnostics"): - writer.record_stage(output) - - -def test_stage_run_writer_requires_prior_stage_or_override(tmp_path): - output = USSourceLoadingOutputs( - observation_frame_summary={"source_count": 1}, - source_descriptors=("source",), - source_relationships={"status": "summarized"}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"source_names": ["source"]}, - ) - }, - ) - - with pytest.raises(ValueError, match="requires 01_run_profile"): - USStageRunWriter(tmp_path).record_stage(output) - - with pytest.raises(ValueError, match="require allow_stage_input_overrides"): - USStageRunWriter( - tmp_path, - stage_input_overrides=( - USStageInputOverride( - stage_id="02_source_loading", - key="provider_query_plan", - 
path="overrides/provider_query_plan.json", - ), - ), - ) - - writer = USStageRunWriter( - tmp_path, - allow_stage_input_overrides=True, - stage_input_overrides=( - USStageInputOverride( - stage_id="02_source_loading", - key="provider_query_plan", - path="overrides/provider_query_plan.json", - reason="test override", - ), - ), - ) - writer.record_stage(output) - assert writer.recorded_stages == (output,) - - -def test_stage_run_writer_requires_specific_input_override(tmp_path): - output = USSourceLoadingOutputs( - observation_frame_summary={"source_count": 1}, - source_descriptors=("source",), - source_relationships={"status": "summarized"}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"source_names": ["source"]}, - ) - }, - ) - - writer = USStageRunWriter( - tmp_path, - allow_stage_input_overrides=True, - stage_input_overrides=( - USStageInputOverride( - stage_id="02_source_loading", - key="source_datasets", - path="overrides/source_datasets.json", - ), - ), - ) - - with pytest.raises(ValueError, match="provider_query_plan"): - writer.record_stage(output) - - -def test_stage_run_writer_validates_required_inputs_from_prior_manifest(tmp_path): - writer = USStageRunWriter(tmp_path) - writer.record_stage( - USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config={"n_synthetic": 10}, - provider_query_plan={}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"has_config": True}, - ) - }, - complete=False, - ) - ) - output = USSourceLoadingOutputs( - observation_frame_summary={"source_count": 1}, - source_descriptors=("source",), - source_relationships={"status": "summarized"}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"source_names": ["source"]}, - ) - }, - ) - - with pytest.raises(ValueError, 
match="01_run_profile.provider_query_plan"): - writer.record_stage(output) - - -def test_stage_run_writer_requires_prior_stage_even_without_stage_bound_inputs( - tmp_path, -): - output = USSourcePlanningOutputs( - scaffold_selection={"scaffold_source": "source"}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"scaffold_source": "source"}, - ) - }, - complete=False, - ) - - with pytest.raises(ValueError, match="requires 02_source_loading"): - USStageRunWriter(tmp_path).record_stage(output) - - -def test_stage_run_writer_rejects_arbitrary_input_manifest(tmp_path): - arbitrary_manifest = tmp_path / "arbitrary.json" - arbitrary_manifest.write_text("{}") - output = USSourceLoadingOutputs( - input_stage_manifest="arbitrary.json", - observation_frame_summary={"source_count": 1}, - source_descriptors=("source",), - source_relationships={"status": "summarized"}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"source_names": ["source"]}, - ) - }, - ) - - with pytest.raises(ValueError, match="requires 01_run_profile"): - USStageRunWriter(tmp_path).record_stage(output) - - -def test_stage_run_writer_rejects_empty_required_structured_outputs(tmp_path): - output = USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config={}, - provider_query_plan={"source_names": ["source"]}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"has_config": False}, - ) - }, - ) - - with pytest.raises(ValueError, match="resolved_config"): - USStageRunWriter(tmp_path).record_stage(output) - - -def test_stage_run_writer_rejects_undeclared_auxiliary_artifact(tmp_path): - writer = USStageRunWriter(tmp_path) - output = USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - 
resolved_config={"n_synthetic": 10}, - provider_query_plan={"source_names": ["source"]}, - diagnostics={ - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"has_config": True}, - ) - }, - auxiliary_artifacts={ - "not_declared": USAuxiliaryArtifact( - key="not_declared", - path="not_declared.json", - format="json", - ) - }, - ) - - with pytest.raises(KeyError, match="not declared"): - writer.update(output) - - -def test_parse_us_stage_input_override(): - override = parse_us_stage_input_override( - "02_source_loading.provider_query_plan=overrides/provider_query_plan.json" - ) - - assert override == USStageInputOverride( - stage_id="02_source_loading", - key="provider_query_plan", - path="overrides/provider_query_plan.json", - ) - - with pytest.raises(ValueError, match="STAGE_ID.KEY=PATH"): - parse_us_stage_input_override("02_source_loading=missing-key") - - with pytest.raises(ValueError, match="Unknown US pipeline stage"): - parse_us_stage_input_override("unknown_stage.provider_query_plan=override.json") - - with pytest.raises(ValueError, match="Unknown input override key"): - parse_us_stage_input_override("02_source_loading.not_an_input=override.json") - - -def test_build_stage_outputs_from_manifest_exposes_diagnostics(tmp_path): - _write_artifact_bundle_files(tmp_path) - outputs = build_us_stage_output_manifests_from_artifact_manifest( - tmp_path, - _artifact_manifest(), - ) - - assert len(outputs) == 9 - assert all(output.diagnostics for output in outputs) - stage6 = outputs[5] - assert "policyengine_dataset" not in stage6.materialized_policyengine_inputs - assert stage6.materialized_policyengine_inputs["tables"]["households"]["rows"] == 1 - - -def test_build_stage_outputs_treats_missing_declared_dataset_as_incomplete( - tmp_path, -): - _write_artifact_bundle_files(tmp_path) - (tmp_path / "policyengine_us.h5").unlink() - - outputs = build_us_stage_output_manifests_from_artifact_manifest( - tmp_path, - _artifact_manifest(), - ) - - stage8 = 
outputs[7] - assert stage8.complete is False - assert stage8.missing_required_outputs(tmp_path) == ("policyengine_dataset",) - - -def test_build_stage_outputs_hydrates_stage9_summary_from_validation_evidence( - tmp_path, -): - _write_artifact_bundle_files(tmp_path) - evidence_path = _write_validation_evidence_manifest(tmp_path) - manifest = _artifact_manifest() - manifest.pop("policyengine_native_scores") - manifest["artifacts"]["validation_evidence"] = str( - evidence_path.relative_to(tmp_path) - ) - - outputs = build_us_stage_output_manifests_from_artifact_manifest( - tmp_path, - manifest, - ) - - stage9 = outputs[8] - assert stage9.complete is True - assert stage9.benchmark_summary == { - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": -0.1, - } - } - assert stage9.diagnostics["stage_summary"].summary == stage9.benchmark_summary - - -def test_build_stage_outputs_does_not_complete_stage9_from_stale_evidence_summary( - tmp_path, -): - _write_artifact_bundle_files(tmp_path) - evidence_path = _write_validation_evidence_manifest(tmp_path) - (tmp_path / "policyengine_native_scores.json").unlink() - manifest = _artifact_manifest() - manifest.pop("policyengine_native_scores") - manifest["artifacts"]["validation_evidence"] = str( - evidence_path.relative_to(tmp_path) - ) - - outputs = build_us_stage_output_manifests_from_artifact_manifest( - tmp_path, - manifest, - ) - - stage9 = outputs[8] - assert stage9.complete is False - assert stage9.benchmark_summary == {} - - -def test_stage_run_writer_preserves_existing_validation_evidence_summary( - tmp_path, -): - _write_artifact_bundle_files(tmp_path) - evidence_path = _write_validation_evidence_manifest(tmp_path) - manifest = _artifact_manifest() - manifest.pop("policyengine_native_scores") - manifest["artifacts"]["validation_evidence"] = str( - evidence_path.relative_to(tmp_path) - ) - - write_us_stage_run_manifests_from_artifact_manifest(tmp_path, manifest) - - stage9_manifest = json.loads( - ( - 
tmp_path - / "stage_artifacts" - / "manifests" - / "09_validation_benchmarking.json" - ).read_text() - ) - rewritten_evidence = json.loads(evidence_path.read_text()) - - assert stage9_manifest["complete"] is True - assert stage9_manifest["outputs"]["benchmark_summary"] == { - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": -0.1, - } - } - assert rewritten_evidence["summaries"] == { - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": -0.1, - } - } - assert any( - record["key"] == "policyengine_native_scores" - and record["path"] == "policyengine_native_scores.json" - and record["exists"] is True - for record in rewritten_evidence["evidence"] - ) - - -def _write_mock_stage_prefix( - root, - through_stage_id, - *, - missing_stage_id=None, - missing_output_key=None, -): - writer = USStageRunWriter(root) - for stage_id in US_CANONICAL_STAGE_IDS[ - : US_CANONICAL_STAGE_IDS.index(through_stage_id) + 1 - ]: - writer.record_stage( - _mock_stage_output( - stage_id, - missing_output_key=( - missing_output_key if stage_id == missing_stage_id else None - ), - complete=stage_id != missing_stage_id, - ) - ) - writer.write_manifest_files() - - -def _mock_stage_output( - stage_id, - *, - input_stage_manifest=None, - missing_output_key=None, - complete=True, -): - diagnostics = { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - summary={"stage_id": stage_id}, - ) - } - common = { - "input_stage_manifest": input_stage_manifest, - "diagnostics": diagnostics, - "complete": complete, - } - values = _mock_stage_output_values(stage_id) - if missing_output_key is not None: - values[missing_output_key] = _missing_output_value(values[missing_output_key]) - return _mock_stage_output_type(stage_id)(**common, **values) - - -def _mock_stage_output_type(stage_id): - return { - "01_run_profile": USRunProfileOutputs, - "02_source_loading": USSourceLoadingOutputs, - "03_source_planning": USSourcePlanningOutputs, - "04_seed_scaffold": 
USSeedScaffoldOutputs, - "05_donor_integration_synthesis": USDonorSynthesisOutputs, - "06_policyengine_entities": USPolicyEngineEntityOutputs, - "07_calibration": USCalibrationOutputs, - "08_dataset_assembly": USDatasetAssemblyOutputs, - "09_validation_benchmarking": USValidationBenchmarkingOutputs, - }[stage_id] - - -def _mock_stage_output_values(stage_id): - if stage_id == "01_run_profile": - return { - "manifest": _mock_artifact_ref("01_run_profile", "manifest"), - "resolved_config": {"n_synthetic": 10}, - "provider_query_plan": {"source_names": ["source"]}, - } - if stage_id == "02_source_loading": - return { - "observation_frame_summary": {"source_count": 1}, - "source_descriptors": ("source",), - "source_relationships": {"status": "valid"}, - } - if stage_id == "03_source_planning": - return { - "source_plan": _mock_artifact_ref("03_source_planning", "source_plan"), - "scaffold_selection": {"scaffold_source": "source"}, - } - if stage_id == "04_seed_scaffold": - return { - "scaffold_seed_data": _mock_artifact_ref( - "04_seed_scaffold", - "scaffold_seed_data", - ), - "seed_schema_metadata": {"required_columns": ["person_id"]}, - } - if stage_id == "05_donor_integration_synthesis": - return { - "seed_data": _mock_artifact_ref( - "05_donor_integration_synthesis", - "seed_data", - ), - "synthetic_data": _mock_artifact_ref( - "05_donor_integration_synthesis", - "synthetic_data", - ), - "synthesis_metadata": {"backend": "mock"}, - "source_weight_diagnostics": _mock_artifact_ref( - "05_donor_integration_synthesis", - "source_weight_diagnostics", - category="diagnostic", - ), - } - if stage_id == "06_policyengine_entities": - return { - "pre_calibration_policyengine_entity_tables": _mock_artifact_ref( - "06_policyengine_entities", - "pre_calibration_policyengine_entity_tables", - ), - "materialized_policyengine_inputs": {"tables": {"households": {"rows": 1}}}, - } - if stage_id == "07_calibration": - return { - "calibrated_data": _mock_artifact_ref( - 
"07_calibration", - "calibrated_data", - ), - "targets": _mock_artifact_ref("07_calibration", "targets"), - "calibration_summary": _mock_artifact_ref( - "07_calibration", - "calibration_summary", - category="diagnostic", - ), - "policyengine_entity_tables": _mock_artifact_ref( - "07_calibration", - "policyengine_entity_tables", - ), - "target_ledger": {"target_count": 1}, - } - if stage_id == "08_dataset_assembly": - return { - "policyengine_dataset": _mock_artifact_ref( - "08_dataset_assembly", - "policyengine_dataset", - ), - "stage_manifest": _mock_artifact_ref( - "08_dataset_assembly", - "stage_manifest", - category="derived", - ), - "data_flow_snapshot": _mock_artifact_ref( - "08_dataset_assembly", - "data_flow_snapshot", - category="derived", - ), - "artifact_inventory": _mock_artifact_ref( - "08_dataset_assembly", - "artifact_inventory", - category="derived", - ), - "conditional_readiness": _mock_artifact_ref( - "08_dataset_assembly", - "conditional_readiness", - category="derived", - ), - } - if stage_id == "09_validation_benchmarking": - return { - "validation_evidence": _mock_artifact_ref( - "09_validation_benchmarking", - "validation_evidence", - ), - "benchmark_summary": {"loss_delta": -0.1}, - "policyengine_native_scores": _mock_artifact_ref( - "09_validation_benchmarking", - "policyengine_native_scores", - category="diagnostic", - ), - } - raise KeyError(stage_id) - - -def _mock_artifact_ref(stage_id, artifact_key, *, category="required_output"): - contract = get_us_stage_artifact_contract(stage_id, artifact_key) - return USArtifactRef( - key=artifact_key, - path=contract.path_hint or f"stage_artifacts/{stage_id}/{artifact_key}", - format=contract.format, - required=contract.required, - category=category, - resume_role=contract.resume_role, - assume_exists=True, - ) - - -def _missing_output_value(value): - return None if isinstance(value, USArtifactRef) else type(value)() - - -def _stage_manifest_ref(stage_id): - return 
f"stage_artifacts/manifests/{stage_id}.json" - - -def _assert_stage_manifests_write_required_outputs(root, manifest): - assert tuple(manifest["stage_output_manifests"]) == US_CANONICAL_STAGE_IDS - - for stage_id in US_CANONICAL_STAGE_IDS: - contract = get_us_pipeline_stage_contract(stage_id) - required_outputs = tuple( - resource.key for resource in contract.outputs if resource.required - ) - manifest_path = root / manifest["stage_output_manifests"][stage_id] - stage_manifest = json.loads(manifest_path.read_text()) - - assert stage_manifest["stageId"] == stage_id - assert tuple(stage_manifest["requiredOutputs"]) == required_outputs - assert set(required_outputs) <= set(stage_manifest["outputs"]) - assert not stage_manifest["missingRequiredOutputs"] - - -def _write_artifact_bundle_files(root): - for relative in ( - "seed_data.parquet", - "synthetic_data.parquet", - "calibrated_data.parquet", - "targets.json", - "policyengine_us.h5", - "policyengine_native_scores.json", - "source_weight_diagnostics.json", - "stage_artifacts/03_source_planning/source_plan.json", - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet", - "stage_artifacts/06_policyengine_entities/metadata.json", - "stage_artifacts/07_calibration/calibration_summary.json", - "stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json", - ): - path = root / relative - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("{}") - ( - root / "stage_artifacts" / "06_policyengine_entities" / "metadata.json" - ).write_text( - json.dumps( - { - "format_version": 1, - "stage": "post_microsim", - "households": {"rows": 1, "columns": ["household_id"]}, - "persons": {"rows": 1, "columns": ["person_id"]}, - } - ) - ) - - -def _write_validation_evidence_manifest(root): - evidence_path = ( - root - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ) - evidence_path.parent.mkdir(parents=True, exist_ok=True) - evidence_path.write_text( - json.dumps( - { - 
"formatVersion": 1, - "stageId": "09_validation_benchmarking", - "evidence": [ - { - "key": "policyengine_native_scores", - "path": "policyengine_native_scores.json", - "exists": True, - } - ], - "summaries": { - "policyengine_native_scores": { - "enhanced_cps_native_loss_delta": -0.1, - } - }, - } - ) - ) - return evidence_path - - -def _artifact_manifest(): - return { - "created_at": "2026-05-30T00:00:00+00:00", - "config": {"n_synthetic": 10, "calibration_backend": "entropy"}, - "rows": {"seed": 1, "synthetic": 1, "calibrated": 1}, - "synthesis": { - "source_names": ["source"], - "scaffold_source": "source", - "backend": "seed", - }, - "calibration": {"backend": "entropy", "converged": True}, - "policyengine_native_scores": {"enhanced_cps_native_loss_delta": -0.1}, - "artifacts": { - "seed_data": "seed_data.parquet", - "synthetic_data": "synthetic_data.parquet", - "calibrated_data": "calibrated_data.parquet", - "targets": "targets.json", - "policyengine_dataset": "policyengine_us.h5", - "policyengine_native_scores": "policyengine_native_scores.json", - "source_weight_diagnostics": "source_weight_diagnostics.json", - "source_plan": "stage_artifacts/03_source_planning/source_plan.json", - "scaffold_seed_data": ( - "stage_artifacts/04_seed_scaffold/scaffold_seed_data.parquet" - ), - "pre_calibration_policyengine_entity_tables": ( - "stage_artifacts/06_policyengine_entities/metadata.json" - ), - "policyengine_entity_tables": ( - "stage_artifacts/07_calibration/policyengine_entity_tables/metadata.json" - ), - "calibration_summary": ( - "stage_artifacts/07_calibration/calibration_summary.json" - ), - }, - } diff --git a/tests/pipelines/test_stage_runtime.py b/tests/pipelines/test_stage_runtime.py deleted file mode 100644 index f724dd96..00000000 --- a/tests/pipelines/test_stage_runtime.py +++ /dev/null @@ -1,341 +0,0 @@ -"""Tests for live US stage runtime manifest updates.""" - -import json - -import pytest -from microplex.core import RelationshipCardinality - -import 
microplex_us.pipelines.stage_runtime as stage_runtime_module -from microplex_us.pipelines.stage_contracts import US_STAGE_CONTRACT_VERSION -from microplex_us.pipelines.stage_run import ( - USArtifactRef, - USDiagnosticOutput, - USRunProfileOutputs, - USSourceLoadingOutputs, - USStageInputOverride, - USValidationBenchmarkingOutputs, -) -from microplex_us.pipelines.stage_runtime import USStageRuntimeWriter - - -def _diagnostics(stage_id: str) -> dict[str, USDiagnosticOutput]: - return { - "stage_summary": USDiagnosticOutput( - key="stage_summary", - description=f"Summary for {stage_id}.", - summary={"stage": stage_id}, - ) - } - - -def test_runtime_writer_requires_previous_stage_completion_to_start(tmp_path): - writer = USStageRuntimeWriter(tmp_path) - - with pytest.raises(ValueError, match="01_run_profile to be complete"): - writer.start_stage("02_source_loading") - - writer.start_stage("01_run_profile") - - with pytest.raises(ValueError, match="01_run_profile to be complete"): - writer.start_stage("02_source_loading") - - -def test_runtime_writer_completes_stage_and_exposes_lifecycle(tmp_path): - writer = USStageRuntimeWriter(tmp_path) - writer.start_stage("01_run_profile", metadata={"profile": "test"}) - writer.complete_stage( - USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config={"calibration_backend": "none"}, - provider_query_plan={"source_names": ["unit"]}, - diagnostics=_diagnostics("01_run_profile"), - ) - ) - - writer.start_stage("02_source_loading") - writer.complete_stage( - USSourceLoadingOutputs( - observation_frame_summary={"source_count": 1}, - source_descriptors=("unit",), - source_relationships={"household_person": "ok"}, - diagnostics=_diagnostics("02_source_loading"), - ) - ) - - stage2_path = tmp_path / "stage_artifacts" / "manifests" / "02_source_loading.json" - stage2 = json.loads(stage2_path.read_text()) - aggregate = 
json.loads((tmp_path / "stage_manifest.json").read_text()) - aggregate_stage2 = {stage["id"]: stage for stage in aggregate["stages"]}[ - "02_source_loading" - ] - - assert stage2["lifecycleStatus"] == "complete" - assert stage2["inputStageManifest"] == ( - "stage_artifacts/manifests/01_run_profile.json" - ) - assert aggregate_stage2["lifecycleStatus"] == "complete" - assert aggregate_stage2["outputManifest"] == ( - "stage_artifacts/manifests/02_source_loading.json" - ) - assert aggregate_stage2["completedAt"] is not None - assert [event["event"] for event in stage2["events"]] == [ - "stage_started", - "stage_completed", - ] - - -def test_runtime_writer_finalize_preserves_completed_stage_lifecycle(tmp_path): - writer = USStageRuntimeWriter( - tmp_path, - manifest_payload={ - "config": {"calibration_backend": "none"}, - "artifacts": {"manifest": "manifest.json"}, - }, - ) - writer.start_stage("01_run_profile", metadata={"profile": "test"}) - writer.complete_stage( - USRunProfileOutputs( - manifest=USArtifactRef( - key="manifest", - path="manifest.json", - format="json", - required=True, - assume_exists=True, - ), - resolved_config={"calibration_backend": "none"}, - provider_query_plan={"source_names": ["unit"]}, - diagnostics=_diagnostics("01_run_profile"), - ) - ) - stage1_path = tmp_path / "stage_artifacts" / "manifests" / "01_run_profile.json" - before = json.loads(stage1_path.read_text()) - - writer.finalize_from_artifact_manifest( - { - "config": {"calibration_backend": "none"}, - "artifacts": {"manifest": "manifest.json"}, - "synthesis": {"source_names": ["unit"]}, - } - ) - - after = json.loads(stage1_path.read_text()) - assert after["lifecycleStatus"] == "complete" - assert after["startedAt"] == before["startedAt"] - assert after["updatedAt"] == before["updatedAt"] - assert after["completedAt"] == before["completedAt"] - assert after["events"] == before["events"] - - -def test_runtime_writer_finalize_rehydrates_failed_stage_outputs( - tmp_path, - monkeypatch, 
-): - previous_stage = ( - tmp_path / "stage_artifacts" / "manifests" / "08_dataset_assembly.json" - ) - previous_stage.parent.mkdir(parents=True, exist_ok=True) - previous_stage.write_text( - json.dumps( - { - "stageId": "08_dataset_assembly", - "outputs": { - "policyengine_dataset": { - "key": "policyengine_dataset", - "path": "policyengine_us.h5", - "exists": True, - } - }, - } - ) - ) - (tmp_path / "policyengine_us.h5").write_text("{}") - evidence_path = ( - tmp_path - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ) - evidence_path.parent.mkdir(parents=True, exist_ok=True) - evidence_path.write_text("{}") - - writer = USStageRuntimeWriter(tmp_path) - writer.record_output( - "09_validation_benchmarking", - "validation_evidence", - USArtifactRef( - key="validation_evidence", - path=evidence_path.relative_to(tmp_path), - format="json", - required=True, - assume_exists=True, - ), - ) - writer.record_output( - "09_validation_benchmarking", - "benchmark_summary", - {"policyengine_native_scores": {"loss_delta": -0.1}}, - ) - writer.fail_stage("09_validation_benchmarking", ValueError("finalize failed")) - - def _empty_rebuilt_stage_outputs(*_args, **_kwargs): - return ( - USValidationBenchmarkingOutputs( - diagnostics=_diagnostics("09_validation_benchmarking"), - complete=False, - ), - ) - - monkeypatch.setattr( - stage_runtime_module, - "build_us_stage_output_manifests_from_artifact_manifest", - _empty_rebuilt_stage_outputs, - ) - - writer.finalize_from_artifact_manifest( - { - "artifacts": {"policyengine_dataset": "policyengine_us.h5"}, - "config": {"calibration_backend": "microcalibrate"}, - } - ) - - stage9 = json.loads( - ( - tmp_path - / "stage_artifacts" - / "manifests" - / "09_validation_benchmarking.json" - ).read_text() - ) - assert stage9["complete"] is True - assert stage9["lifecycleStatus"] == "complete" - assert stage9["failedAt"] is None - assert stage9["failure"] is None - assert stage9["missingRequiredOutputs"] == 
[] - assert stage9["outputs"]["benchmark_summary"] == { - "policyengine_native_scores": {"loss_delta": -0.1} - } - assert stage9["outputs"]["validation_evidence"]["exists"] is True - - -def test_runtime_writer_serializes_enum_outputs(tmp_path): - writer = USStageRuntimeWriter( - tmp_path, - allow_stage_input_overrides=True, - stage_input_overrides=( - USStageInputOverride( - stage_id="02_source_loading", - key="provider_query_plan", - path="overrides/provider_query_plan.json", - ), - ), - ) - writer.start_stage("02_source_loading") - writer.complete_stage( - USSourceLoadingOutputs( - observation_frame_summary={"source_count": 1}, - source_descriptors=("unit",), - source_relationships={ - "unit": [{"cardinality": RelationshipCardinality.ONE_TO_MANY}] - }, - diagnostics=_diagnostics("02_source_loading"), - ) - ) - - stage2 = json.loads( - ( - tmp_path / "stage_artifacts" / "manifests" / "02_source_loading.json" - ).read_text() - ) - - assert stage2["outputs"]["source_relationships"]["unit"][0]["cardinality"] == ( - "one_to_many" - ) - - -def test_runtime_writer_records_overrides_in_running_manifest(tmp_path): - writer = USStageRuntimeWriter( - tmp_path, - allow_stage_input_overrides=True, - stage_input_overrides=( - USStageInputOverride( - stage_id="02_source_loading", - key="provider_query_plan", - path="overrides/provider_query_plan.json", - reason="unit test", - ), - ), - ) - - writer.start_stage("02_source_loading") - stage2 = json.loads( - ( - tmp_path / "stage_artifacts" / "manifests" / "02_source_loading.json" - ).read_text() - ) - - assert stage2["inputOverrides"] == [ - { - "stageId": "02_source_loading", - "key": "provider_query_plan", - "path": "overrides/provider_query_plan.json", - "reason": "unit test", - } - ] - - -def test_runtime_writer_refreshes_root_manifest_on_stage_start(tmp_path): - writer = USStageRuntimeWriter(tmp_path) - - writer.start_stage("01_run_profile") - manifest = json.loads((tmp_path / "manifest.json").read_text()) - - assert 
manifest["stage_output_manifests"]["01_run_profile"] == ( - "stage_artifacts/manifests/01_run_profile.json" - ) - - -def test_runtime_writer_rejects_stale_complete_previous_manifest(tmp_path): - writer = USStageRuntimeWriter(tmp_path) - stage1_path = tmp_path / "stage_artifacts" / "manifests" / "01_run_profile.json" - stage1_path.parent.mkdir(parents=True) - stage1_path.write_text( - json.dumps( - { - "stageId": "01_run_profile", - "contractVersion": US_STAGE_CONTRACT_VERSION, - "lifecycleStatus": "complete", - "requiredOutputs": ["manifest"], - "missingRequiredOutputs": ["manifest"], - "outputs": {}, - } - ) - ) - - with pytest.raises(ValueError, match="missing required outputs"): - writer.start_stage("02_source_loading") - - -def test_runtime_writer_update_writes_json_artifact_reference(tmp_path): - writer = USStageRuntimeWriter(tmp_path) - payload = writer.record_output( - "03_source_planning", - "source_plan", - {"scaffoldSource": "cps"}, - path="stage_artifacts/03_source_planning/source_plan.json", - ) - - source_plan_path = ( - tmp_path / "stage_artifacts" / "03_source_planning" / "source_plan.json" - ) - - assert json.loads(source_plan_path.read_text()) == {"scaffoldSource": "cps"} - assert payload["outputs"]["source_plan"]["path"] == ( - "stage_artifacts/03_source_planning/source_plan.json" - ) - assert payload["outputs"]["source_plan"]["exists"] is True diff --git a/tests/pipelines/test_summarize_child_tax_unit_agi_drift.py b/tests/pipelines/test_summarize_child_tax_unit_agi_drift.py deleted file mode 100644 index f4da63c1..00000000 --- a/tests/pipelines/test_summarize_child_tax_unit_agi_drift.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import pandas as pd - -from microplex_us.pipelines.summarize_child_tax_unit_agi_drift import ( - summarize_child_tax_unit_agi_drift, -) - - -def test_summarize_child_tax_unit_agi_drift(tmp_path: Path) -> None: - artifact_dir = tmp_path / "artifact" - 
artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text("{}") - frame = pd.DataFrame( - { - "age": [5, 35, 12], - "is_tax_unit_dependent": [1, 0, 1], - "tax_unit_id": ["tu1", "tu2", "tu1"], - "partnership_s_corp_income": [0.0, 100.0, 5.0], - "taxable_interest_income": [0.0, 10.0, 0.0], - } - ) - frame.to_parquet(artifact_dir / "seed_data.parquet", index=False) - - payload = summarize_child_tax_unit_agi_drift(artifact_dir) - - seed = payload["stages"]["seed"] - assert seed["row_count"] == 3 - dependents = seed["subsets"]["dependents_under_20"]["partnership_s_corp_income"] - assert dependents["count"] == 2 - assert dependents["sum"] == 5.0 - assert dependents["nonzero_share"] == 0.5 - - tax_units = seed["tax_unit_subsets"]["with_children"]["partnership_s_corp_income"] - assert tax_units["count"] == 1 - assert tax_units["sum"] == 5.0 diff --git a/tests/pipelines/test_summarize_donor_conditioning.py b/tests/pipelines/test_summarize_donor_conditioning.py deleted file mode 100644 index 4e5e8d0d..00000000 --- a/tests/pipelines/test_summarize_donor_conditioning.py +++ /dev/null @@ -1,215 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path - -from microplex_us.pipelines.summarize_donor_conditioning import ( - summarize_donor_conditioning, -) - - -def test_summarize_donor_conditioning_filters_and_counts(tmp_path: Path) -> None: - artifact_dir = tmp_path / "artifact" - artifact_dir.mkdir() - (artifact_dir / "manifest.json").write_text( - json.dumps( - { - "synthesis": { - "donor_conditioning_diagnostics": [ - { - "donor_source": "irs_soi_puf_2024", - "model_variables": ["taxable_interest_income"], - "restored_variables": ["taxable_interest_income"], - "condition_selection": "pe_prespecified", - "used_condition_surface": False, - "raw_shared_vars": [ - "age", - "employment_status", - "income", - "state_fips", - ], - "shared_vars_after_model_exclusion": [ - "age", - "employment_status", - "income", - "state_fips", - ], - 
"projection_applied": False, - "entity_compatible_shared_vars": [], - "requested_supplemental_shared_condition_vars": [ - "employment_status", - "income", - "state_fips", - ], - "requested_challenger_shared_condition_vars": [ - "self_employment_income", - "rental_income", - ], - "raw_supplemental_shared_condition_var_status": [ - { - "variable": "employment_status", - "selected": False, - "in_shared_overlap": False, - "reason": "incompatible_condition_support", - }, - { - "variable": "income", - "selected": False, - "in_shared_overlap": False, - "reason": "excluded_from_shared_overlap", - }, - { - "variable": "state_fips", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - ], - "raw_challenger_shared_condition_var_status": [ - { - "variable": "self_employment_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "rental_income", - "selected": False, - "in_shared_overlap": False, - "reason": "excluded_from_shared_overlap", - }, - ], - "supplemental_shared_condition_var_status": [ - { - "variable": "employment_status", - "selected": False, - "in_shared_overlap": False, - "reason": "missing_current_column", - }, - { - "variable": "income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "state_fips", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - ], - "challenger_shared_condition_var_status": [ - { - "variable": "self_employment_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "rental_income", - "selected": False, - "in_shared_overlap": False, - "reason": "missing_current_column", - }, - ], - "selected_condition_vars": [ - "age", - "income", - "state_fips", - ], - "dropped_shared_vars": ["education", "tenure"], - }, - { - "donor_source": "scf", - "model_variables": ["bank_account_assets"], - "restored_variables": ["bank_account_assets"], - 
"condition_selection": "top_correlated", - "used_condition_surface": False, - "selected_condition_vars": ["age", "income"], - "dropped_shared_vars": ["state_fips"], - }, - ] - } - } - ) - ) - - payload = summarize_donor_conditioning( - artifact_dir, - focus_variables=("taxable_interest_income",), - ) - - assert payload["block_count"] == 1 - assert payload["focus_variables"] == ["taxable_interest_income"] - assert payload["selected_condition_var_frequency"] == { - "age": 1, - "income": 1, - "state_fips": 1, - } - assert payload["dropped_shared_var_frequency"] == { - "education": 1, - "tenure": 1, - } - assert payload["supplemental_shared_condition_reason_frequency"] == { - "missing_current_column": 1, - "selected": 2, - } - assert payload["raw_supplemental_shared_condition_reason_frequency"] == { - "excluded_from_shared_overlap": 1, - "incompatible_condition_support": 1, - "selected": 1, - } - assert payload["raw_challenger_shared_condition_reason_frequency"] == { - "excluded_from_shared_overlap": 1, - "selected": 1, - } - assert payload["challenger_shared_condition_reason_frequency"] == { - "missing_current_column": 1, - "selected": 1, - } - assert payload["blocks"][0]["donor_source"] == "irs_soi_puf_2024" - assert payload["blocks"][0]["raw_shared_vars"] == [ - "age", - "employment_status", - "income", - "state_fips", - ] - assert payload["blocks"][0]["raw_supplemental_shared_condition_var_status"] == [ - { - "variable": "employment_status", - "selected": False, - "in_shared_overlap": False, - "reason": "incompatible_condition_support", - }, - { - "variable": "income", - "selected": False, - "in_shared_overlap": False, - "reason": "excluded_from_shared_overlap", - }, - { - "variable": "state_fips", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - ] - assert payload["blocks"][0]["requested_challenger_shared_condition_vars"] == [ - "self_employment_income", - "rental_income", - ] - assert 
payload["blocks"][0]["raw_challenger_shared_condition_var_status"] == [ - { - "variable": "self_employment_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "rental_income", - "selected": False, - "in_shared_overlap": False, - "reason": "excluded_from_shared_overlap", - }, - ] diff --git a/tests/pipelines/test_summarize_pe_native_family_drilldown.py b/tests/pipelines/test_summarize_pe_native_family_drilldown.py deleted file mode 100644 index a55a5038..00000000 --- a/tests/pipelines/test_summarize_pe_native_family_drilldown.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Tests for PE-native family drilldown summaries.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from microplex_us.pipelines.summarize_pe_native_family_drilldown import ( - classify_pe_native_target_family, - summarize_us_pe_native_family_drilldown, -) - - -def test_classify_pe_native_target_family_covers_national_irs_bucket() -> None: - assert ( - classify_pe_native_target_family( - "nation/irs/count/count/AGI in 500k-1m/taxable/Single" - ) - == "national_irs_other" - ) - - -def test_summarize_us_pe_native_family_drilldown_aggregates_matching_targets( - tmp_path, -) -> None: - root = tmp_path / "root" - _write_audit_bundle( - root / "run-a", - largest_regressing_family="national_irs_other", - top_targets=[ - ( - "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - 100.0, - ), - ( - "state/WI/adjusted_gross_income/count/500000_inf", - 250.0, - ), - ], - filing_status_gaps=[("JOINT", 1000.0), ("SEPARATE", -250.0)], - mfs_agi_gaps=[("100k_to_200k", -50.0), ("500k_plus", -10.0)], - ) - _write_audit_bundle( - root / "run-b", - largest_regressing_family="state_agi_distribution", - top_targets=[ - ( - "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - 75.0, - ), - ( - "nation/irs/count/count/AGI in 500k-1m/taxable/Single", - 60.0, - ), - ], - filing_status_gaps=[("HEAD_OF_HOUSEHOLD", 500.0)], 
- mfs_agi_gaps=[("75k_to_100k", -20.0)], - ) - - summary = summarize_us_pe_native_family_drilldown( - [root], - family="national_irs_other", - top_k=5, - ) - - assert summary["totalAudits"] == 2 - assert summary["auditsWithMatchingTargets"] == 2 - assert summary["auditsWhereFamilyLeads"] == 1 - assert summary["matchingTargetCounts"][0] == { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "count": 2, - "weightedTermDeltaSum": 175.0, - "weightedTermDeltaMean": 87.5, - } - assert summary["leadTargetCounts"] == [ - { - "target": "nation/irs/ordinary dividends/total/AGI in 500k-1m/taxable/All", - "count": 1, - "weightedTermDeltaSum": 100.0, - "weightedTermDeltaMean": 100.0, - } - ] - assert summary["leadFilingStatusGapSummary"] == [ - { - "filingStatus": "JOINT", - "count": 1, - "positiveCount": 1, - "negativeCount": 0, - "weightedCountDeltaSum": 1000.0, - "meanAbsWeightedCountDelta": 1000.0, - }, - { - "filingStatus": "SEPARATE", - "count": 1, - "positiveCount": 0, - "negativeCount": 1, - "weightedCountDeltaSum": -250.0, - "meanAbsWeightedCountDelta": 250.0, - }, - ] - assert summary["leadMFSAgiGapSummary"][0] == { - "agiBin": "100k_to_200k", - "count": 1, - "positiveCount": 0, - "negativeCount": 1, - "weightedCountDeltaSum": -50.0, - "meanAbsWeightedCountDelta": 50.0, - } - assert summary["matchingAudits"][0]["artifactPath"] == "run-a" - assert summary["leadAudits"][0]["artifactPath"] == "run-a" - - -def test_summarize_us_pe_native_family_drilldown_ignores_non_bundle_audits(tmp_path) -> None: - root = tmp_path / "root" - stray = root / "stray" - stray.mkdir(parents=True) - (stray / "pe_us_data_rebuild_native_audit.json").write_text( - json.dumps({"verdictHints": {"largestRegressingFamily": "national_irs_other"}}) - ) - _write_audit_bundle( - root / "run-a", - largest_regressing_family="national_irs_other", - top_targets=[("nation/irs/aca_spending/mi", 10.0)], - ) - - summary = summarize_us_pe_native_family_drilldown( - [root], - 
family="state_aca_spending", - ) - - assert summary["totalAudits"] == 1 - assert summary["auditsWithMatchingTargets"] == 1 - assert summary["matchingAudits"][0]["artifactPath"] == "run-a" - - -def _write_audit_bundle( - bundle_dir: Path, - *, - largest_regressing_family: str, - top_targets: list[tuple[str, float]], - filing_status_gaps: list[tuple[str, float]] | None = None, - mfs_agi_gaps: list[tuple[str, float]] | None = None, -) -> None: - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text("dataset") - (bundle_dir / "pe_us_data_rebuild_native_audit.json").write_text( - json.dumps( - { - "verdictHints": { - "largestRegressingFamily": largest_regressing_family, - "largestRegressingTarget": top_targets[0][0], - }, - "topTargetRegressions": [ - { - "target_name": target_name, - "weighted_term_delta": weighted_term_delta, - } - for target_name, weighted_term_delta in top_targets - ], - "supportAuditSummary": { - "topFilingStatusGaps": [ - { - "filing_status": filing_status, - "weighted_count_delta": weighted_count_delta, - } - for filing_status, weighted_count_delta in list( - filing_status_gaps or [] - ) - ], - "topMFSAgiGaps": [ - { - "agi_bin": agi_bin, - "weighted_count_delta": weighted_count_delta, - } - for agi_bin, weighted_count_delta in list(mfs_agi_gaps or []) - ], - }, - } - ) - ) diff --git a/tests/pipelines/test_summarize_pe_native_regressions.py b/tests/pipelines/test_summarize_pe_native_regressions.py deleted file mode 100644 index 8df5a0fb..00000000 --- a/tests/pipelines/test_summarize_pe_native_regressions.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Tests for PE-native regression summary helpers.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from microplex_us.pipelines.summarize_pe_native_regressions import ( - summarize_us_pe_native_regressions, -) - - -def test_summarize_us_pe_native_regressions_aggregates_family_counts(tmp_path) -> None: - root_a = tmp_path / "root-a" - root_b = 
tmp_path / "root-b" - _write_scored_bundle( - root_a / "run-1", - loss_delta=2.5, - families=[ - ("state_agi_distribution", 1.2), - ("national_irs_other", 0.8), - ("state_aca_spending", 0.2), - ], - audit_family="state_agi_distribution", - audit_target="state/WI/adjusted_gross_income/count/500000_inf", - ) - _write_scored_bundle( - root_a / "run-2", - loss_delta=1.5, - families=[ - ("national_irs_other", 1.1), - ("state_agi_distribution", 0.6), - ], - ) - _write_scored_bundle( - root_b / "run-3", - loss_delta=0.5, - families=[ - ("state_aca_spending", 0.4), - ("national_irs_other", 0.3), - ("state_agi_distribution", 0.1), - ], - audit_family="state_aca_spending", - audit_target="nation/irs/aca_spending/mi", - missing_inputs=["has_esi"], - ) - - summary = summarize_us_pe_native_regressions([root_a, root_b], top_k=2) - - assert summary["totalScoredRuns"] == 3 - assert summary["totalAuditedRuns"] == 2 - assert summary["largestFamilyCounts"] == [ - {"family": "national_irs_other", "count": 1}, - {"family": "state_aca_spending", "count": 1}, - {"family": "state_agi_distribution", "count": 1}, - ] - assert summary["top3FamilyCounts"][0] == { - "family": "national_irs_other", - "top3Count": 3, - "rank1Count": 1, - "rank2Count": 2, - "rank3Count": 0, - } - assert summary["familyCountsByRoot"]["root-a"] == [ - {"family": "national_irs_other", "count": 2}, - {"family": "state_agi_distribution", "count": 2}, - {"family": "state_aca_spending", "count": 1}, - ] - assert summary["targetCountsFromAudits"] == [ - {"target": "nation/irs/aca_spending/mi", "count": 1}, - {"target": "state/WI/adjusted_gross_income/count/500000_inf", "count": 1}, - ] - assert summary["missingCriticalInputsCounts"] == [ - {"variable": "has_esi", "count": 1} - ] - assert summary["worstRuns"][0]["artifactPath"] == "run-1" - assert summary["bestRuns"][0]["artifactPath"] == "run-3" - - -def test_summarize_us_pe_native_regressions_ignores_non_bundle_scores(tmp_path) -> None: - root = tmp_path / "root" - 
stray = root / "stray" - stray.mkdir(parents=True) - (stray / "policyengine_native_scores.json").write_text( - json.dumps({"summary": {"enhanced_cps_native_loss_delta": 99.0}}) - ) - - _write_scored_bundle( - root / "run-1", - loss_delta=1.0, - families=[("national_irs_other", 0.9)], - ) - - summary = summarize_us_pe_native_regressions([root]) - - assert summary["totalScoredRuns"] == 1 - assert summary["worstRuns"][0]["artifactPath"] == "run-1" - - -def _write_scored_bundle( - bundle_dir: Path, - *, - loss_delta: float, - families: list[tuple[str, float]], - audit_family: str | None = None, - audit_target: str | None = None, - missing_inputs: list[str] | None = None, -) -> None: - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text("dataset") - (bundle_dir / "policyengine_native_scores.json").write_text( - json.dumps( - { - "summary": { - "enhanced_cps_native_loss_delta": loss_delta, - "candidate_beats_baseline": False, - }, - "family_breakdown": [ - { - "family": family, - "loss_contribution_delta": delta, - } - for family, delta in families - ], - } - ) - ) - if audit_family is None and audit_target is None and not missing_inputs: - return - (bundle_dir / "pe_us_data_rebuild_native_audit.json").write_text( - json.dumps( - { - "verdictHints": { - "largestRegressingFamily": audit_family, - "largestRegressingTarget": audit_target, - }, - "supportAuditSummary": { - "missingStoredCriticalInputs": list(missing_inputs or []), - }, - } - ) - ) diff --git a/tests/pipelines/test_summarize_policyengine_oracle_regressions.py b/tests/pipelines/test_summarize_policyengine_oracle_regressions.py deleted file mode 100644 index 5e65c7c7..00000000 --- a/tests/pipelines/test_summarize_policyengine_oracle_regressions.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Tests for calibration-oracle regression summary helpers.""" - -from __future__ import annotations - -import json -from pathlib import Path - -from 
microplex_us.pipelines.summarize_policyengine_oracle_regressions import ( - summarize_us_policyengine_oracle_regressions, -) - - -def test_summarize_us_policyengine_oracle_regressions_aggregates_groups(tmp_path) -> None: - root_a = tmp_path / "root-a" - root_b = tmp_path / "root-b" - _write_oracle_bundle( - root_a / "run-1", - scope_capped_loss=2.5, - families=[ - ("person_count|domain=age", 0.4), - ("tax_unit_count|domain=salt", 0.3), - ("aca_ptc|domain=aca_ptc", 0.1), - ], - geographies=[ - ("state:OR", 0.5), - ("state:GA", 0.2), - ("nation", 0.1), - ], - ) - _write_oracle_bundle( - root_a / "run-2", - scope_capped_loss=1.5, - families=[ - ("tax_unit_count|domain=salt", 0.6), - ("person_count|domain=age", 0.2), - ], - geographies=[ - ("state:GA", 0.4), - ("state:OR", 0.1), - ], - ) - _write_oracle_bundle( - root_b / "run-3", - scope_capped_loss=0.5, - families=[ - ("aca_ptc|domain=aca_ptc", 0.5), - ("person_count|domain=age", 0.4), - ("tax_unit_count|domain=salt", 0.3), - ], - geographies=[ - ("state:OR", 0.6), - ("state:CA", 0.2), - ("state:GA", 0.1), - ], - ) - - summary = summarize_us_policyengine_oracle_regressions([root_a, root_b], top_k=2) - - assert summary["lossScope"] == "full_oracle" - assert summary["totalScoredRuns"] == 3 - assert summary["largestFamilyCounts"] == [ - {"group": "aca_ptc|domain=aca_ptc", "count": 1}, - {"group": "person_count|domain=age", "count": 1}, - {"group": "tax_unit_count|domain=salt", "count": 1}, - ] - assert summary["largestGeographyCounts"] == [ - {"group": "state:OR", "count": 2}, - {"group": "state:GA", "count": 1}, - ] - assert summary["top3FamilyCounts"][0] == { - "group": "person_count|domain=age", - "top3Count": 3, - "rank1Count": 1, - "rank2Count": 2, - "rank3Count": 0, - } - assert summary["top3GeographyCounts"][0] == { - "group": "state:GA", - "top3Count": 3, - "rank1Count": 1, - "rank2Count": 1, - "rank3Count": 1, - } - assert summary["familyCountsByRoot"]["root-a"] == [ - {"group": "person_count|domain=age", 
"count": 2}, - {"group": "tax_unit_count|domain=salt", "count": 2}, - {"group": "aca_ptc|domain=aca_ptc", "count": 1}, - ] - assert summary["geographyCountsByRoot"]["root-a"] == [ - {"group": "state:GA", "count": 2}, - {"group": "state:OR", "count": 2}, - {"group": "nation", "count": 1}, - ] - assert summary["worstRuns"][0]["artifactPath"] == "run-1" - assert summary["bestRuns"][0]["artifactPath"] == "run-3" - - -def test_summarize_us_policyengine_oracle_regressions_ignores_non_bundle_manifests( - tmp_path, -) -> None: - root = tmp_path / "root" - stray = root / "stray" - stray.mkdir(parents=True) - (stray / "manifest.json").write_text( - json.dumps( - { - "calibration": { - "oracle_loss": { - "full_oracle": { - "capped_mean_abs_relative_error": 99.0, - "family_ranking": [], - "geography_ranking": [], - } - } - } - } - ) - ) - - _write_oracle_bundle( - root / "run-1", - scope_capped_loss=1.0, - families=[("person_count|domain=age", 0.9)], - geographies=[("state:OR", 0.9)], - ) - - summary = summarize_us_policyengine_oracle_regressions([root]) - - assert summary["totalScoredRuns"] == 1 - assert summary["worstRuns"][0]["artifactPath"] == "run-1" - - -def _write_oracle_bundle( - bundle_dir: Path, - *, - scope_capped_loss: float, - families: list[tuple[str, float]], - geographies: list[tuple[str, float]], -) -> None: - bundle_dir.mkdir(parents=True) - (bundle_dir / "policyengine_us.h5").write_text("dataset") - (bundle_dir / "manifest.json").write_text( - json.dumps( - { - "calibration": { - "active_solve_capped_mean_abs_relative_error": scope_capped_loss - / 2.0, - "n_constraints": 10, - "n_supported_targets": 100, - "n_unsupported_targets": 0, - "n_calibration_stages_applied": 1, - "oracle_loss": { - "full_oracle": { - "capped_mean_abs_relative_error": scope_capped_loss, - "mean_abs_relative_error": scope_capped_loss * 2.0, - "family_ranking": [ - { - "group": group, - "capped_loss_share": share, - "capped_sum_abs_relative_error": share * 100.0, - } - for group, share 
in families - ], - "geography_ranking": [ - { - "group": group, - "capped_loss_share": share, - "capped_sum_abs_relative_error": share * 100.0, - } - for group, share in geographies - ], - } - }, - } - } - ) - ) diff --git a/tests/pipelines/test_summarize_policyengine_oracle_target_drilldown.py b/tests/pipelines/test_summarize_policyengine_oracle_target_drilldown.py deleted file mode 100644 index 547b0a3f..00000000 --- a/tests/pipelines/test_summarize_policyengine_oracle_target_drilldown.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Tests for one-artifact calibration-oracle target drilldowns.""" - -from __future__ import annotations - -import json -import sqlite3 -from pathlib import Path - -import pandas as pd -from microplex.targets import TargetQuery - -from microplex_us.pipelines.summarize_policyengine_oracle_target_drilldown import ( - summarize_us_policyengine_oracle_target_drilldown, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - _policyengine_target_ledger_entry, -) -from microplex_us.policyengine import ( - PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - build_policyengine_us_time_period_arrays, - compute_policyengine_us_definition_hash, - write_policyengine_us_time_period_dataset, -) - - -def test_summarize_us_policyengine_oracle_target_drilldown_filters_saved_artifact( - tmp_path, -) -> None: - bundle_dir = tmp_path / "bundle" - bundle_dir.mkdir() - db_path = tmp_path / "policy_data.db" - dataset_path = bundle_dir / "policyengine_us.h5" - - _create_policyengine_targets_db(db_path) - _write_policyengine_dataset(dataset_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - target = provider.load_target_set( - TargetQuery(period=2024, provider_filters={"variables": ["household_count"]}) - ).targets[0] - target_ledger = [ - _policyengine_target_ledger_entry( - target=target, - stage="solve_now", - reason="selected_stage_1", - household_count=2, - ) - ] - - config = USMicroplexBuildConfig( - 
policyengine_targets_db=str(db_path), - policyengine_target_period=2024, - policyengine_target_variables=("household_count",), - policyengine_calibration_target_variables=("household_count",), - calibration_backend="entropy", - policyengine_dataset_year=2024, - ) - (bundle_dir / "manifest.json").write_text( - json.dumps( - { - "config": config.to_dict(), - "artifacts": {"policyengine_dataset": dataset_path.name}, - "calibration": { - "oracle_relative_error_cap": 10.0, - "materialized_variables": [], - "target_ledger": target_ledger, - }, - } - ) - ) - - summary = summarize_us_policyengine_oracle_target_drilldown(bundle_dir, top_k=5) - - assert summary["summary"]["targetCount"] == 1 - assert summary["summary"]["supportedTargetCount"] == 1 - assert summary["summary"]["unsupportedTargetCount"] == 0 - assert summary["summary"]["stageCounts"] == {"solve_now": 1} - assert summary["summary"]["largestFamiliesByCappedError"] == [ - { - "group": "household_count|domain=household_count", - "cappedErrorMass": 0.6, - "count": 1, - "meanCappedError": 0.6, - } - ] - assert summary["summary"]["largestGeographiesByCappedError"] == [ - { - "group": "state:CA", - "cappedErrorMass": 0.6, - "count": 1, - "meanCappedError": 0.6, - } - ] - assert summary["topRows"][0]["stage"] == "solve_now" - assert summary["topRows"][0]["loss_family"] == "household_count|domain=household_count" - assert summary["topRows"][0]["loss_geography"] == "state:CA" - assert summary["topRows"][0]["actual_value"] == 2.0 - assert summary["topRows"][0]["target_value"] == 5.0 - assert summary["topRows"][0]["driver_variable"] == "household_count" - assert summary["topRows"][0]["provenance_class"] == "stored_input" - - family_summary = summarize_us_policyengine_oracle_target_drilldown( - bundle_dir, - family="household_count|domain=household_count", - geography="state:CA", - stage="solve_now", - top_k=5, - ) - assert family_summary["summary"]["targetCount"] == 1 - assert family_summary["topRows"][0]["target_name"] == 
summary["topRows"][0]["target_name"] - - -def test_summarize_us_policyengine_oracle_target_drilldown_marks_rematerialized_formula( - tmp_path, -) -> None: - bundle_dir = tmp_path / "bundle" - bundle_dir.mkdir() - db_path = tmp_path / "policy_data.db" - dataset_path = bundle_dir / "policyengine_us.h5" - - _create_policyengine_targets_db( - db_path, - variable="snap", - value=250.0, - domain_variable="snap", - ) - _write_policyengine_dataset(dataset_path, include_raw_snap=True) - - provider = PolicyEngineUSDBTargetProvider(db_path) - target = provider.load_target_set( - TargetQuery(period=2024, provider_filters={"variables": ["snap"]}) - ).targets[0] - target_ledger = [ - _policyengine_target_ledger_entry( - target=target, - stage="solve_now", - reason="selected_stage_1", - household_count=2, - ) - ] - - config = USMicroplexBuildConfig( - policyengine_targets_db=str(db_path), - policyengine_target_period=2024, - policyengine_target_variables=("snap",), - policyengine_calibration_target_variables=("snap",), - calibration_backend="entropy", - policyengine_dataset_year=2024, - ) - (bundle_dir / "manifest.json").write_text( - json.dumps( - { - "config": config.to_dict(), - "artifacts": {"policyengine_dataset": dataset_path.name}, - "calibration": { - "oracle_relative_error_cap": 10.0, - "materialized_variables": [], - "target_ledger": target_ledger, - }, - } - ) - ) - - summary = summarize_us_policyengine_oracle_target_drilldown(bundle_dir, top_k=5) - - assert summary["topRows"][0]["driver_variable"] == "snap" - assert summary["topRows"][0]["driver_is_materialized"] is True - assert summary["topRows"][0]["provenance_class"] == "policyengine_materialized" - - -def _write_policyengine_dataset(path: Path, *, include_raw_snap: bool = False) -> None: - household_data = { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 6], - } - household_variable_map = {"state_fips": "state_fips"} - if include_raw_snap: - household_data["snap"] = [100.0, 0.0] - 
household_variable_map["snap"] = "snap" - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame(household_data), - persons=pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [35, 40], - } - ), - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map=household_variable_map, - person_variable_map={"age": "age"}, - ) - write_policyengine_us_time_period_dataset(arrays, path) - - -def _create_policyengine_targets_db( - path: Path, - *, - variable: str = "household_count", - value: float = 5.0, - domain_variable: str = "household_count", -) -> None: - conn = sqlite3.connect(path) - conn.executescript( - f""" - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'state' AS geo_level, - '06' AS geographic_id, - '{domain_variable}' AS domain_variable - FROM targets AS t; - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, NULL) - """, - (1, compute_policyengine_us_definition_hash(())), - ) - conn.execute( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - (1, variable, 2024, 1, 0, value, 1, None, "test", variable), - ) - conn.commit() - conn.close() diff --git a/tests/pipelines/test_transparency_sidecars.py b/tests/pipelines/test_transparency_sidecars.py deleted file mode 100644 index 6447bd6a..00000000 --- a/tests/pipelines/test_transparency_sidecars.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Tests for non-gating Microplex transparency sidecars.""" - -from __future__ import annotations - -import importlib.util -import json -import sys -from pathlib import Path - -import pytest - -_MODULE_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "transparency_sidecars.py" -) -_spec = importlib.util.spec_from_file_location("transparency_sidecars", _MODULE_PATH) -sidecars = importlib.util.module_from_spec(_spec) -sys.modules["transparency_sidecars"] = sidecars -_spec.loader.exec_module(sidecars) - - -def _write_json(path: Path, payload: dict) -> Path: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload)) - return path - - -def test_sidecars_parse_active_donor_block_without_h5(tmp_path): - artifact_root = tmp_path / "artifact" - log_path = artifact_root / "logs" / "gate1_build.log" - log_path.parent.mkdir(parents=True) - log_path.write_text( - "\n".join( - [ - "[2026-06-01T07:34:56-04:00] Starting Gate-1 fresh eCPS-shaped Microplex build.", - "[2026-06-01T07:34:56-04:00] Shape: CPS/ASEC survey year 2025 spine (calendar/income year 2024) + PUF 2024 clones.", - "PE-US-data rebuild checkpoint: starting build [output_root=/tmp/run, version_id=mp-test, target_profile=pe_native_broad, providers=cps_asec,irs_soi_puf]", - "Downloading CPS ASEC 2025 from https://example.test/asec.zip...", - "US microplex donor integration: source ready [donor_source=irs_soi_puf_2024, donor_rows=232699, shared_vars=14, donor_target_vars=71, blocks=70]", - "US microplex donor integration: block start [donor_source=irs_soi_puf_2024, block=capital_gains, 
restored=capital_gains]", - "US microplex donor integration: block run [donor_source=irs_soi_puf_2024, block=capital_gains, condition_vars=8, donor_rows=232699, current_rows=142125]", - ] - ) - ) - summary = sidecars.write_transparency_sidecars(artifact_root) - - assert summary["dataset_available"] is False - assert summary["production_performance_gate"] == "loss" - imputation = json.loads( - (artifact_root / "transparency" / "imputation_manifest.json").read_text() - ) - source = imputation["donor_integration"]["sources"][0] - assert source["donor_source"] == "irs_soi_puf_2024" - assert source["ready"]["blocks"] == 70 - assert source["active_blocks"] == ["capital_gains"] - - source_manifest = json.loads( - (artifact_root / "transparency" / "source_manifest.json").read_text() - ) - assert source_manifest["build_config"]["version_id"] == "mp-test" - assert source_manifest["source_events"][0]["message"].startswith( - "Downloading CPS ASEC 2025" - ) - - -def test_sidecars_summarize_h5_columns_rows_and_calibration(tmp_path): - h5py = pytest.importorskip("h5py") - import numpy as np - - artifact_root = tmp_path / "artifact" - artifact_root.mkdir() - h5_path = artifact_root / "policyengine_us.h5" - with h5py.File(h5_path, "w") as h5: - h5.create_dataset("age/2024", data=np.array([30, 40, 50])) - h5.create_dataset("snap/2024", data=np.array([0, 1, 0])) - h5.create_dataset("employment_income/2024", data=np.array([1, 2, 3])) - h5.create_dataset("snap_reported/2024", data=np.array([0, 1, 0])) - h5.create_dataset("household_weight/2024", data=np.array([1.0, 2.0])) - contract = _write_json( - tmp_path / "contract.json", - { - "required": ["age", "snap", "employment_income", "state_code"], - "forbidden": ["snap_reported"], - "ecps_internal_optional": [], - "formula_owned_excluded": ["weeks_worked"], - }, - ) - _write_json( - artifact_root / "calibration_summary.json", - { - "backend": "policyengine_db_entropy", - "period": 2024, - "converged": False, - "n_loaded_targets": 10, - 
"n_supported_targets": 9, - }, - ) - - sidecars.write_transparency_sidecars(artifact_root, contract_path=contract) - - columns = json.loads( - (artifact_root / "transparency" / "column_manifest.json").read_text() - ) - assert columns["available"] is True - assert columns["missing_required"] == ["state_code"] - assert columns["forbidden_present"] == ["snap_reported"] - assert columns["diagnostic_status"] == "needs_attention" - - rows = json.loads( - (artifact_root / "transparency" / "row_count_manifest.json").read_text() - ) - assert rows["available"] is True - assert rows["shape_counts"][0]["shape"] == "3" - assert rows["shape_counts"][0]["variable_count"] == 4 - - calibration = json.loads( - (artifact_root / "transparency" / "calibration_trace.json").read_text() - ) - assert calibration["available"] is True - assert calibration["summaries"][0]["n_supported_targets"] == 9 diff --git a/tests/pipelines/test_us.py b/tests/pipelines/test_us.py deleted file mode 100644 index 0dd307e1..00000000 --- a/tests/pipelines/test_us.py +++ /dev/null @@ -1,12305 +0,0 @@ -"""Tests for the US microplex pipeline library.""" - -import json -import logging -import sqlite3 -from pathlib import Path -from types import SimpleNamespace - -import h5py -import numpy as np -import pandas as pd -import pytest -from microplex.calibration import LinearConstraint -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceDescriptor, - SourceQuery, - SourceVariableCapability, - StaticSourceProvider, - TimeStructure, -) -from microplex.targets import TargetAggregation, TargetQuery, TargetSpec - -import microplex_us.pipelines.us as us_pipeline_module -from microplex_us.geography import BlockGeography -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexPipeline, - USMicroplexTargets, - _attach_household_census_geographies, - 
_normalize_policyengine_constraints_for_microcalibrate, - _policyengine_target_loss_geography_key, - _select_feasible_policyengine_calibration_constraints, - _select_policyengine_deferred_stage_constraints, - _select_ssi_takeup_by_age_amount, - _summarize_policyengine_target_fit_report, - _summarize_weight_diagnostics, - build_us_microplex, -) -from microplex_us.policyengine.comparison import ( - PolicyEngineUSTargetEvaluation, - PolicyEngineUSTargetEvaluationReport, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSConstraint, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSVariableBinding, - PolicyEngineUSVariableMaterializationResult, - build_policyengine_us_export_variable_maps, - compute_policyengine_us_definition_hash, -) - - -def _create_policyengine_calibration_db(path) -> None: - national_constraints: tuple[PolicyEngineUSConstraint, ...] = () - california_constraints = (PolicyEngineUSConstraint("state_fips", "==", "6"),) - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) 
- """, - [ - ( - 1, - compute_policyengine_us_definition_hash(national_constraints), - None, - ), - ( - 2, - compute_policyengine_us_definition_hash( - california_constraints, - parent_stratum_id=1, - ), - 1, - ), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - [(2, "state_fips", "==", "6")], - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - (1, "household_count", 2024, 1, 0, 450.0, 1, None, "test", "national"), - (2, "household_count", 2024, 2, 0, 225.0, 1, None, "test", "ca"), - ], - ) - conn.commit() - conn.close() - - -def _create_policyengine_calibration_db_with_unsupported_target(path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - [ - (1, compute_policyengine_us_definition_hash(()), None), - (2, compute_policyengine_us_definition_hash(()), None), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 10, - "household_count", - 2024, - 1, - 0, - 450.0, - 1, - 0.0, - "test", - "All households", - ), - ( - 11, - "income_tax", - 2024, - 2, - 0, - 0.0, - 1, - 0.0, - "test", - "Income tax total", - ), - ], - ) - conn.commit() - conn.close() - - -def test_select_ssi_takeup_by_age_amount_matches_reported_age_group_amounts(): - selected, summary = _select_ssi_takeup_by_age_amount( - person_ids=pd.Series([1, 2, 3, 4]), - ages=pd.Series([70, 70, 40, 40]), - weights=pd.Series([1.0, 1.0, 1.0, 1.0]), - reported_ssi=pd.Series([100.0, 0.0, 100.0, 0.0]), - full_takeup_ssi=pd.Series([80.0, 20.0, 20.0, 80.0]), - ) - - assert selected.tolist() == [True, True, True, True] - assert summary["reported_amount"] == 200.0 - assert summary["selected_amount"] == 200.0 - assert summary["groups"]["aged"]["selected_amount"] == 100.0 - assert summary["groups"]["under65"]["selected_amount"] == 100.0 - - -class TestUSMicroplexBuildConfig: - """Test pipeline configuration.""" - - def test_defaults(self): - config = USMicroplexBuildConfig() - - assert config.synthesis_backend == "synthesizer" - assert config.calibration_backend == "entropy" - assert config.n_synthetic == 100_000 - assert config.random_seed == 42 - assert config.donor_imputer_authoritative_override_variables == () - assert ( - config.policyengine_calibration_deferred_stage_min_active_households == () - ) - assert config.policyengine_calibration_deferred_stage_max_constraints == 24 - assert ( - config.policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error - is None - ) - assert config.policyengine_calibration_deferred_stage_top_family_count == 8 - assert config.policyengine_calibration_deferred_stage_top_geography_count == 8 - assert config.dependent_tax_leaf_soft_cap_multiplier is None - assert config.dependent_tax_leaf_soft_cap_base_variables == ( - "employment_income", - "wage_income", - "self_employment_income", - ) - assert config.dependent_tax_leaf_soft_cap_variables == ( - 
"taxable_interest_income", - "tax_exempt_interest_income", - "taxable_pension_income", - "dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "partnership_s_corp_income", - "rental_income", - ) - - def test_custom_values(self): - config = USMicroplexBuildConfig( - n_synthetic=250, - synthesis_backend="seed", - calibration_backend="ipf", - synthesizer_epochs=12, - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=500, - policyengine_selection_state_floor=25, - policyengine_selection_max_iter=750, - policyengine_selection_tol=1e-7, - policyengine_selection_l2_penalty=1e-5, - policyengine_selection_target_total_weight=150_000_000.0, - ) - - assert config.n_synthetic == 250 - assert config.synthesis_backend == "seed" - assert config.calibration_backend == "ipf" - assert config.synthesizer_epochs == 12 - assert config.policyengine_selection_backend == "pe_native_loss" - assert config.policyengine_selection_household_budget == 500 - assert config.policyengine_selection_state_floor == 25 - assert config.policyengine_selection_max_iter == 750 - assert config.policyengine_selection_tol == 1e-7 - assert config.policyengine_selection_l2_penalty == 1e-5 - assert config.policyengine_selection_target_total_weight == 150_000_000.0 - assert config.policyengine_oracle_relative_error_cap == 10.0 - - def test_can_opt_into_authoritative_donor_overrides(self): - config = USMicroplexBuildConfig( - donor_imputer_authoritative_override_variables=( - "self_employment_income", - "rental_income", - ) - ) - - assert config.donor_imputer_authoritative_override_variables == ( - "self_employment_income", - "rental_income", - ) - - def test_puf_support_clone_requires_seed_backend_and_no_household_selection(self): - with pytest.raises(ValueError, match="synthesis_backend='seed'"): - USMicroplexBuildConfig(puf_support_clone_enabled=True) - - with pytest.raises(ValueError, match="policyengine_selection_household_budget"): - 
USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - policyengine_selection_household_budget=10, - ) - - def test_initialize_puf_support_clone_calibration_weights_reserves_clone_share( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_prior_weight_share=0.05, - ) - ) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": ["h1", "h2", "h1__puf_clone", "h2__puf_clone"], - "household_weight": [100.0, 200.0, 0.0, 0.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [ - "h1", - "h2", - "h1__puf_clone", - "h2__puf_clone", - ], - "person_is_puf_clone": [0.0, 0.0, 1.0, 1.0], - "weight": [100.0, 200.0, 0.0, 0.0], - } - ), - tax_units=pd.DataFrame(), - spm_units=pd.DataFrame(), - families=pd.DataFrame(), - marital_units=pd.DataFrame(), - ) - - updated_tables, summary = pipeline._initialize_puf_clone_calibration_weights( - tables - ) - - assert summary["applied"] is True - assert summary["clone_household_count"] == 2 - assert summary["clone_prior_weight_share"] == pytest.approx(0.05) - assert summary["pre_clone_weight_sum"] == 0.0 - assert summary["pre_clone_original_weight_sum"] == pytest.approx(300.0) - assert summary["clone_prior_total_weight"] == pytest.approx(300.0 * 0.05 / 0.95) - assert summary["clone_prior_household_weight"] == pytest.approx( - 300.0 * 0.05 / 0.95 / 2 - ) - assert updated_tables.households["household_weight"].tolist() == [ - pytest.approx(100.0), - pytest.approx(200.0), - pytest.approx(300.0 * 0.05 / 0.95 / 2), - pytest.approx(300.0 * 0.05 / 0.95 / 2), - ] - assert updated_tables.persons["weight"].tolist() == [100.0, 200.0, 0.0, 0.0] - - def test_initialize_puf_support_clone_calibration_weights_skips_no_calibration( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - 
calibration_backend="none", - puf_support_clone_enabled=True, - ) - ) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [100.0, 0.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [1, 2], - "person_is_puf_clone": [0.0, 1.0], - "weight": [100.0, 0.0], - } - ), - tax_units=pd.DataFrame(), - spm_units=pd.DataFrame(), - families=pd.DataFrame(), - marital_units=pd.DataFrame(), - ) - - updated_tables, summary = pipeline._initialize_puf_clone_calibration_weights( - tables - ) - - assert summary["applied"] is False - assert summary["reason"] == "calibration_backend_none" - assert updated_tables.households["household_weight"].tolist() == [100.0, 0.0] - - def test_rejects_conflicting_policyengine_weight_rescale_modes(self): - with pytest.raises(ValueError, match="mutually exclusive"): - USMicroplexBuildConfig( - policyengine_calibration_rescale_to_input_weight_sum=True, - policyengine_calibration_rescale_to_target_total_weight=True, - policyengine_calibration_target_total_weight=150_000_000.0, - ) - - def test_rejects_target_rescale_without_target_total_weight(self): - with pytest.raises( - ValueError, - match="requires policyengine_calibration_target_total_weight", - ): - USMicroplexBuildConfig( - policyengine_calibration_rescale_to_target_total_weight=True - ) - - def test_rejects_nonpositive_oracle_relative_error_cap(self): - with pytest.raises(ValueError, match="must be positive"): - USMicroplexBuildConfig(policyengine_oracle_relative_error_cap=0.0) - - def test_rejects_nonpositive_deferred_stage_support_floor(self): - with pytest.raises(ValueError, match="must contain only positive"): - USMicroplexBuildConfig( - policyengine_calibration_deferred_stage_min_active_households=(0,) - ) - - def test_rejects_negative_deferred_stage_family_focus_limit(self): - with pytest.raises(ValueError, match="must be nonnegative"): - USMicroplexBuildConfig( - 
policyengine_calibration_deferred_stage_top_family_count=-1 - ) - - def test_rejects_negative_deferred_stage_geography_focus_limit(self): - with pytest.raises(ValueError, match="must be nonnegative"): - USMicroplexBuildConfig( - policyengine_calibration_deferred_stage_top_geography_count=-1 - ) - - def test_rejects_nonpositive_deferred_stage_constraint_cap(self): - with pytest.raises(ValueError, match="must be positive"): - USMicroplexBuildConfig( - policyengine_calibration_deferred_stage_max_constraints=0 - ) - - def test_rejects_nonpositive_deferred_stage_trigger_threshold(self): - with pytest.raises(ValueError, match="must be positive"): - USMicroplexBuildConfig( - policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error=0.0 - ) - - def test_rejects_negative_dependent_tax_leaf_soft_cap_multiplier(self): - with pytest.raises(ValueError, match="must be non-negative"): - USMicroplexBuildConfig(dependent_tax_leaf_soft_cap_multiplier=-0.01) - - -def test_apply_dependent_tax_leaf_soft_caps_only_for_dependents(): - config = USMicroplexBuildConfig(dependent_tax_leaf_soft_cap_multiplier=0.5) - pipeline = USMicroplexPipeline(config) - seed_data = pd.DataFrame( - { - "is_tax_unit_dependent": [1, 0], - "employment_income": [100.0, 100.0], - "wage_income": [0.0, 20.0], - "self_employment_income": [0.0, 0.0], - "taxable_interest_income": [80.0, 80.0], - "rental_income": [120.0, 120.0], - } - ) - - updated = pipeline._apply_dependent_tax_leaf_soft_caps(seed_data.copy()) - - assert updated.loc[0, "taxable_interest_income"] == pytest.approx(50.0) - assert updated.loc[0, "rental_income"] == pytest.approx(50.0) - assert updated.loc[1, "taxable_interest_income"] == pytest.approx(80.0) - assert updated.loc[1, "rental_income"] == pytest.approx(120.0) - - -def test_summarize_policyengine_target_fit_report_caps_relative_error(): - target = TargetSpec( - name="tiny_target", - entity=EntityType.HOUSEHOLD, - period=2024, - measure="state_income_tax", - 
aggregation=TargetAggregation.SUM, - value=0.0, - source="test", - metadata={}, - ) - report = PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation(target=target, actual_value=25.0), - ], - ) - - summary = _summarize_policyengine_target_fit_report( - report, - target_count=1, - relative_error_cap=10.0, - ) - - assert summary["mean_abs_relative_error"] == pytest.approx(25.0) - assert summary["capped_mean_abs_relative_error"] == pytest.approx(10.0) - assert summary["relative_error_cap"] == pytest.approx(10.0) - - -def test_summarize_policyengine_target_fit_report_penalizes_unsupported_targets(): - supported_target = TargetSpec( - name="supported_target", - entity=EntityType.HOUSEHOLD, - period=2024, - aggregation=TargetAggregation.COUNT, - value=100.0, - source="test", - metadata={}, - ) - unsupported_target = TargetSpec( - name="unsupported_target", - entity=EntityType.TAX_UNIT, - period=2024, - aggregation=TargetAggregation.COUNT, - value=1.0, - source="test", - metadata={}, - ) - report = PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=supported_target, - actual_value=100.0, - ), - ], - unsupported_targets=[unsupported_target], - ) - - summary = _summarize_policyengine_target_fit_report( - report, - target_count=2, - relative_error_cap=10.0, - ) - - assert summary["supported_only_mean_abs_relative_error"] == pytest.approx( - 0.0, - abs=1e-12, - ) - assert summary["unsupported_target_error_penalty"] == pytest.approx(10.0) - assert summary["mean_abs_relative_error"] == pytest.approx(5.0) - assert summary["capped_mean_abs_relative_error"] == pytest.approx(5.0) - - -def test_select_policyengine_deferred_stage_constraints_prioritizes_target_level_loss(): - def _target(name: str) -> TargetSpec: - return TargetSpec( - name=name, - entity=EntityType.HOUSEHOLD, - period=2024, - aggregation=TargetAggregation.COUNT, - 
value=100.0, - source="test", - metadata={ - "variable": "household_count", - "geo_level": "state", - "geographic_id": "6", - }, - ) - - compiled_targets = [ - _target("a_low"), - _target("m_mid"), - _target("z_high"), - ] - compiled_constraints = ( - SimpleNamespace(coefficients=np.array([1.0] * 12)), - SimpleNamespace(coefficients=np.array([1.0] * 12)), - SimpleNamespace(coefficients=np.array([1.0] * 12)), - ) - target_ledger = [ - { - "target_name": target.name, - "stage": "solve_later", - "variable": "household_count", - "domain_variable": "", - "geo_level": "state", - "geographic_id": "6", - } - for target in compiled_targets - ] - deferred_oracle_loss = { - "family_ranking": [ - { - "group": "household_count", - "capped_loss_share": 1.0, - "capped_sum_abs_relative_error": 10.0, - } - ], - "geography_ranking": [ - { - "group": "state:CA", - "capped_loss_share": 1.0, - "capped_sum_abs_relative_error": 10.0, - } - ], - } - - selected_targets, _, metadata = _select_policyengine_deferred_stage_constraints( - compiled_targets=compiled_targets, - compiled_constraints=compiled_constraints, - target_ledger=target_ledger, - deferred_oracle_loss=deferred_oracle_loss, - deferred_target_priority_lookup={ - "a_low": 0.5, - "m_mid": 1.0, - "z_high": 4.0, - }, - selected_target_names=set(), - household_count=100, - min_active_households=10, - max_constraints=1, - max_constraints_per_household=None, - top_family_count=1, - top_geography_count=1, - ) - - assert [target.name for target in selected_targets] == ["z_high"] - assert metadata["target_error_priority_available"] is True - assert metadata["n_focus_eligible_constraints"] == 3 - - -class TestUSMicroplexPipeline: - """Test orchestration for US microplex builds.""" - - def test_policyengine_target_loss_geography_key_normalizes_state_fips(self): - assert ( - _policyengine_target_loss_geography_key( - {"geo_level": "state", "geographic_id": "1"} - ) - == "state:AL" - ) - assert ( - _policyengine_target_loss_geography_key( - 
{"geo_level": "state", "geographic_id": "01"} - ) - == "state:AL" - ) - assert ( - _policyengine_target_loss_geography_key( - {"geo_level": "national", "geographic_id": "usa"} - ) - == "national:US" - ) - - @pytest.fixture - def households(self): - return pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "county_fips": [6037, 36061, 48201], - "hh_weight": [100.0, 150.0, 200.0], - "tenure": [1, 2, 1], - } - ) - - @pytest.fixture - def persons(self): - return pd.DataFrame( - { - "person_id": [10, 11, 12, 13, 14, 15], - "household_id": [1, 1, 2, 2, 3, 3], - "age": [34, 12, 47, 43, 68, 30], - "sex": [1, 2, 2, 1, 1, 2], - "education": [3, 1, 4, 4, 2, 4], - "employment_status": [1, 0, 1, 1, 2, 1], - "income": [55_000.0, 0.0, 72_000.0, 40_000.0, 18_000.0, 65_000.0], - } - ) - - def test_attach_household_census_geographies_from_state_county(self): - geography = BlockGeography.from_data( - pd.DataFrame( - { - "geoid": ["060010201001000", "360610101001000"], - "state_fips": ["06", "36"], - "county": ["001", "061"], - "county_fips": ["06001", "36061"], - "tract": ["020100", "010100"], - "tract_geoid": ["06001020100", "36061010100"], - "cd_id": ["CA-01", "NY-12"], - "prob": [1.0, 1.0], - } - ) - ) - households = pd.DataFrame( - { - "household_id": [10, 20], - "state_fips": [6, 36], - "county_fips": [1, 61], - }, - index=[100, 200], - ) - - result = _attach_household_census_geographies( - households, - seed=0, - geography=geography, - ).sort_values("household_id") - - assert result["block_geoid"].tolist() == [ - "060010201001000", - "360610101001000", - ] - assert result["county_fips"].tolist() == ["06001", "36061"] - assert result["tract_geoid"].tolist() == ["06001020100", "36061010100"] - assert result["congressional_district_geoid"].tolist() == [601, 3612] - - def test_prepare_seed_data(self, persons, households): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - - seed = pipeline.prepare_seed_data(persons, households) - - assert 
len(seed) == len(persons) - assert "state" in seed.columns - assert "county_fips" in seed.columns - assert "block_geoid" in seed.columns - assert "tract_geoid" in seed.columns - assert "congressional_district_geoid" in seed.columns - assert "age_group" in seed.columns - assert "income_bracket" in seed.columns - assert set(seed["state"]) == {"CA", "NY", "TX"} - assert set(seed["age_group"].astype(str)) == {"0-17", "18-34", "35-54", "65+"} - - def test_prepare_seed_data_normalizes_social_security_components(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - households = pd.DataFrame( - { - "household_id": [1], - "state_fips": [6], - "county_fips": [6037], - "hh_weight": [100.0], - "tenure": [1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "age": [68], - "sex": [1], - "education": [3], - "employment_status": [2], - "income": [1_200.0], - "gross_social_security": [1_200.0], - "social_security_disability": [200.0], - } - ) - - seed = pipeline.prepare_seed_data(persons, households) - row = seed.iloc[0] - - assert row["social_security"] == 1_200.0 - assert row["social_security_retirement"] == 0.0 - assert row["social_security_disability"] == 200.0 - assert row["social_security_survivors"] == 0.0 - assert row["social_security_dependents"] == 0.0 - assert row["social_security_unclassified"] == 1_000.0 - - def test_build_targets(self, persons, households): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - seed = pipeline.prepare_seed_data(persons, households) - - targets = pipeline.build_targets(seed) - - assert isinstance(targets, USMicroplexTargets) - assert set(targets.marginal.keys()) == {"state", "age_group", "income_bracket"} - assert targets.marginal["state"]["CA"] == 200.0 - assert targets.marginal["state"]["NY"] == 300.0 - assert targets.marginal["state"]["TX"] == 400.0 - expected_income = float((seed["hh_weight"] * seed["income"]).sum()) - assert targets.continuous["income"] == expected_income - - 
def test_build_with_bootstrap_backend(self, persons, households): - config = USMicroplexBuildConfig( - n_synthetic=12, - synthesis_backend="bootstrap", - calibration_backend="entropy", - random_seed=7, - ) - result = build_us_microplex(persons, households, config) - - assert len(result.seed_data) == len(persons) - assert result.synthetic_data["household_id"].nunique() == 12 - assert result.calibrated_data["household_id"].nunique() == 12 - assert len(result.synthetic_data) > 12 - assert len(result.calibrated_data) > 12 - assert "weight" in result.calibrated_data.columns - assert result.calibration_summary["max_error"] < 0.05 - assert result.synthesizer is None - assert result.policyengine_tables is not None - assert result.source_frame is not None - assert result.fusion_plan is not None - assert len(result.policyengine_tables.households) == 12 - assert len(result.policyengine_tables.persons) == len(result.calibrated_data) - assert len(result.policyengine_tables.tax_units) > 0 - assert len(result.policyengine_tables.spm_units) > 0 - assert len(result.policyengine_tables.families) > 0 - assert len(result.policyengine_tables.marital_units) > 0 - assert result.synthesis_metadata["bootstrap_strata_columns"] == [] - - def test_bootstrap_infers_state_strata_from_target_scope(self, persons, households): - config = USMicroplexBuildConfig( - synthesis_backend="bootstrap", - policyengine_calibration_target_geo_levels=("state",), - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households) - - assert pipeline._resolve_bootstrap_strata_columns(seed) == ("state_fips",) - - def test_bootstrap_preserves_state_support_when_state_targets_are_requested(self): - seed = pd.DataFrame( - { - "household_id": [0, 1], - "person_id": [0, 1], - "hh_weight": [1000.0, 1.0], - "state_fips": [6, 36], - "age": [40, 41], - "sex": [1, 2], - "education": [3, 3], - "employment_status": [1, 1], - "tenure": [1, 1], - "income": [50_000.0, 60_000.0], - } - ) - 
pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - random_seed=1, - policyengine_calibration_target_geo_levels=("state",), - ) - ) - - synthetic = pipeline._synthesize_bootstrap( - seed, - initial_weight=1.0, - strata_columns=pipeline._resolve_bootstrap_strata_columns(seed), - ) - - assert synthetic["state_fips"].nunique() == 2 - - def test_bootstrap_explicit_missing_strata_column_raises(self, persons, households): - config = USMicroplexBuildConfig( - synthesis_backend="bootstrap", - bootstrap_strata_columns=("missing_geo",), - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households) - - with pytest.raises(ValueError, match="bootstrap_strata_columns"): - pipeline._resolve_bootstrap_strata_columns(seed) - - def test_build_with_synthesizer_backend(self, persons, households): - config = USMicroplexBuildConfig( - n_synthetic=10, - synthesis_backend="synthesizer", - calibration_backend="entropy", - synthesizer_epochs=5, - synthesizer_n_layers=2, - synthesizer_hidden_dim=16, - random_seed=11, - ) - result = build_us_microplex(persons, households, config) - - assert len(result.synthetic_data) == 10 - assert result.synthesizer is not None - assert result.synthesis_metadata["backend"] == "synthesizer" - assert result.synthesis_metadata["condition_vars"] == [ - "age", - "sex", - "education", - "employment_status", - "state_fips", - "tenure", - ] - assert result.synthesis_metadata["target_vars"] == ["income"] - assert result.fusion_plan is not None - assert set(result.fusion_plan.output_entities) == { - EntityType.HOUSEHOLD, - EntityType.PERSON, - } - assert (result.synthetic_data["income"] >= 0).all() - assert result.policyengine_tables is not None - - def test_build_policyengine_entity_tables(self, persons, households): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - seed = pipeline.prepare_seed_data(persons, households) - synthetic = 
pipeline._finalize_synthetic_population(seed, initial_weight=1.0) - - tables = pipeline.build_policyengine_entity_tables(synthetic) - - assert set(tables.households.columns) >= {"household_id", "household_weight"} - assert set(tables.persons.columns) >= { - "person_id", - "household_id", - "tax_unit_id", - "spm_unit_id", - "family_id", - "marital_unit_id", - } - assert set(tables.tax_units.columns) >= {"tax_unit_id", "household_id"} - assert set(tables.spm_units.columns) >= {"spm_unit_id", "household_id"} - assert set(tables.families.columns) >= {"family_id", "household_id"} - assert set(tables.marital_units.columns) >= {"marital_unit_id", "household_id"} - assert tables.persons["tax_unit_id"].notna().all() - assert tables.persons["spm_unit_id"].notna().all() - assert tables.persons["family_id"].notna().all() - assert tables.persons["marital_unit_id"].notna().all() - assert set(tables.tax_units["filing_status"]).issubset( - {"SINGLE", "JOINT", "SEPARATE", "HEAD_OF_HOUSEHOLD", "SURVIVING_SPOUSE"} - ) - - def test_build_policyengine_entity_tables_preserves_household_contract_inputs( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "weight": [1.0, 1.0, 2.0], - "age": [45, 12, 70], - "income": [60_000.0, 0.0, 25_000.0], - "relationship_to_head": [0, 2, 0], - "state_fips": [6, 6, 36], - "tenure": [1, 1, 2], - "tenure_type": ["OWNER_WITH_MORTGAGE", "OWNER_WITH_MORTGAGE", "RENTER"], - "net_worth": [300_000.0, 300_000.0, 50_000.0], - "auto_loan_balance": [12_000.0, 12_000.0, 0.0], - "auto_loan_interest": [600.0, 600.0, 0.0], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - households = tables.households.sort_values("household_id").reset_index( - drop=True - ) - - assert households["tenure_type"].tolist() == [ - "OWNER_WITH_MORTGAGE", - "RENTER", - ] - assert households["net_worth"].tolist() == [300_000.0, 50_000.0] - assert 
households["auto_loan_balance"].tolist() == [12_000.0, 0.0] - assert households["auto_loan_interest"].tolist() == [600.0, 0.0] - assert tables.spm_units.sort_values("household_id")[ - "spm_unit_tenure_type" - ].tolist() == [ - "OWNER_WITH_MORTGAGE", - "RENTER", - ] - - def test_build_policyengine_entity_tables_preserves_spm_source_inputs( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "spm_unit_id": [100, 100, 200], - "weight": [1.0, 1.0, 2.0], - "age": [45, 12, 70], - "income": [60_000.0, 0.0, 25_000.0], - "relationship_to_head": [0, 2, 0], - "receives_housing_assistance": [False, True, False], - "takes_up_housing_assistance_if_eligible": [False, True, False], - "takes_up_snap_if_eligible": [False, True, False], - "spm_unit_energy_subsidy": [90.0, 90.0, 0.0], - "spm_unit_pre_subsidy_childcare_expenses": [1500.0, 1500.0, 0.0], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - spm_units = tables.spm_units.sort_values("household_id").reset_index(drop=True) - - assert len(spm_units) == 2 - assert spm_units["receives_housing_assistance"].tolist() == [True, False] - assert spm_units["takes_up_housing_assistance_if_eligible"].tolist() == [ - True, - False, - ] - assert spm_units["takes_up_snap_if_eligible"].tolist() == [True, False] - assert spm_units["spm_unit_energy_subsidy"].tolist() == [90.0, 0.0] - assert spm_units["spm_unit_pre_subsidy_childcare_expenses"].tolist() == [ - 1500.0, - 0.0, - ] - - def test_build_policyengine_entity_tables_adds_deterministic_snap_takeup( - self, - monkeypatch, - ): - calls: list[tuple[str, int]] = [] - - def fake_load_takeup_rate(variable_name: str, year: int) -> float: - calls.append((variable_name, year)) - return 0.0 - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_takeup_rate", - fake_load_takeup_rate, - ) - pipeline = USMicroplexPipeline( - 
USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "spm_unit_id": [100, 100, 200], - "weight": [1.0, 1.0, 2.0], - "age": [45, 12, 70], - "income": [60_000.0, 0.0, 25_000.0], - "relationship_to_head": [0, 2, 0], - "takes_up_aca_if_eligible": [True, True, True], - "would_file_taxes_voluntarily": [False, False, False], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - spm_units = tables.spm_units.sort_values("household_id").reset_index(drop=True) - - assert calls == [ - ("head_start", 2024), - ("early_head_start", 2024), - ("dc_ptc", 2024), - ("snap", 2024), - ("tanf", 2024), - ] - assert spm_units["takes_up_snap_if_eligible"].tolist() == [False, False] - - def test_build_policyengine_entity_tables_recomputes_child_count_contract_inputs( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 10, 10, 20], - "weight": [1.0, 1.0, 1.0, 2.0], - "age": [45, 4, 17, 18], - "income": [60_000.0, 0.0, 0.0, 25_000.0], - "relationship_to_head": [0, 2, 2, 0], - "count_under_18": [99, 99, 99, 99], - "count_under_6": [99, 99, 99, 99], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - persons = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert persons["count_under_18"].tolist() == [2, 2, 2, 0] - assert persons["count_under_6"].tolist() == [1, 1, 1, 0] - - def test_build_policyengine_entity_tables_uses_household_level_spm_fallback( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 10, 10, 10], - "weight": [1.0, 1.0, 1.0, 1.0], - "age": [45, 43, 12, 30], - "income": [60_000.0, 15_000.0, 0.0, 20_000.0], - "relationship_to_head": [0, 1, 2, 3], - "marital_status": [1, 1, 7, 7], - "state_fips": [6, 6, 6, 6], - 
"tenure": [1, 1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tables.spm_units) == 1 - assert person_rows["spm_unit_id"].nunique() == 1 - assert len(tables.families) == 2 - assert person_rows["family_id"].nunique() == 2 - assert person_rows.loc[:2, "family_id"].nunique() == 1 - assert person_rows.loc[3, "family_id"] != person_rows.loc[0, "family_id"] - - def test_build_policyengine_entity_tables_uses_family_relationship_for_family_units( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4, 5], - "household_id": [10, 10, 10, 10, 10], - "weight": [1.0, 1.0, 1.0, 1.0, 1.0], - "age": [45, 43, 12, 70, 30], - "income": [60_000.0, 15_000.0, 0.0, 5_000.0, 20_000.0], - "relationship_to_head": [0, 1, 2, 3, 3], - "family_relationship": [1, 2, 3, 4, 0], - "marital_status": [1, 1, 7, 4, 7], - "state_fips": [6, 6, 6, 6, 6], - "tenure": [1, 1, 1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tables.spm_units) == 1 - assert len(tables.families) == 2 - assert person_rows.loc[:3, "family_id"].nunique() == 1 - assert person_rows.loc[4, "family_id"] != person_rows.loc[0, "family_id"] - - def test_build_policyengine_entity_tables_preserves_complete_existing_group_ids( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 10, 10, 20], - "weight": [1.0, 1.0, 1.0, 2.0], - "age": [45, 12, 30, 70], - "income": [60_000.0, 0.0, 20_000.0, 25_000.0], - "relationship_to_head": [0, 2, 3, 0], - "family_id": [1, 1, 2, 1], - "spm_unit_id": [1, 2, 2, 1], - "marital_unit_id": [1, 2, 3, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - 
person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tables.families) == 3 - assert len(tables.spm_units) == 3 - assert len(tables.marital_units) == 4 - assert person_rows.loc[0, "family_id"] == person_rows.loc[1, "family_id"] - assert person_rows.loc[0, "family_id"] != person_rows.loc[3, "family_id"] - assert person_rows.loc[1, "spm_unit_id"] == person_rows.loc[2, "spm_unit_id"] - assert person_rows.loc[0, "spm_unit_id"] != person_rows.loc[3, "spm_unit_id"] - assert person_rows["marital_unit_id"].nunique() == 4 - - def test_build_policyengine_entity_tables_derives_is_household_head(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "weight": [1.0, 1.0, 2.0], - "age": [45, 12, 70], - "income": [60_000.0, 0.0, 25_000.0], - "relationship_to_head": [0, 2, 0], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - persons = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert persons["is_household_head"].tolist() == [True, False, True] - - def test_build_policyengine_entity_tables_derives_tax_input_columns(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 43], - "sex": [1, 2], - "race": [4, 2], - "hispanic": [2, 1], - "income": [60_000.0, 15_000.0], - "wage_income": [50_000.0, 10_000.0], - "self_employment_income": [5_000.0, 0.0], - "taxable_interest_income": [100.0, 20.0], - "ordinary_dividend_income": [80.0, 30.0], - "qualified_dividend_income": [30.0, 5.0], - "short_term_capital_gains": [10.0, 0.0], - "long_term_capital_gains": [40.0, 5.0], - "rental_income": [200.0, 0.0], - "gross_social_security": [0.0, 800.0], - "ssi": [0.0, 600.0], - "taxable_pension_income": [0.0, 300.0], - "unemployment_compensation": [0.0, 150.0], - "medicaid": [0.0, 
1_250.0], - "medicaid_enrolled": [False, True], - "health_insurance_premiums_without_medicare_part_b": [120.0, 80.0], - "state_income_tax_paid": [400.0, 50.0], - "filing_status": ["JOINT", "JOINT"], - "relationship_to_head": [0, 1], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_unit_rows = tables.tax_units.sort_values("household_id").reset_index( - drop=True - ) - - assert person_rows["employment_income_before_lsr"].tolist() == [ - 50_000.0, - 10_000.0, - ] - assert person_rows["self_employment_income_before_lsr"].tolist() == [ - 5_000.0, - 0.0, - ] - assert person_rows["taxable_interest_income"].tolist() == [100.0, 20.0] - assert person_rows["dividend_income"].tolist() == [80.0, 30.0] - assert person_rows["qualified_dividend_income"].tolist() == [30.0, 5.0] - assert person_rows["non_qualified_dividend_income"].tolist() == [50.0, 25.0] - assert person_rows["short_term_capital_gains"].tolist() == [10.0, 0.0] - assert person_rows["long_term_capital_gains_before_response"].tolist() == [ - 40.0, - 5.0, - ] - assert person_rows["social_security_retirement"].tolist() == [0.0, 800.0] - assert person_rows["ssi"].tolist() == [0.0, 600.0] - assert person_rows["takes_up_ssi_if_eligible"].tolist() == [False, True] - assert person_rows["taxable_private_pension_income"].tolist() == [0.0, 300.0] - assert person_rows["unemployment_compensation"].tolist() == [0.0, 150.0] - assert person_rows["is_female"].tolist() == [False, True] - assert person_rows["cps_race"].tolist() == [4, 2] - assert person_rows["is_hispanic"].tolist() == [False, True] - assert person_rows["medicaid"].tolist() == [0.0, 1_250.0] - assert person_rows["medicaid_enrolled"].tolist() == [False, True] - assert ( - tax_unit_rows["health_insurance_premiums_without_medicare_part_b"].sum() - == 200.0 - ) - assert person_rows["state_income_tax_reported"].tolist() == 
[400.0, 50.0] - - def test_build_policyengine_entity_tables_adds_deterministic_aca_takeup( - self, - monkeypatch, - ): - calls: list[tuple[str, int]] = [] - - def fake_load_takeup_rate(variable_name: str, year: int) -> float: - calls.append((variable_name, year)) - return 0.0 - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_takeup_rate", - fake_load_takeup_rate, - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 20, 30], - "weight": [1.0, 1.0, 1.0], - "age": [34, 42, 29], - "sex": [2, 1, 2], - "income": [40_000.0, 65_000.0, 32_000.0], - "filing_status": ["SINGLE", "SINGLE", "SINGLE"], - "relationship_to_head": [0, 0, 0], - "state_fips": [6, 12, 48], - "tenure": [1, 1, 1], - "has_marketplace_health_coverage": [True, False, True], - "takes_up_snap_if_eligible": [True, True, True], - "would_file_taxes_voluntarily": [False, False, False], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - - tax_units = tables.tax_units.sort_values("household_id").reset_index(drop=True) - assert calls == [ - ("head_start", 2024), - ("early_head_start", 2024), - ("aca", 2024), - ("dc_ptc", 2024), - ("tanf", 2024), - ] - assert tax_units["takes_up_aca_if_eligible"].tolist() == [ - False, - False, - False, - ] - - def test_build_policyengine_entity_tables_preserves_explicit_aca_takeup(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 20], - "weight": [1.0, 1.0], - "age": [34, 42], - "sex": [2, 1], - "income": [40_000.0, 65_000.0], - "filing_status": ["SINGLE", "SINGLE"], - "relationship_to_head": [0, 0], - "state_fips": [6, 12], - "tenure": [1, 1], - "has_marketplace_health_coverage": [False, True], - "takes_up_aca_if_eligible": [True, False], - } - ) - - tables = 
pipeline.build_policyengine_entity_tables(population) - - tax_units = tables.tax_units.sort_values("household_id").reset_index(drop=True) - assert tax_units["takes_up_aca_if_eligible"].tolist() == [True, False] - - def test_attach_policyengine_marketplace_ratio_materializes_intermediates( - self, - monkeypatch, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 20], - "tax_unit_id": [100, 200], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [10, 20], - "health_insurance_premiums_without_medicare_part_b": [ - 300.0, - 50.0, - ], - "takes_up_aca_if_eligible": [True, True], - } - ), - ) - - captured_variables: list[tuple[str, ...]] = [] - - def fake_materialize(tables_arg, *, variables, **kwargs): - captured_variables.append(tuple(variables)) - tax_units = tables_arg.tax_units.copy() - tax_units["aca_ptc"] = [700.0, 0.0] - tax_units["slcsp"] = [1_000.0, 1_000.0] - return PolicyEngineUSVariableMaterializationResult( - tables=PolicyEngineUSEntityTableBundle( - households=tables_arg.households, - persons=tables_arg.persons, - tax_units=tax_units, - spm_units=tables_arg.spm_units, - families=tables_arg.families, - marital_units=tables_arg.marital_units, - ), - bindings={ - "aca_ptc": PolicyEngineUSVariableBinding( - entity=EntityType.TAX_UNIT, - column="aca_ptc", - ), - "slcsp": PolicyEngineUSVariableBinding( - entity=EntityType.TAX_UNIT, - column="slcsp", - ), - }, - materialized_variables=tuple(variables), - ) - - monkeypatch.setattr( - us_pipeline_module, - "materialize_policyengine_us_variables_safely", - fake_materialize, - ) - - updated = pipeline._attach_policyengine_marketplace_plan_benchmark_ratio( - tables, - target_period=2024, - ) - - assert 
captured_variables == [("aca_ptc", "slcsp")] - np.testing.assert_allclose( - updated.tax_units["selected_marketplace_plan_benchmark_ratio"], - np.array([1.0, 0.5]), - ) - - def test_build_policyengine_entity_tables_adds_ecps_stochastic_takeup_inputs( - self, - monkeypatch, - ): - scalar_calls: list[tuple[str, int]] = [] - medicaid_calls: list[int] = [] - pregnancy_calls: list[int] = [] - eitc_calls: list[int] = [] - voluntary_calls: list[int] = [] - - def fake_load_takeup_rate(variable_name: str, year: int) -> float: - scalar_calls.append((variable_name, year)) - return { - "head_start": 0.0, - "early_head_start": 1.0, - "dc_ptc": 1.0, - "snap": 1.0, - "tanf": 0.0, - "aca": 1.0, - }[variable_name] - - def fake_load_medicaid_rates(year: int) -> dict[str, float]: - medicaid_calls.append(year) - return {"CA": 0.0, "TX": 1.0} - - def fake_load_pregnancy_rates(year: int) -> dict[str, float]: - pregnancy_calls.append(year) - return {"CA": 1.0, "TX": 0.0} - - def fake_load_eitc_rates(year: int) -> dict[int, float]: - eitc_calls.append(year) - return {0: 0.0, 1: 1.0, 2: 1.0, 3: 1.0} - - def fake_load_voluntary_rates( - year: int, - ) -> dict[str, dict[str, dict[str, float]]]: - voluntary_calls.append(year) - return { - children: { - wage: {age: 1.0 for age in ("under_65", "age_65_plus")} - for wage in ("zero", "low", "medium", "high") - } - for children in ("no_children", "with_children") - } - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_takeup_rate", - fake_load_takeup_rate, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_medicaid_takeup_rates", - fake_load_medicaid_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_pregnancy_rates", - fake_load_pregnancy_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_eitc_takeup_rates", - fake_load_eitc_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_voluntary_filing_rates", - fake_load_voluntary_rates, - ) - pipeline = 
USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 20, 20], - "spm_unit_id": [100, 200, 200], - "weight": [1.0, 1.0, 1.0], - "age": [34, 42, 8], - "sex": [2, 1, 2], - "income": [40_000.0, 35_000.0, 0.0], - "relationship_to_head": [0, 0, 2], - "state_fips": [6, 48, 48], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - - persons = tables.persons.sort_values("person_id").reset_index(drop=True) - assert persons["takes_up_medicaid_if_eligible"].tolist() == [ - False, - True, - True, - ] - assert persons["takes_up_head_start_if_eligible"].tolist() == [ - False, - False, - False, - ] - assert persons["takes_up_early_head_start_if_eligible"].tolist() == [ - True, - True, - True, - ] - assert persons["is_pregnant"].tolist() == [True, False, False] - - tax_units = tables.tax_units.sort_values("household_id").reset_index(drop=True) - assert tax_units["takes_up_aca_if_eligible"].tolist() == [True, True] - assert tax_units["takes_up_dc_ptc"].tolist() == [True, True] - assert tax_units["takes_up_eitc"].tolist() == [False, True] - assert tax_units["would_file_taxes_voluntarily"].tolist() == [True, False] - - spm_units = tables.spm_units.sort_values("household_id").reset_index(drop=True) - assert spm_units["takes_up_snap_if_eligible"].tolist() == [True, True] - assert spm_units["takes_up_tanf_if_eligible"].tolist() == [False, False] - assert scalar_calls == [ - ("head_start", 2024), - ("early_head_start", 2024), - ("aca", 2024), - ("dc_ptc", 2024), - ("snap", 2024), - ("tanf", 2024), - ] - assert medicaid_calls == [2024] - assert pregnancy_calls == [2024] - assert eitc_calls == [2024] - assert voluntary_calls == [2024] - - def test_attach_policyengine_pregnancy_inputs_assigns_eligible_females( - self, - monkeypatch, - ): - class FakeRng: - def random(self, size: int) -> np.ndarray: - return np.zeros(size) - - monkeypatch.setattr( - 
us_pipeline_module, - "_load_microplex_pregnancy_rates", - lambda year: {"CA": 0.10, "NY": 0.0}, - ) - monkeypatch.setattr( - us_pipeline_module, - "_microplex_seeded_rng", - lambda variable_name, *, salt=None: FakeRng(), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - persons = pd.DataFrame( - { - "age": [20, 44, 45, 30, 20], - "sex": [2, 2, 2, 1, 2], - "state_fips": [6, 36, 6, 6, 99], - } - ) - - result = pipeline._attach_policyengine_pregnancy_inputs(persons) - - assert result["is_pregnant"].tolist() == [ - True, - False, - False, - False, - True, - ] - - def test_attach_policyengine_pregnancy_inputs_preserves_explicit_column( - self, - monkeypatch, - ): - def fail_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected pregnancy rate load: {year}") - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_pregnancy_rates", - fail_rates, - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - persons = pd.DataFrame({"is_pregnant": [1, 0, True, False]}) - - result = pipeline._attach_policyengine_pregnancy_inputs(persons) - - assert result["is_pregnant"].tolist() == [True, False, True, False] - - def test_build_policyengine_entity_tables_adds_wic_takeup_inputs( - self, - monkeypatch, - ): - wic_takeup_calls: list[int] = [] - wic_risk_calls: list[int] = [] - - def fake_wic_takeup_rates(year: int) -> dict[str, float]: - wic_takeup_calls.append(year) - return { - "PREGNANT": 0.0, - "POSTPARTUM": 1.0, - "BREASTFEEDING": 0.0, - "INFANT": 1.0, - "CHILD": 0.0, - "NONE": 0.0, - } - - def fake_wic_risk_rates(year: int) -> dict[str, float]: - wic_risk_calls.append(year) - return { - "PREGNANT": 0.0, - "POSTPARTUM": 0.0, - "BREASTFEEDING": 0.0, - "INFANT": 0.0, - "CHILD": 1.0, - "NONE": 0.0, - } - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_wic_takeup_rates", - fake_wic_takeup_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - 
"_load_microplex_wic_nutritional_risk_rates", - fake_wic_risk_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_pregnancy_rates", - lambda year: {}, - ) - monkeypatch.setattr( - us_pipeline_module, - "_microplex_seeded_rng", - lambda variable_name, *, salt=None: np.random.default_rng(0), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 10, 30, 40], - "family_id": [10, 10, 30, 40], - "spm_unit_id": [10, 10, 30, 40], - "weight": [1.0, 1.0, 1.0, 1.0], - "age": [30, 0, 4, 40], - "sex": [2, 1, 2, 1], - "income": [40_000.0, 0.0, 0.0, 35_000.0], - "relationship_to_head": [0, 2, 0, 0], - "state_fips": [6, 6, 6, 6], - "own_children_in_household": [1, 0, 0, 0], - "receives_wic": [False, True, False, False], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - - persons = tables.persons.sort_values("person_id").reset_index(drop=True) - assert persons["would_claim_wic"].tolist() == [True, True, False, False] - assert persons["is_wic_at_nutritional_risk"].tolist() == [ - False, - True, - True, - False, - ] - assert wic_takeup_calls == [2024] - assert wic_risk_calls == [2024] - - def test_build_policyengine_entity_tables_preserves_explicit_stochastic_takeup_inputs( - self, - monkeypatch, - ): - def fail_scalar_rate(variable_name: str, year: int) -> float: - raise AssertionError(f"unexpected scalar rate load: {variable_name} {year}") - - def fail_medicaid_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected Medicaid rate load: {year}") - - def fail_pregnancy_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected pregnancy rate load: {year}") - - def fail_eitc_rates(year: int) -> dict[int, float]: - raise AssertionError(f"unexpected EITC rate load: {year}") - - def fail_voluntary_rates(year: int) -> dict: - raise AssertionError(f"unexpected voluntary filing 
rate load: {year}") - - def fail_wic_takeup_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected WIC take-up rate load: {year}") - - def fail_wic_risk_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected WIC nutritional-risk rate load: {year}") - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_takeup_rate", - fail_scalar_rate, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_medicaid_takeup_rates", - fail_medicaid_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_pregnancy_rates", - fail_pregnancy_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_eitc_takeup_rates", - fail_eitc_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_voluntary_filing_rates", - fail_voluntary_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_wic_takeup_rates", - fail_wic_takeup_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_wic_nutritional_risk_rates", - fail_wic_risk_rates, - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "spm_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [34, 8], - "sex": [2, 2], - "income": [40_000.0, 0.0], - "relationship_to_head": [0, 2], - "state_fips": [6, 6], - "takes_up_medicaid_if_eligible": [False, True], - "is_pregnant": [False, True], - "takes_up_head_start_if_eligible": [False, True], - "takes_up_early_head_start_if_eligible": [True, False], - "takes_up_aca_if_eligible": [False, True], - "takes_up_dc_ptc": [False, True], - "takes_up_eitc": [False, True], - "would_file_taxes_voluntarily": [True, False], - "takes_up_snap_if_eligible": [False, True], - "takes_up_tanf_if_eligible": [True, False], - "would_claim_wic": [False, True], - "is_wic_at_nutritional_risk": [True, False], - } - ) - - tables = 
pipeline.build_policyengine_entity_tables(population) - - persons = tables.persons.sort_values("person_id").reset_index(drop=True) - assert persons["takes_up_medicaid_if_eligible"].tolist() == [False, True] - assert persons["is_pregnant"].tolist() == [False, True] - assert persons["takes_up_head_start_if_eligible"].tolist() == [False, True] - assert persons["takes_up_early_head_start_if_eligible"].tolist() == [ - True, - False, - ] - assert persons["would_claim_wic"].tolist() == [False, True] - assert persons["is_wic_at_nutritional_risk"].tolist() == [True, False] - - tax_units = tables.tax_units.sort_values("household_id").reset_index(drop=True) - assert tax_units["takes_up_aca_if_eligible"].tolist() == [True] - assert tax_units["takes_up_dc_ptc"].tolist() == [True] - assert tax_units["takes_up_eitc"].tolist() == [True] - assert tax_units["would_file_taxes_voluntarily"].tolist() == [True] - - spm_units = tables.spm_units.sort_values("household_id").reset_index(drop=True) - assert spm_units["takes_up_snap_if_eligible"].tolist() == [True] - assert spm_units["takes_up_tanf_if_eligible"].tolist() == [True] - - def test_build_policyengine_entity_tables_uses_eitc_children_for_eitc_takeup( - self, - monkeypatch, - ): - eitc_calls: list[int] = [] - - def fail_scalar_rate(variable_name: str, year: int) -> float: - raise AssertionError(f"unexpected scalar rate load: {variable_name} {year}") - - def fail_medicaid_rates(year: int) -> dict[str, float]: - raise AssertionError(f"unexpected Medicaid rate load: {year}") - - def fake_eitc_rates(year: int) -> dict[int, float]: - eitc_calls.append(year) - return {0: 0.0, 1: 1.0, 2: 1.0, 3: 1.0} - - def fail_voluntary_rates(year: int) -> dict: - raise AssertionError(f"unexpected voluntary filing rate load: {year}") - - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_takeup_rate", - fail_scalar_rate, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_medicaid_takeup_rates", - fail_medicaid_rates, - ) - 
monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_eitc_takeup_rates", - fake_eitc_rates, - ) - monkeypatch.setattr( - us_pipeline_module, - "_load_microplex_voluntary_filing_rates", - fail_voluntary_rates, - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - population = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "spm_unit_id": [100], - "weight": [1.0], - "age": [34], - "sex": [2], - "income": [40_000.0], - "relationship_to_head": [0], - "state_fips": [6], - "eitc_children": [1], - "eitc_child_count": [0], - "takes_up_medicaid_if_eligible": [True], - "takes_up_head_start_if_eligible": [False], - "takes_up_early_head_start_if_eligible": [False], - "takes_up_aca_if_eligible": [True], - "takes_up_dc_ptc": [False], - "would_file_taxes_voluntarily": [False], - "takes_up_snap_if_eligible": [True], - "takes_up_tanf_if_eligible": [False], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - - tax_units = tables.tax_units.sort_values("household_id").reset_index(drop=True) - assert tax_units["takes_up_eitc"].tolist() == [True] - assert "_mp_eitc_child_count_for_takeup" not in tax_units.columns - assert eitc_calls == [2024] - - def test_build_policyengine_entity_tables_fallback_employment_excludes_transfer_income( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "weight": [1.0], - "age": [62], - "sex": [2], - "income": [18_000.0], - "ssi": [9_000.0], - "public_assistance": [3_000.0], - "gross_social_security": [2_000.0], - "filing_status": ["SINGLE"], - "relationship_to_head": [0], - "state_fips": [6], - "tenure": [1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_row = tables.persons.iloc[0] - - assert person_row["employment_income_before_lsr"] == 4_000.0 - assert person_row["ssi"] == 9_000.0 - assert 
person_row["social_security_retirement"] == 2_000.0 - - def test_build_policyengine_entity_tables_allocates_social_security_residual_to_retirement( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "weight": [1.0], - "age": [62], - "sex": [2], - "income": [2_000.0], - "gross_social_security": [2_000.0], - "social_security_disability": [500.0], - "filing_status": ["SINGLE"], - "relationship_to_head": [0], - "state_fips": [6], - "tenure": [1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_row = tables.persons.iloc[0] - - assert person_row["social_security_retirement"] == 1_500.0 - assert person_row["social_security_disability"] == 500.0 - - def test_build_policyengine_entity_tables_derives_dividend_totals_from_atomic_components( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "weight": [1.0], - "age": [45], - "income": [60_000.0], - "wage_income": [50_000.0], - "ordinary_dividend_income": [50.0], - "dividend_income": [0.0], - "qualified_dividend_income": [30.0], - "non_qualified_dividend_income": [12.0], - "filing_status": ["SINGLE"], - "relationship_to_head": [0], - "state_fips": [6], - "tenure": [1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_row = tables.persons.iloc[0] - - assert person_row["qualified_dividend_income"] == 30.0 - assert person_row["non_qualified_dividend_income"] == 12.0 - assert person_row["ordinary_dividend_income"] == 42.0 - assert person_row["dividend_income"] == 42.0 - - def test_build_policyengine_entity_tables_derives_relationships_from_family_relationship( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 
12], - "income": [60_000.0, 15_000.0, 0.0], - "family_relationship": [0, 1, 2], - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["relationship_to_head"].tolist() == [0, 1, 2] - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_derives_relationships_from_one_based_family_relationship( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "family_relationship": [1, 2, 3], - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["relationship_to_head"].tolist() == [0, 1, 2] - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_uses_spouse_and_dependent_flags_when_relationship_missing( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "is_spouse": [0, 1, 0], - "is_dependent": [0, 0, 1], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = 
pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["relationship_to_head"].tolist() == [0, 1, 2] - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_prefers_richer_family_relationship_over_collapsed_relationship_to_head( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "family_relationship": [0, 1, 2], - "relationship_to_head": [0, 3, 3], - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["relationship_to_head"].tolist() == [0, 1, 2] - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_repairs_households_without_a_head(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "relationship_to_head": [1, 1, 2], - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = 
tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["relationship_to_head"].tolist() == [0, 1, 2] - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_marks_separated_head_as_separate(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "marital_status": [6, 7], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "SEPARATE" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_splits_separated_spouses_into_two_units( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "marital_status": [6, 6], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tax_units) == 2 - assert tax_units["filing_status"].tolist() == ["SEPARATE", "SEPARATE"] - assert person_rows["tax_unit_id"].nunique() == 2 - - def test_build_policyengine_entity_tables_splits_separated_spouses_and_keeps_dependents_with_head( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - 
"person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "relationship_to_head": [0, 1, 2], - "marital_status": [6, 6, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tax_units) == 2 - assert tax_units.iloc[0]["filing_status"] == "SEPARATE" - assert tax_units.iloc[0]["n_dependents"] == 1 - assert tax_units.iloc[1]["filing_status"] == "SEPARATE" - assert tax_units.iloc[1]["n_dependents"] == 0 - dependent_tax_unit_id = int( - person_rows.loc[person_rows["person_id"] == 3, "tax_unit_id"].iloc[0] - ) - assert dependent_tax_unit_id == int(tax_units.iloc[0]["tax_unit_id"]) - - def test_build_policyengine_entity_tables_splits_spouse_coded_pair_without_marriage_evidence( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "marital_status": [7, 7], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - - assert len(tax_units) == 2 - assert tax_units["filing_status"].tolist() == ["SINGLE", "SINGLE"] - assert person_rows["tax_unit_id"].nunique() == 2 - - def test_build_policyengine_entity_tables_marks_widowed_head_with_child_as_surviving_spouse( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - 
"weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "marital_status": [4, 7], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "SURVIVING_SPOUSE" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_prefers_explicit_head_of_household_code( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "marital_status": [5, 7], - "filing_status_code": [4, np.nan], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "HEAD_OF_HOUSEHOLD" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_preserves_explicit_role_flag_head_of_household_code( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 0.0], - "tax_unit_count_dependents": [1.0, 1.0], - "is_tax_unit_head": [1.0, 0.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "filing_status_code": [4, np.nan], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = 
pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "HEAD_OF_HOUSEHOLD" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_does_not_promote_non_hoh_role_flag_codes( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 10, 20, 20], - "tax_unit_id": [100, 100, 200, 200], - "weight": [1.0, 1.0, 1.0, 1.0], - "age": [45, 12, 44, 10], - "income": [60_000.0, 0.0, 55_000.0, 0.0], - "relationship_to_head": [0, 2, 0, 2], - "person_number": [1, 2, 1, 2], - "spouse_person_number": [0, 0, 0, 0], - "tax_unit_is_joint": [0.0, 0.0, 0.0, 0.0], - "tax_unit_count_dependents": [1.0, 1.0, 1.0, 1.0], - "is_tax_unit_head": [1.0, 0.0, 1.0, 0.0], - "is_tax_unit_spouse": [0.0, 0.0, 0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0, 0.0, 1.0], - "filing_status_code": [1, np.nan, 2, np.nan], - "state_fips": [6, 6, 6, 6], - "tenure": [1, 1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert tax_units["filing_status"].tolist() == ["SINGLE", "SINGLE"] - assert tax_units["n_dependents"].tolist() == [1, 1] - - def test_build_policyengine_entity_tables_does_not_infer_head_of_household_from_marital_status_alone( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "marital_status": [5, 7], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = 
tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert tax_units.iloc[0]["filing_status"] == "SINGLE" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_can_preserve_existing_tax_unit_ids(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "tax_unit_id": [100, 100, 200], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "relationship_to_head": [0, 1, 2], - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["tax_unit_id"].tolist() == [100, 100, 200] - assert tax_units["tax_unit_id"].tolist() == [100, 200] - assert tax_units["filing_status"].tolist() == ["JOINT", "SINGLE"] - assert tax_units["n_dependents"].tolist() == [0, 0] - - def test_build_policyengine_entity_tables_prefers_tax_unit_role_flags_over_bad_ids( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "tax_unit_id": [100, 101, 102], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "relationship_to_head": [0, 1, 2], - "marital_status": [1, 1, 7], - "person_number": [1, 2, 3], - "spouse_person_number": [2, 1, 0], - "tax_unit_is_joint": [1.0, 1.0, 1.0], - "tax_unit_count_dependents": [1.0, 1.0, 1.0], - "is_tax_unit_head": [1.0, 0.0, 0.0], - "is_tax_unit_spouse": [0.0, 1.0, 0.0], - "is_tax_unit_dependent": [0.0, 0.0, 1.0], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = 
pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert person_rows["tax_unit_id"].nunique() == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_microunit_overrides_bad_cps_tax_unit_ids( - self, - ): - # microunit is the DEFAULT tax-unit constructor: when the high-fidelity CPS - # fields (person_number + family_relationship) are present it re-partitions - # the household and intentionally REPLACES the unreliable CPS-provided - # tax_unit_id (Census TAX_ID) -- even though - # policyengine_prefer_existing_tax_unit_ids defaults to True (that path is a - # fallback for households microunit does not construct, not a competing - # authority). This locks in "replace the CPS tax units, keep the SPM units". - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - assert pipeline.config.policyengine_prefer_existing_tax_unit_ids is True - population = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - # CPS TAX_ID nonsensically splits the dependent child into its own unit. - "tax_unit_id": [100, 100, 200], - # SPM units, by contrast, must be preserved. 
- "spm_unit_id": [500, 500, 500], - "weight": [1.0, 1.0, 1.0], - "age": [45, 43, 12], - "income": [60_000.0, 15_000.0, 0.0], - "person_number": [1, 2, 3], - "spouse_person_number": [2, 1, 0], - "family_relationship": [1, 2, 3], # CPS A_FAMREL: ref, spouse, child - "marital_status": [1, 1, 7], - "state_fips": [6, 6, 6], - "tenure": [1, 1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units - - # microunit folds couple + child into ONE unit, discarding the [100,100,200] - # split (which preservation would have kept as two units). - assert len(tax_units) == 1 - assert person_rows["tax_unit_id"].nunique() == 1 - assert tax_units.iloc[0]["n_dependents"] == 1 - # The SPM unit is untouched (replace tax, keep SPM). - assert person_rows["spm_unit_id"].nunique() == 1 - - def test_build_policyengine_entity_tables_resolves_spouse_head_role_conflicts( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "family_relationship": [1, 2], - "person_number": [1, 2], - "spouse_person_number": [2, 1], - "tax_unit_is_joint": [1.0, 1.0], - "tax_unit_count_dependents": [0.0, 0.0], - "is_tax_unit_head": [1.0, 1.0], - "is_tax_unit_spouse": [0.0, 1.0], - "is_tax_unit_dependent": [0.0, 0.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 1 - assert person_rows["tax_unit_id"].nunique() == 1 - assert tax_units.iloc[0]["filing_status"] == "JOINT" - - def 
test_build_policyengine_entity_tables_resolves_dependent_head_role_conflicts( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "family_relationship": [1, 3], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 0.0], - "tax_unit_count_dependents": [1.0, 1.0], - "is_tax_unit_head": [1.0, 1.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - # filing_status is PE-computed (delegated; microplex does not export it), - # so only microunit's partition is asserted here. 
- assert len(tax_units) == 1 - assert person_rows["tax_unit_id"].nunique() == 1 - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_resolves_spouse_dependent_role_conflicts( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "family_relationship": [1, 3], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 1.0], - "tax_unit_count_dependents": [1.0, 1.0], - "is_tax_unit_head": [1.0, 0.0], - "is_tax_unit_spouse": [0.0, 1.0], - "is_tax_unit_dependent": [0.0, 1.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - # filing_status delegated to PE; assert only microunit's partition. 
- assert len(tax_units) == 1 - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_repairs_missing_role_flag_heads(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "family_relationship": [1, 3], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 0.0], - "tax_unit_count_dependents": [1.0, 1.0], - "is_tax_unit_head": [0.0, 0.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - # filing_status delegated to PE; assert only microunit's partition. - assert len(tax_units) == 1 - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_folds_young_head_hint_dependents(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 22], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "family_relationship": [1, 3], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 0.0], - "tax_unit_count_dependents": [1.0, 1.0], - "is_tax_unit_head": [1.0, 1.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 0.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - # microunit applies the real qualifying-child age rule: a 19+ non-student - # 
own-child is NOT folded as a dependent (it gets its own tax unit), unlike - # the legacy role-flag heuristic. Threading student enrollment (A_HSCOL) so - # the qualifying-child-to-24 student extension fires is a tracked follow-up. - assert len(tax_units) == 2 - assert int(tax_units["n_dependents"].sum()) == 0 - - def test_build_policyengine_entity_tables_uses_legacy_path_without_cps_fields( - self, - ): - # Without the high-fidelity CPS fields (person_number/family_relationship), - # microunit cannot construct, so the legacy role-flag reconstruction (the - # fallback) handles the conflict. Preserves coverage of that path now that - # the real-data path defaults to microunit. - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 12], - "income": [60_000.0, 0.0], - "relationship_to_head": [0, 2], - "is_tax_unit_head": [1.0, 1.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units - assert len(tax_units) == 1 - assert tax_units.iloc[0]["n_dependents"] == 1 - - def test_build_policyengine_entity_tables_keeps_positive_income_adult_heads(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 101], - "weight": [1.0, 1.0], - "age": [45, 25], - "income": [60_000.0, 20_000.0], - "relationship_to_head": [0, 2], - "family_relationship": [1, 3], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "tax_unit_is_joint": [0.0, 0.0], - "tax_unit_count_dependents": [0.0, 0.0], - "is_tax_unit_head": [1.0, 1.0], - "is_tax_unit_spouse": [0.0, 0.0], - "is_tax_unit_dependent": [0.0, 0.0], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) 
- - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert len(tax_units) == 2 - assert tax_units["filing_status"].tolist() == ["SINGLE", "SINGLE"] - assert tax_units["n_dependents"].tolist() == [0, 0] - - def test_build_policyengine_entity_tables_preserves_tax_unit_agi_inputs(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "filing_status": ["JOINT", "JOINT"], - "domestic_production_ald": [7.0, 2.0], - "health_savings_account_ald": [60.0, 15.0], - "recapture_of_investment_credit": [3.0, 4.0], - "self_employed_health_insurance_ald": [20.0, 5.0], - "self_employed_pension_contribution_ald": [30.0, 10.0], - "unrecaptured_section_1250_gain": [11.0, 13.0], - "unreported_payroll_tax": [17.0, 19.0], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert tax_units["domestic_production_ald"].tolist() == [9.0] - assert tax_units["health_savings_account_ald"].tolist() == [75.0] - assert tax_units["recapture_of_investment_credit"].tolist() == [7.0] - assert tax_units["self_employed_health_insurance_ald"].tolist() == [25.0] - assert tax_units["self_employed_pension_contribution_ald"].tolist() == [40.0] - assert tax_units["unrecaptured_section_1250_gain"].tolist() == [24.0] - assert tax_units["unreported_payroll_tax"].tolist() == [36.0] - - def test_build_policyengine_entity_tables_deduplicates_repeated_tax_unit_ald_values( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - 
"person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "filing_status": ["JOINT", "JOINT"], - "self_employed_pension_contribution_ald": [30.0, 30.0], - "unrecaptured_section_1250_gain": [50.0, 50.0], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert tax_units["self_employed_pension_contribution_ald"].tolist() == [30.0] - assert tax_units["unrecaptured_section_1250_gain"].tolist() == [50.0] - - def test_build_policyengine_entity_tables_preserved_tax_units_require_reciprocal_spouse_pointer_for_joint( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - "relationship_to_head": [0, 1], - "person_number": [1, 2], - "spouse_person_number": [0, 0], - "marital_status": [5, 7], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert tax_units["tax_unit_id"].tolist() == [100] - assert tax_units["filing_status"].tolist() == ["SINGLE"] - assert tax_units["n_dependents"].tolist() == [1] - - def test_build_policyengine_entity_tables_preserved_tax_units_keep_joint_for_reciprocal_spouse_pointer( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 43], - "income": [60_000.0, 15_000.0], - 
"relationship_to_head": [0, 1], - "person_number": [1, 2], - "spouse_person_number": [2, 1], - "marital_status": [1, 1], - "state_fips": [6, 6], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert tax_units["tax_unit_id"].tolist() == [100] - assert tax_units["filing_status"].tolist() == ["JOINT"] - assert tax_units["n_dependents"].tolist() == [0] - - def test_build_policyengine_entity_tables_falls_back_when_existing_tax_unit_ids_cross_households( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 20], - "tax_unit_id": [100, 100], - "weight": [1.0, 1.0], - "age": [45, 39], - "income": [60_000.0, 40_000.0], - "relationship_to_head": [0, 0], - "marital_status": [7, 7], - "state_fips": [6, 36], - "tenure": [1, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values("tax_unit_id").reset_index(drop=True) - - assert person_rows["tax_unit_id"].nunique() == 2 - assert tax_units["household_id"].tolist() == [10, 20] - - def test_build_policyengine_entity_tables_partially_preserves_existing_tax_unit_ids( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_prefer_existing_tax_unit_ids=True) - ) - population = pd.DataFrame( - { - "person_id": [1, 2, 3, 4, 5], - "household_id": [10, 10, 10, 20, 20], - "tax_unit_id": [100, 100, 200, np.nan, np.nan], - "weight": [1.0, 1.0, 1.0, 1.0, 1.0], - "age": [45, 43, 12, 38, 8], - "income": [60_000.0, 15_000.0, 0.0, 42_000.0, 0.0], - "relationship_to_head": [0, 1, 2, 0, 2], - "marital_status": [1, 1, 7, 7, 7], - "state_fips": [6, 6, 6, 36, 36], - "tenure": [1, 1, 1, 1, 1], - } - ) - - 
tables = pipeline.build_policyengine_entity_tables(population) - person_rows = tables.persons.sort_values("person_id").reset_index(drop=True) - tax_units = tables.tax_units.sort_values( - ["household_id", "tax_unit_id"] - ).reset_index(drop=True) - - assert person_rows.loc[:2, "tax_unit_id"].tolist() == [100, 100, 200] - hh20_person_tax_units = person_rows.loc[ - person_rows["household_id"] == 20, "tax_unit_id" - ] - assert hh20_person_tax_units.notna().all() - assert hh20_person_tax_units.nunique() == 1 - assert int(hh20_person_tax_units.iloc[0]) > 200 - assert tax_units.loc[ - tax_units["household_id"] == 10, "tax_unit_id" - ].tolist() == [100, 200] - assert tax_units.loc[ - tax_units["household_id"] == 20, "tax_unit_id" - ].tolist() == [201] - - def test_build_from_source_providers_accepts_year_specific_query_keys(self): - households = pd.DataFrame( - { - "household_id": ["1"], - "state_fips": [6], - "household_weight": [1.0], - "year": [2024], - } - ) - persons = pd.DataFrame( - { - "person_id": ["1:1"], - "household_id": ["1"], - "age": [40], - "sex": [1], - "education": [3], - "employment_status": [1], - "income": [50_000.0], - "weight": [1.0], - "year": [2024], - } - ) - - descriptor = SourceDescriptor( - name="toy_source", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - weight_column="household_weight", - period_column="year", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "age", - "sex", - "education", - "employment_status", - "income", - ), - weight_column="weight", - period_column="year", - ), - ), - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="toy_source_2024", - shareability=descriptor.shareability, - time_structure=descriptor.time_structure, - observations=descriptor.observations, - ), - tables={ - 
EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - class YearNamedProvider: - year = 2024 - _descriptor_cache = None - - @property - def descriptor(self): - return self._descriptor_cache or descriptor - - def load_frame(self, query=None): - self.last_query = query - self._descriptor_cache = frame.source - return frame - - provider = YearNamedProvider() - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=1, - synthesis_backend="bootstrap", - ) - ) - - result = pipeline.build_from_source_providers( - [provider], - queries={ - "toy_source_2024": SourceQuery( - provider_filters={"sample_n": 1, "random_seed": 7} - ) - }, - ) - - assert provider.last_query is not None - assert provider.last_query.provider_filters["sample_n"] == 1 - assert result.source_frame is not None - assert result.source_frame.source.name == "toy_source_2024" - - def test_integrate_donor_sources_models_dividends_compositionally( - self, - monkeypatch, - ): - captured: dict[str, object] = {} - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [44, 21], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - 
"qualified_dividend_income": [20.0, 7.0], - "non_qualified_dividend_income": [8.0, 3.0], - "ordinary_dividend_income": [28.0, 10.0], - "dividend_income": [500.0, 200.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "ordinary_dividend_income", - "dividend_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - 
cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - class FakeSynthesizer: - def __init__(self, *args, **kwargs): - _ = args - captured["init_kwargs"] = dict(kwargs) - self.target_vars = kwargs.get("target_vars", []) - - def fit(self, *args, **kwargs): - _ = args - captured["fit_kwargs"] = dict(kwargs) - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if "dividend_income" in self.target_vars: - result["dividend_income"] = [28.0, 10.0] - if "qualified_dividend_share" in self.target_vars: - result["qualified_dividend_share"] = [20.0 / 28.0, 0.7] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_epochs=7, - donor_imputer_batch_size=33, - donor_imputer_learning_rate=5e-4, - donor_imputer_n_layers=3, - donor_imputer_hidden_dim=48, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert integration["integrated_variables"] == [ - "non_qualified_dividend_income", - "qualified_dividend_income", - ] - assert integration["seed_data"]["qualified_dividend_income"].round( - 6 - ).tolist() == [ - 20.0, - 7.0, - ] - assert integration["seed_data"]["non_qualified_dividend_income"].round( - 6 - ).tolist() == [ - 8.0, - 3.0, - ] - assert integration["seed_data"]["ordinary_dividend_income"].round( - 6 - ).tolist() == [ - 28.0, - 10.0, - ] - assert integration["seed_data"]["dividend_income"].round(6).tolist() == [ - 28.0, - 10.0, - ] - assert "qualified_dividend_share" not in integration["seed_data"].columns - assert captured["init_kwargs"]["n_layers"] == 3 - assert 
captured["init_kwargs"]["hidden_dim"] == 48 - assert captured["fit_kwargs"]["epochs"] == 7 - assert captured["fit_kwargs"]["batch_size"] == 33 - assert captured["fit_kwargs"]["learning_rate"] == 5e-4 - - def test_integrate_donor_sources_models_unrelated_tax_variables_in_separate_blocks( - self, - monkeypatch, - ): - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [44, 21], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - "qualified_dividend_income": [20.0, 7.0], - "non_qualified_dividend_income": [8.0, 3.0], - "partnership_s_corp_income": [1_000.0, 200.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - 
child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "partnership_s_corp_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - target_var_calls: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *args, **kwargs): - _ = args - self.target_vars = tuple(kwargs.get("target_vars", [])) - target_var_calls.append(self.target_vars) - - def fit(self, *args, **kwargs): - _ = args - _ = kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("dividend_income", "qualified_dividend_share"): - result["dividend_income"] = [28.0, 10.0] - result["qualified_dividend_share"] = [20.0 / 28.0, 0.7] - if self.target_vars == ("partnership_s_corp_income",): - result["partnership_s_corp_income"] = [1_000.0, 200.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = 
pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert target_var_calls == [ - ("dividend_income", "qualified_dividend_share"), - ("partnership_s_corp_income",), - ] - assert integration["seed_data"]["qualified_dividend_income"].round( - 6 - ).tolist() == [ - 20.0, - 7.0, - ] - assert integration["seed_data"]["non_qualified_dividend_income"].round( - 6 - ).tolist() == [ - 8.0, - 3.0, - ] - assert integration["seed_data"]["partnership_s_corp_income"].round( - 6 - ).tolist() == [ - 1_000.0, - 200.0, - ] - - def test_integrate_donor_sources_can_use_zi_qrf_backend(self, monkeypatch): - captured: dict[str, object] = {} - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [44, 21], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - "public_assistance": [200.0, 0.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - 
EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="benefit_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "public_assistance", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - class FakeQRFImputer: - def __init__(self, **kwargs): - captured["init_kwargs"] = kwargs - - def fit(self, frame, **kwargs): - captured["fit_columns"] = list(frame.columns) - captured["fit_kwargs"] = kwargs - return self - - def generate(self, frame, seed=None): - _ = seed - return frame.assign(public_assistance=[190.0, 10.0]) - - monkeypatch.setattr( - "microplex_us.pipelines.us.ColumnwiseQRFDonorImputer", - FakeQRFImputer, - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - 
calibration_backend="entropy", - donor_imputer_backend="zi_qrf", - donor_imputer_qrf_n_estimators=77, - donor_imputer_qrf_zero_threshold=0.1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert integration["integrated_variables"] == ["public_assistance"] - assert captured["init_kwargs"]["n_estimators"] == 77 - assert captured["init_kwargs"]["zero_threshold"] == 0.1 - assert captured["init_kwargs"]["zero_inflated_vars"] == {"public_assistance"} - assert captured["init_kwargs"]["nonnegative_vars"] == set() - assert "weight" in captured["fit_columns"] - assert captured["fit_kwargs"]["weight_col"] == "weight" - assert set(integration["seed_data"]["public_assistance"].tolist()) <= { - 0.0, - 200.0, - } - - def test_support_sensitive_donor_vars_do_not_force_clamps(self, monkeypatch): - captured: dict[str, dict[str, object]] = {} - - class FakeRegimeAwareDonorImputer: - def __init__(self, **kwargs): - captured["regime_aware"] = kwargs - - class FakeQRFImputer: - def __init__(self, **kwargs): - captured["zi_qrf"] = kwargs - - monkeypatch.setattr( - "microplex_us.pipelines.us.RegimeAwareDonorImputer", - FakeRegimeAwareDonorImputer, - ) - monkeypatch.setattr( - "microplex_us.pipelines.us.ColumnwiseQRFDonorImputer", - FakeQRFImputer, - ) - - target_vars = ("partnership_s_corp_income", "public_assistance") - - regime_pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - donor_imputer_backend="regime_aware", - donor_imputer_qrf_n_estimators=77, - donor_imputer_qrf_max_train_samples=1234, - ) - ) - regime_pipeline._build_donor_imputer( - condition_vars=["age"], - target_vars=target_vars, - ) - - qrf_pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - 
donor_imputer_backend="zi_qrf", - ) - ) - qrf_pipeline._build_donor_imputer( - condition_vars=["age"], - target_vars=target_vars, - ) - - assert "nonnegative_vars" not in captured["regime_aware"] - assert captured["regime_aware"]["n_estimators"] == 77 - assert captured["regime_aware"]["max_train_samples"] == 1234 - assert captured["zi_qrf"]["nonnegative_vars"] == set() - assert captured["zi_qrf"]["zero_inflated_vars"] == { - "partnership_s_corp_income", - "public_assistance", - } - - def test_integrate_donor_sources_preserves_informative_scaffold_values( - self, monkeypatch - ): - cps_households = pd.DataFrame( - { - "household_id": [1], - "hh_weight": [100.0], - "state_fips": [6], - "tenure": [1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "age": [45], - "sex": [1], - "education": [3], - "employment_status": [1], - "income": [60_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101], - "hh_weight": [80.0], - "state_fips": [6], - "tenure": [1], - "household_weight": [999.0], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001], - "household_id": [101], - "age": [44], - "sex": [1], - "education": [3], - "employment_status": [0], - "income": [5.0], - "qualified_dividend_income": [20.0], - "non_qualified_dividend_income": [8.0], - "tax_unit_id": [12345], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - 
relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure", "household_weight"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "tax_unit_id", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - class FakeSynthesizer: - def __init__(self, *args, **kwargs): - _ = args, kwargs - self.target_vars = tuple(kwargs.get("target_vars", [])) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("dividend_income", "qualified_dividend_share"): - result["dividend_income"] = [28.0] - result["qualified_dividend_share"] = [20.0 / 28.0] - if self.target_vars == ("income",): - result["income"] = [5.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(n_synthetic=1, synthesis_backend="bootstrap") - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = 
pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - seed_data["income"] = [60_000.0] - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "household_weight" not in integration["integrated_variables"] - assert "tax_unit_id" not in integration["integrated_variables"] - assert "income" not in integration["integrated_variables"] - assert integration["seed_data"]["income"].tolist() == [60_000.0] - - def test_integrate_donor_sources_allows_authoritative_override_for_shared_irs_variables( - self, monkeypatch - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = kwargs - self.target_vars = tuple(target_vars) - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("self_employment_income",): - result["self_employment_income"] = np.linspace( - -3.0, - 3.0, - len(result), - ) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 110.0, 120.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 28, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 1, 0], - "income": [60_000.0, 25_000.0, 12_000.0], - "self_employment_income": [75.0, 100.0, 50.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 
29, 61], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 1, 0], - "income": [58_000.0, 26_000.0, 13_000.0], - "self_employment_income": [-250.0, 0.0, 500.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "self_employment_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "self_employment_income", - ), - ), - ), - variable_capabilities={ - "self_employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ) - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - 
child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=3, - synthesis_backend="bootstrap", - donor_imputer_authoritative_override_variables=( - "self_employment_income", - ), - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "self_employment_income" in integration["integrated_variables"] - assert captured[-1] == ( - "age", - "education", - "employment_status", - "income", - "sex", - "state_fips", - "tenure", - ) - assert integration["seed_data"]["self_employment_income"].tolist() == [ - -250.0, - 0.0, - 500.0, - ] - - def test_integrate_donor_sources_appends_puf_support_clone_before_later_donors( - self, monkeypatch - ): - generated_lengths: list[tuple[tuple[str, ...], int]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - generated_lengths.append((self.target_vars, len(frame))) - result = frame.copy() - for target in self.target_vars: - result[target] = np.linspace(1.0, float(len(result)), len(result)) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 200.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "sex": [1, 2], - "education": [3, 4], - 
"employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - "self_employment_income": [75.0, 50.0], - "taxpayer_id_type": [1, 2], - } - ) - puf_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - puf_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [44, 61], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - "self_employment_income": [-250.0, 500.0], - "taxable_interest_income": [10.0, 20.0], - "state_income_tax_paid": [400.0, 50.0], - } - ) - sipp_households = pd.DataFrame( - { - "household_id": [201, 202], - "hh_weight": [70.0, 75.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - sipp_persons = pd.DataFrame( - { - "person_id": [2001, 2002], - "household_id": [201, 202], - "age": [45, 62], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 0], - "income": [59_000.0, 14_000.0], - "ssi_reported": [0.0, 100.0], - } - ) - - def frame_for(name, households, persons, capabilities): - return ObservationFrame( - source=SourceDescriptor( - name=name, - shareability=Shareability.PUBLIC - if name.startswith("cps") - else Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=tuple( - column - for column in persons.columns - if column != "person_id" - ), - ), - ), - variable_capabilities={ - variable: SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ) - for variable in capabilities - }, - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - 
parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_overlap_variables=("self_employment_income",), - puf_support_clone_both_halves_override_variables=(), - ) - ) - cps_input = pipeline.prepare_source_input( - frame_for( - "cps_asec_test", cps_households, cps_persons, ("taxpayer_id_type",) - ) - ) - puf_input = pipeline.prepare_source_input( - frame_for( - "irs_soi_puf_2024", - puf_households, - puf_persons, - ( - "self_employment_income", - "taxable_interest_income", - "state_income_tax_paid", - ), - ) - ) - sipp_input = pipeline.prepare_source_input( - frame_for("sipp_2023", sipp_households, sipp_persons, ("ssi_reported",)) - ) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[sipp_input, puf_input], - ) - result = integration["seed_data"] - - assert integration["processed_donor_source_order"] == [ - "irs_soi_puf_2024", - "sipp_2023", - ] - assert integration["puf_clone_source_order"] == ["irs_soi_puf_2024"] - assert result["person_is_puf_clone"].tolist() == [0.0, 0.0, 1.0, 1.0] - assert result["hh_weight"].tolist() == [100.0, 200.0, 0.0, 0.0] - assert result["self_employment_income"].iloc[:2].tolist() == [75.0, 50.0] - assert result["self_employment_income"].iloc[2:].tolist() == [-250.0, 500.0] - assert result["taxpayer_id_type"].tolist() == [1, 2, 1, 2] - assert result["taxable_interest_income"].iloc[:2].tolist() == [0.0, 0.0] - assert result["taxable_interest_income"].iloc[2:].tolist() == [10.0, 20.0] - assert "state_income_tax_paid" in result.columns - assert "tax_unit_id" not in result.columns - assert 
integration["puf_support_clone_summary"][ - "dropped_generated_entity_id_columns" - ] == ["tax_unit_id"] - assert result.index.tolist() == [0, 1, 2, 3] - assert generated_lengths[-1] == (("ssi_reported",), 4) - assert "ssi_reported" in result.columns - - def test_finalize_puf_support_clone_can_collapse_donor_only_values_to_cps_rows( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_both_halves_override_variables=(), - ) - ) - original = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "self_employment_income": [75.0, 50.0], - } - ) - clone = pd.DataFrame( - { - "person_id": [30, 40], - "household_id": [3, 4], - "age": [45, 62], - us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: [0, 1], - "self_employment_income": [-250.0, 500.0], - "taxable_interest_income": [10.0, 20.0], - "partnership_s_corp_income": [-700.0, 1_200.0], - "state_income_tax_paid": [400.0, 50.0], - "tax_unit_id": [101, 102], - } - ) - - result, summary = pipeline._finalize_puf_support_clone_frame( - original=original, - imputed_clone=clone, - donor_source_name="irs_soi_puf_2024", - integrated_variables=[ - "self_employment_income", - "taxable_interest_income", - "partnership_s_corp_income", - "state_income_tax_paid", - "tax_unit_id", - ], - preclone_columns=set(original.columns), - donor_seed_columns=set(clone.columns), - donor_observed=set(clone.columns), - ) - - assert result.index.tolist() == [0, 1] - assert result["person_is_puf_clone"].tolist() == [0.0, 0.0] - assert result["person_id"].tolist() == [10, 20] - assert result["household_id"].tolist() == [1, 2] - assert result["self_employment_income"].tolist() == [-250.0, 500.0] - assert result["taxable_interest_income"].tolist() == [10.0, 20.0] - assert result["partnership_s_corp_income"].tolist() == [-700.0, 1_200.0] - assert 
result["state_income_tax_paid"].tolist() == [400.0, 50.0] - assert "tax_unit_id" not in result.columns - assert summary["output_mode"] == "collapse_to_scaffold" - assert summary["clone_row_count"] == 2 - assert summary["emitted_clone_row_count"] == 0 - assert summary["final_row_count"] == 2 - assert summary["dropped_generated_entity_id_columns"] == ["tax_unit_id"] - assert summary["collapse_copy_variables"] == [ - "partnership_s_corp_income", - "self_employment_income", - "state_income_tax_paid", - "taxable_interest_income", - ] - assert summary["overlap_collapse_override_variables"] == [ - "self_employment_income", - ] - assert summary["source_row_alignment"] == { - "enabled": True, - "column": us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, - "row_count": 2, - "clone_was_reordered": False, - } - - def test_finalize_puf_support_clone_keeps_cps_measured_income_totals( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_both_halves_override_variables=(), - ) - ) - original = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [70, 45], - "employment_income": [30_000.0, 10_000.0], - "wage_income": [40_000.0, 12_000.0], - "social_security": [18_000.0, 0.0], - "social_security_retirement": [18_000.0, 0.0], - "social_security_disability": [0.0, 0.0], - } - ) - clone = pd.DataFrame( - { - "person_id": [30, 40], - "household_id": [3, 4], - "age": [70, 45], - us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: [0, 1], - "employment_income": [90_000.0, 20_000.0], - "employment_income_before_lsr": [120_000.0, 45_000.0], - "social_security": [60_000.0, 25_000.0], - "social_security_retirement": [1_000.0, 0.0], - "social_security_disability": [1_000.0, 500.0], - } - ) - - result, summary = pipeline._finalize_puf_support_clone_frame( - original=original, - imputed_clone=clone, - 
donor_source_name="irs_soi_puf_2024", - integrated_variables=[ - "employment_income", - "employment_income_before_lsr", - "social_security", - ], - preclone_columns=set(original.columns), - donor_seed_columns=set(clone.columns), - donor_observed=set(clone.columns), - ) - - assert result["employment_income"].tolist() == [90_000.0, 20_000.0] - assert "employment_income_before_lsr" not in result.columns - assert result["social_security"].tolist() == [18_000.0, 0.0] - assert result["social_security_retirement"].tolist() == [18_000.0, 0.0] - assert result["social_security_disability"].tolist() == [0.0, 0.0] - - augmented = pipeline._augment_policyengine_person_inputs(result) - - assert augmented["employment_income_before_lsr"].tolist() == [ - 90_000.0, - 20_000.0, - ] - assert augmented["social_security_retirement"].tolist() == [18_000.0, 0.0] - assert "social_security" not in summary["collapse_copy_variables"] - assert "social_security" not in summary["overlap_collapse_override_variables"] - assert summary["donor_only_collapse_excluded_variables"] == [ - "employment_income_before_lsr" - ] - - def test_finalize_puf_support_clone_preserves_puf_tax_details_by_default( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_both_halves_override_variables=(), - ) - ) - original = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "employment_income": [30_000.0, 10_000.0], - "self_employment_income": [500.0, 250.0], - "long_term_capital_gains": [1_000.0, 0.0], - "short_term_capital_gains": [100.0, 0.0], - "capital_gains": [1_100.0, 0.0], - "interest_income": [3.0, 4.0], - # Regression coverage for preclone components: these may exist on - # the CPS scaffold already, but PUF-integrated leaves must still - # survive collapse back to the scaffold rows. 
- "taxable_interest_income": [3.0, 4.0], - "tax_exempt_interest_income": [3.0, 4.0], - "dividend_income": [10.0, 5.0], - "qualified_dividend_income": [10.0, 5.0], - "non_qualified_dividend_income": [10.0, 5.0], - "pension_income": [100.0, 200.0], - "taxable_pension_income": [100.0, 200.0], - "tax_exempt_pension_income": [100.0, 200.0], - "unemployment_compensation": [100.0, 0.0], - "taxable_unemployment_compensation": [100.0, 0.0], - } - ) - clone = pd.DataFrame( - { - "person_id": [30, 40], - "household_id": [3, 4], - "age": [45, 62], - us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: [0, 1], - "employment_income": [90_000.0, 20_000.0], - "self_employment_income": [-4_000.0, 8_000.0], - "long_term_capital_gains": [50_000.0, -1_000.0], - "short_term_capital_gains": [2_500.0, -500.0], - "capital_gains": [52_500.0, -1_500.0], - "taxable_interest_income": [1_000.0, 0.0], - "tax_exempt_interest_income": [500.0, 0.0], - "qualified_dividend_income": [20.0, 0.0], - "non_qualified_dividend_income": [5.0, 0.0], - "ordinary_dividend_income": [25.0, 0.0], - "dividend_income": [25.0, 0.0], - "taxable_pension_income": [90.0, 0.0], - "tax_exempt_pension_income": [10.0, 0.0], - "taxable_unemployment_compensation": [600.0, 700.0], - } - ) - - result, summary = pipeline._finalize_puf_support_clone_frame( - original=original, - imputed_clone=clone, - donor_source_name="irs_soi_puf_2024", - integrated_variables=[ - "taxable_interest_income", - "tax_exempt_interest_income", - "employment_income", - "self_employment_income", - "long_term_capital_gains", - "short_term_capital_gains", - "capital_gains", - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_pension_income", - "tax_exempt_pension_income", - "taxable_unemployment_compensation", - ], - preclone_columns=set(original.columns), - donor_seed_columns=set(clone.columns), - donor_observed=set(clone.columns), - ) - - assert result["employment_income"].tolist() == [90_000.0, 20_000.0] - assert 
result["self_employment_income"].tolist() == [-4_000.0, 8_000.0] - assert result["long_term_capital_gains"].tolist() == [50_000.0, -1_000.0] - assert result["short_term_capital_gains"].tolist() == [2_500.0, -500.0] - assert result["capital_gains"].tolist() == [52_500.0, -1_500.0] - assert result["taxable_interest_income"].tolist() == [1_000.0, 0.0] - assert result["tax_exempt_interest_income"].tolist() == [500.0, 0.0] - assert result["interest_income"].tolist() == [1_500.0, 0.0] - assert result["taxable_unemployment_compensation"].tolist() == [600.0, 700.0] - assert result["unemployment_compensation"].tolist() == [600.0, 700.0] - assert result["dividend_income"].tolist() == [25.0, 0.0] - assert result["ordinary_dividend_income"].tolist() == [25.0, 0.0] - assert result["qualified_dividend_income"].tolist() == [20.0, 0.0] - assert result["non_qualified_dividend_income"].tolist() == [5.0, 0.0] - assert result["taxable_pension_income"].tolist() == [90.0, 0.0] - assert result["tax_exempt_pension_income"].tolist() == [10.0, 0.0] - assert result["pension_income"].tolist() == [100.0, 0.0] - passthrough = summary["cps_measured_total_passthrough"] - assert passthrough["enabled"] is False - assert passthrough["passthrough_variables"] == [] - assert passthrough["dividend_components_scaled_to_cps_total"] is False - assert set(passthrough["identity_reconciled_variables"]) >= { - "dividend_income", - "interest_income", - "ordinary_dividend_income", - "pension_income", - "unemployment_compensation", - } - assert set(summary["collapse_copy_variables"]) >= { - "dividend_income", - "employment_income", - "interest_income", - "long_term_capital_gains", - "non_qualified_dividend_income", - "ordinary_dividend_income", - "pension_income", - "qualified_dividend_income", - "self_employment_income", - "short_term_capital_gains", - "tax_exempt_interest_income", - "tax_exempt_pension_income", - "taxable_interest_income", - "taxable_pension_income", - "taxable_unemployment_compensation", - 
"unemployment_compensation", - } - assert set(summary["overlap_collapse_override_variables"]) >= { - "capital_gains", - "employment_income", - "long_term_capital_gains", - "self_employment_income", - "short_term_capital_gains", - "tax_exempt_interest_income", - "tax_exempt_pension_income", - "taxable_interest_income", - "taxable_pension_income", - "taxable_unemployment_compensation", - } - assert summary["source_row_alignment"]["clone_was_reordered"] is False - - def test_finalize_puf_support_clone_aligns_shuffled_clone_by_source_row_id( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_both_halves_override_variables=(), - ) - ) - original = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "self_employment_income": [75.0, 50.0], - } - ) - clone = pd.DataFrame( - { - "person_id": [40, 30], - "household_id": [4, 3], - "age": [62, 45], - us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: [1, 0], - "self_employment_income": [500.0, -250.0], - } - ) - - result, summary = pipeline._finalize_puf_support_clone_frame( - original=original, - imputed_clone=clone, - donor_source_name="irs_soi_puf_2024", - integrated_variables=["self_employment_income"], - preclone_columns=set(original.columns), - donor_seed_columns=set(clone.columns), - donor_observed=set(clone.columns), - ) - - assert result["person_id"].tolist() == [10, 20] - assert result["self_employment_income"].tolist() == [-250.0, 500.0] - assert summary["source_row_alignment"] == { - "enabled": True, - "column": us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN, - "row_count": 2, - "clone_was_reordered": True, - } - - def test_finalize_puf_support_clone_can_scale_tax_details_to_cps_totals( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - 
puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_both_halves_override_variables=(), - puf_support_clone_scale_tax_details_to_cps_totals=True, - ) - ) - original = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "interest_income": [3.0, 4.0], - "taxable_interest_income": [3.0, 4.0], - "tax_exempt_interest_income": [3.0, 4.0], - "dividend_income": [10.0, 5.0], - "qualified_dividend_income": [10.0, 5.0], - "non_qualified_dividend_income": [10.0, 5.0], - "pension_income": [100.0, 200.0], - "taxable_pension_income": [100.0, 200.0], - "tax_exempt_pension_income": [100.0, 200.0], - "unemployment_compensation": [100.0, 0.0], - "taxable_unemployment_compensation": [100.0, 0.0], - } - ) - clone = pd.DataFrame( - { - "person_id": [30, 40], - "household_id": [3, 4], - "age": [45, 62], - us_pipeline_module.PUF_SUPPORT_CLONE_SOURCE_ROW_ID_COLUMN: [0, 1], - "taxable_interest_income": [1_000.0, 0.0], - "tax_exempt_interest_income": [500.0, 0.0], - "qualified_dividend_income": [20.0, 0.0], - "non_qualified_dividend_income": [5.0, 0.0], - "ordinary_dividend_income": [25.0, 0.0], - "dividend_income": [25.0, 0.0], - "taxable_pension_income": [90.0, 0.0], - "tax_exempt_pension_income": [10.0, 0.0], - "taxable_unemployment_compensation": [600.0, 700.0], - } - ) - - result, summary = pipeline._finalize_puf_support_clone_frame( - original=original, - imputed_clone=clone, - donor_source_name="irs_soi_puf_2024", - integrated_variables=[ - "taxable_interest_income", - "tax_exempt_interest_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_pension_income", - "tax_exempt_pension_income", - "taxable_unemployment_compensation", - ], - preclone_columns=set(original.columns), - donor_seed_columns=set(clone.columns), - donor_observed=set(clone.columns), - ) - - assert result["taxable_interest_income"].round(6).tolist() == [2.0, 2.72] - assert 
result["tax_exempt_interest_income"].round(6).tolist() == [1.0, 1.28] - assert result["interest_income"].round(6).tolist() == [3.0, 4.0] - assert result["taxable_unemployment_compensation"].tolist() == [100.0, 0.0] - assert result["unemployment_compensation"].tolist() == [100.0, 0.0] - assert result["dividend_income"].tolist() == [10.0, 5.0] - assert result["ordinary_dividend_income"].tolist() == [10.0, 5.0] - assert result["qualified_dividend_income"].round(6).tolist() == [ - 8.0, - 3.9, - ] - assert result["non_qualified_dividend_income"].round(6).tolist() == [ - 2.0, - 1.1, - ] - assert result["taxable_pension_income"].round(6).tolist() == [ - 90.0, - 118.0, - ] - assert result["tax_exempt_pension_income"].round(6).tolist() == [ - 10.0, - 82.0, - ] - assert result["pension_income"].round(6).tolist() == [100.0, 200.0] - passthrough = summary["cps_measured_total_passthrough"] - assert passthrough["enabled"] is True - assert passthrough["passthrough_variables"] == [ - "non_qualified_dividend_income", - "qualified_dividend_income", - "tax_exempt_interest_income", - "tax_exempt_pension_income", - "taxable_interest_income", - "taxable_pension_income", - "taxable_unemployment_compensation", - ] - assert passthrough["dividend_components_scaled_to_cps_total"] is True - assert set(passthrough["identity_reconciled_variables"]) >= { - "dividend_income", - "interest_income", - "ordinary_dividend_income", - "pension_income", - "unemployment_compensation", - } - - def test_integrate_donor_sources_collapses_puf_support_clone_before_later_donors( - self, monkeypatch - ): - generated_lengths: list[tuple[tuple[str, ...], int]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - generated_lengths.append((self.target_vars, len(frame))) - result = frame.copy() - 
for target in self.target_vars: - result[target] = np.linspace(1.0, float(len(result)), len(result)) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - def frame_for(name, households, persons, capabilities): - return ObservationFrame( - source=SourceDescriptor( - name=name, - shareability=Shareability.PUBLIC - if name.startswith("cps") - else Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=tuple( - column - for column in persons.columns - if column != "person_id" - ), - ), - ), - variable_capabilities={ - variable: SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ) - for variable in capabilities - }, - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 200.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 62], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - "self_employment_income": [75.0, 50.0], - "taxpayer_id_type": [1, 2], - } - ) - puf_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - puf_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": 
[44, 61], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - "self_employment_income": [-250.0, 500.0], - "taxable_interest_income": [10.0, 20.0], - "state_income_tax_paid": [400.0, 50.0], - } - ) - sipp_households = pd.DataFrame( - { - "household_id": [201, 202], - "hh_weight": [70.0, 75.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - sipp_persons = pd.DataFrame( - { - "person_id": [2001, 2002], - "household_id": [201, 202], - "age": [45, 62], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 0], - "income": [59_000.0, 14_000.0], - "ssi_reported": [0.0, 100.0], - } - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="seed", - puf_support_clone_enabled=True, - puf_support_clone_output_mode="collapse_to_scaffold", - puf_support_clone_overlap_variables=("self_employment_income",), - puf_support_clone_both_halves_override_variables=(), - ) - ) - cps_input = pipeline.prepare_source_input( - frame_for( - "cps_asec_test", cps_households, cps_persons, ("taxpayer_id_type",) - ) - ) - puf_input = pipeline.prepare_source_input( - frame_for( - "irs_soi_puf_2024", - puf_households, - puf_persons, - ( - "self_employment_income", - "taxable_interest_income", - "state_income_tax_paid", - ), - ) - ) - sipp_input = pipeline.prepare_source_input( - frame_for("sipp_2023", sipp_households, sipp_persons, ("ssi_reported",)) - ) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[sipp_input, puf_input], - ) - result = integration["seed_data"] - - assert result.index.tolist() == [0, 1] - assert result["person_is_puf_clone"].tolist() == [0.0, 0.0] - assert result["hh_weight"].tolist() == [100.0, 200.0] - assert result["self_employment_income"].tolist() == [-250.0, 500.0] - assert result["taxable_interest_income"].tolist() == [10.0, 20.0] 
- assert sorted(result["state_income_tax_paid"].tolist()) == [50.0, 400.0] - assert integration["puf_support_clone_summary"]["output_mode"] == ( - "collapse_to_scaffold" - ) - assert integration["puf_support_clone_summary"]["final_row_count"] == 2 - assert integration["puf_support_clone_summary"]["emitted_clone_row_count"] == 0 - assert generated_lengths[-1] == (("ssi_reported",), 2) - - def test_puf_support_clone_refresh_rematches_cps_only_disability_to_puf_income( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - ) - ) - original = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [1, 2], - "age": [40, 40], - "is_male": [1, 1], - "state_fips": [6, 6], - "employment_income": [0.0, 100_000.0], - "self_employment_income": [0.0, 0.0], - "social_security": [0.0, 0.0], - "is_disabled": [1, 0], - "difficulty_hearing": [1, 0], - "meets_ssi_disability_criteria": [1, 0], - } - ) - clone = original.copy() - clone["employment_income"] = [100_000.0, 0.0] - - refreshed, summary = pipeline._refresh_puf_support_clone_cps_only_fields( - original=original, - clone=clone, - integrated_variables=["employment_income"], - preclone_columns=set(original.columns), - ) - - assert refreshed["is_disabled"].tolist() == [0, 1] - assert refreshed["difficulty_hearing"].tolist() == [0, 1] - assert refreshed["meets_ssi_disability_criteria"].tolist() == [0, 1] - assert "employment_income" in summary["condition_variables"] - assert summary["matched_source_row_count"] == 2 - assert "is_disabled" in summary["refreshed_variables"] - - def test_puf_support_clone_refresh_does_not_overwrite_amount_fields(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - ) - ) - original = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [1, 2], - "age": [40, 40], - "is_male": [1, 1], - "state_fips": [6, 6], - "employment_income": [0.0, 
100_000.0], - "self_employment_income": [0.0, 0.0], - "social_security": [0.0, 0.0], - "is_disabled": [1, 0], - "disability_benefits": [4_000.0, 0.0], - "weekly_hours_worked": [0.0, 40.0], - "taxable_401k_distributions": [0.0, 2_000.0], - } - ) - clone = original.copy() - clone["employment_income"] = [100_000.0, 0.0] - clone["disability_benefits"] = [123_456.0, 789_012.0] - clone["weekly_hours_worked"] = [12.0, 34.0] - clone["taxable_401k_distributions"] = [56.0, 78.0] - - refreshed, summary = pipeline._refresh_puf_support_clone_cps_only_fields( - original=original, - clone=clone, - integrated_variables=["employment_income"], - preclone_columns=set(original.columns), - ) - - assert refreshed["is_disabled"].tolist() == [0, 1] - assert refreshed["disability_benefits"].tolist() == [123_456.0, 789_012.0] - assert refreshed["weekly_hours_worked"].tolist() == [12.0, 34.0] - assert refreshed["taxable_401k_distributions"].tolist() == [56.0, 78.0] - assert "is_disabled" in summary["refreshed_variables"] - assert "disability_benefits" not in summary["refreshed_variables"] - assert "weekly_hours_worked" not in summary["refreshed_variables"] - assert "taxable_401k_distributions" not in summary["refreshed_variables"] - - def test_puf_support_clone_refresh_reconciles_social_security_subcomponents( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - ) - ) - clone = pd.DataFrame( - { - "age": [45, 70, 40], - "social_security": [12_000.0, 8_000.0, 0.0], - "social_security_retirement": [0.0, 2_000.0, 100.0], - "social_security_disability": [3_000.0, 0.0, 50.0], - } - ) - - reconciled = pipeline._reconcile_puf_support_clone_social_security(clone) - - assert reconciled == [ - "social_security_retirement", - "social_security_disability", - ] - assert clone["social_security_disability"].tolist() == [12_000.0, 0.0, 0.0] - assert clone["social_security_retirement"].tolist() == [0.0, 8_000.0, 0.0] - - def 
test_integrate_donor_sources_puf_support_clone_validates_scaffold_and_donor( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - synthesis_backend="seed", - puf_support_clone_enabled=True, - ) - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="cps_asec_test", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=("household_id", "age", "income"), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: pd.DataFrame( - {"household_id": [1], "hh_weight": [1.0], "state_fips": [6]} - ), - EntityType.PERSON: pd.DataFrame( - { - "person_id": [1], - "household_id": [1], - "age": [40], - "income": [1.0], - } - ), - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - cps_input = pipeline.prepare_source_input(frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - with pytest.raises(ValueError, match="requires exactly one PUF donor"): - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[], - ) - - def test_integrate_donor_sources_zeroes_minor_employment_income_after_authoritative_override( - self, monkeypatch - ): - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("employment_income",): - result["employment_income"] = np.linspace(1.0, 2.0, len(result)) 
- return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [16, 35], - "sex": [1, 2], - "education": [1, 3], - "employment_status": [0, 1], - "income": [5_000.0, 55_000.0], - "employment_income": [500.0, 40_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [90.0, 110.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [17, 36], - "sex": [1, 2], - "education": [1, 3], - "employment_status": [0, 1], - "income": [6_000.0, 56_000.0], - "employment_income": [50_000.0, 80_000.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - 
observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - ), - ), - ), - variable_capabilities={ - "employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=False, - ) - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - donor_imputer_authoritative_override_variables=("employment_income",), - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "employment_income" in integration["integrated_variables"] - assert integration["seed_data"]["employment_income"].tolist() == [ - 0.0, - 80_000.0, - ] - - def test_integrate_donor_sources_zeroes_retired_senior_employment_income_without_esi( - self, monkeypatch - ): - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("employment_income",): - 
result["employment_income"] = np.linspace( - 70_000.0, 90_000.0, len(result) - ) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 130.0], - "state_fips": [6, 36, 48], - "tenure": [1, 2, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [68, 68, 68], - "sex": [1, 2, 1], - "education": [3, 3, 3], - "employment_status": [1, 1, 1], - "income": [45_000.0, 65_000.0, 50_000.0], - "employment_income": [30_000.0, 40_000.0, 35_000.0], - "social_security_retirement": [18_000.0, 18_000.0, 0.0], - "has_esi": [0.0, 1.0, 0.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [90.0, 110.0, 105.0], - "state_fips": [6, 36, 48], - "tenure": [1, 2, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [68, 68, 68], - "sex": [1, 2, 1], - "education": [3, 3, 3], - "employment_status": [1, 1, 1], - "income": [46_000.0, 66_000.0, 51_000.0], - "employment_income": [80_000.0, 85_000.0, 82_000.0], - "social_security_retirement": [19_000.0, 19_000.0, 0.0], - "has_esi": [0.0, 1.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - "social_security_retirement", - "has_esi", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: 
cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - "social_security_retirement", - "has_esi", - ), - ), - ), - variable_capabilities={ - "employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=False, - ) - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=3, - synthesis_backend="bootstrap", - donor_imputer_authoritative_override_variables=("employment_income",), - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "employment_income" in integration["integrated_variables"] - employment_income = integration["seed_data"]["employment_income"].tolist() - assert 
employment_income[0] == 0.0 - assert employment_income[1] > 0.0 - assert employment_income[2] > 0.0 - - def test_integrate_donor_sources_normalizes_social_security_before_senior_wage_guard( - self, monkeypatch - ): - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - if self.target_vars == ("employment_income",): - result["employment_income"] = [70_000.0, 90_000.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [68, 68], - "sex": [1, 2], - "education": [3, 3], - "employment_status": [1, 1], - "income": [45_000.0, 65_000.0], - "employment_income": [30_000.0, 40_000.0], - "social_security": [18_000.0, 0.0], - "has_esi": [0.0, 0.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [90.0, 110.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [68, 68], - "sex": [1, 2], - "education": [3, 3], - "employment_status": [1, 1], - "income": [46_000.0, 66_000.0], - "employment_income": [80_000.0, 85_000.0], - "social_security": [19_000.0, 0.0], - "has_esi": [0.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - 
weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - "social_security", - "has_esi", - ), - ), - ), - variable_capabilities={ - "employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=False, - ) - }, - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - "social_security", - "has_esi", - ), - ), - ), - variable_capabilities={ - "employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=False, - ) - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - 
donor_imputer_authoritative_override_variables=("employment_income",), - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert integration["seed_data"]["social_security_retirement"].tolist() == [ - 0.0, - 0.0, - ] - assert integration["seed_data"]["social_security_unclassified"].tolist() == [ - 18_000.0, - 0.0, - ] - assert integration["seed_data"]["employment_income"].tolist() == [0.0, 85_000.0] - - def test_export_policyengine_dataset(self, persons, households, tmp_path): - config = USMicroplexBuildConfig( - n_synthetic=8, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - ) - result = build_us_microplex(persons, households, config) - pipeline = USMicroplexPipeline(config) - - output_path = pipeline.export_policyengine_dataset( - result, tmp_path / "us_microplex.h5" - ) - - assert output_path.exists() - with h5py.File(output_path, "r") as handle: - assert "county_fips" in handle - exported_counties = handle["county_fips"]["2024"][()] - normalized_counties = { - str(value.decode() if isinstance(value, bytes) else value).zfill(5) - for value in np.asarray(exported_counties).tolist() - } - assert normalized_counties == {"06037", "36061", "48201"} - - def test_export_policyengine_dataset_passes_direct_overrides( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - captured: list[tuple[str, ...]] = [] - - original_build_maps = build_policyengine_us_export_variable_maps - - def _capture_build_maps(*args, **kwargs): - captured.append(tuple(kwargs.get("direct_override_variables", ()))) - return original_build_maps(*args, **kwargs) - - monkeypatch.setattr( - "microplex_us.pipelines.us.build_policyengine_us_export_variable_maps", - 
_capture_build_maps, - ) - - config = USMicroplexBuildConfig( - n_synthetic=8, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - policyengine_direct_override_variables=("filing_status",), - ) - result = build_us_microplex(persons, households, config) - pipeline = USMicroplexPipeline(config) - - output_path = pipeline.export_policyengine_dataset( - result, tmp_path / "us_microplex.h5" - ) - - assert output_path.exists() - assert captured == [("filing_status",)] - - def test_export_policyengine_dataset_normalizes_checkpoint_person_inputs( - self, - tmp_path, - monkeypatch, - ): - captured_persons: list[pd.DataFrame] = [] - - def _identity_marketplace_ratio(self, tables, *, target_period): - return tables - - def _fake_build_maps(tables, **kwargs): - captured_persons.append(tables.persons.copy()) - return { - "household": {}, - "person": { - "rental_income": "rental_income", - "farm_income": "farm_income", - }, - "tax_unit": {}, - "spm_unit": {}, - "family": {}, - } - - def _fake_arrays(*args, **kwargs): - return {} - - def _fake_write(arrays, path, **kwargs): - Path(path).write_text("h5 placeholder") - return Path(path) - - monkeypatch.setattr( - USMicroplexPipeline, - "_attach_policyengine_marketplace_plan_benchmark_ratio", - _identity_marketplace_ratio, - ) - monkeypatch.setattr( - USMicroplexPipeline, - "_resolve_policyengine_tax_benefit_system", - lambda self: SimpleNamespace(variables={}), - ) - monkeypatch.setattr( - us_pipeline_module, - "build_policyengine_us_export_variable_maps", - _fake_build_maps, - ) - monkeypatch.setattr( - us_pipeline_module, - "resolve_policyengine_excluded_export_variables", - lambda *args, **kwargs: set(), - ) - monkeypatch.setattr( - us_pipeline_module, - "build_policyengine_us_time_period_arrays", - _fake_arrays, - ) - monkeypatch.setattr( - us_pipeline_module, - "write_policyengine_us_time_period_dataset", - _fake_write, - ) - - config = 
USMicroplexBuildConfig(policyengine_dataset_year=2024) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), - persons=pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 1, 1], - "age": [45, 50, 55], - "sex": [1, 2, 1], - "income": [1_000.0, 1_000.0, 1_000.0], - "rental_income": [900.0, 900.0, 900.0], - "rental_income_positive": [300.0, 0.0, 50.0], - "rental_income_negative": [100.0, 200.0, 0.0], - "farm_income": [20.0, 30.0, 40.0], - "farm_operations_income": [10.0, -15.0, 0.0], - } - ), - ) - result = USMicroplexBuildResult( - config=config, - seed_data=pd.DataFrame(), - synthetic_data=pd.DataFrame(), - calibrated_data=pd.DataFrame(), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={}, - policyengine_tables=tables, - ) - - output_path = USMicroplexPipeline(config).export_policyengine_dataset( - result, - tmp_path / "us_microplex.h5", - ) - - assert output_path.exists() - assert captured_persons[0]["rental_income"].tolist() == [ - 200.0, - -200.0, - 50.0, - ] - assert captured_persons[0]["farm_income"].tolist() == [10.0, -15.0, 40.0] - - def test_augment_policyengine_person_inputs_materializes_non_sch_d_capital_gains( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "non_sch_d_capital_gains": [250.0], - "age": [45], - "sex": [1], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["non_sch_d_capital_gains"].tolist() == [250.0] - - def test_augment_policyengine_person_inputs_aliases_rent_to_pre_subsidy_rent( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "rent": [14_400.0, 0.0, 9_600.0], - "pre_subsidy_rent": [0.0, 7_200.0, None], - "age": [45, 70, 12], - "sex": [1, 2, 1], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert 
augmented["pre_subsidy_rent"].tolist() == [ - 14_400.0, - 7_200.0, - 9_600.0, - ] - - def test_augment_policyengine_person_inputs_recomposes_signed_rental_income( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50, 55], - "sex": [1, 2, 1], - "income": [1_000.0, 1_000.0, 1_000.0], - "rental_income": [900.0, 900.0, 900.0], - "rental_income_positive": [300.0, 0.0, 50.0], - "rental_income_negative": [100.0, 200.0, 0.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["rental_income"].tolist() == [200.0, -200.0, 50.0] - assert augmented["employment_income_before_lsr"].tolist() == [ - 800.0, - 1_200.0, - 950.0, - ] - - def test_augment_policyengine_person_inputs_preserves_zero_employment_income( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50], - "sex": [1, 2], - "income": [50_000.0, 75_000.0], - "employment_income": [0.0, 20_000.0], - "wage_income": [50_000.0, 25_000.0], - "self_employment_income": [0.0, 0.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["employment_income_before_lsr"].tolist() == [ - 0.0, - 20_000.0, - ] - - def test_augment_policyengine_person_inputs_prefers_signed_business_losses( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50, 55], - "sex": [1, 2, 1], - "income": [1_000.0, 1_000.0, 1_000.0], - "self_employment_income_before_lsr": [50.0, 60.0, 70.0], - "self_employment_income": [100.0, -25.0, 0.0], - "farm_income": [20.0, 30.0, 40.0], - "farm_operations_income": [10.0, -15.0, 0.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["self_employment_income_before_lsr"].tolist() == [ - 100.0, - -25.0, - 70.0, - ] - assert augmented["farm_income"].tolist() == [10.0, -15.0, 40.0] - assert 
augmented["employment_income_before_lsr"].tolist() == [ - 900.0, - 1_025.0, - 930.0, - ] - - def test_augment_policyengine_person_inputs_zeros_part_b_without_medicare( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "medicare_part_b_premiums": [100.0, 200.0, -30.0, 400.0], - "has_medicare": [0, 1, 0, 1], - "age": [12, 70, 45, 58], - "sex": [1, 2, 1, 2], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["has_medicare"].tolist() == [False, True, False, True] - assert augmented["medicare_part_b_premiums"].tolist() == [ - 0.0, - 200.0, - 0.0, - 400.0, - ] - - def test_augment_policyengine_person_inputs_derives_blind_flag(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "difficulty_seeing": [0, 1, None, 2], - "age": [30, 45, 70, 12], - "sex": [1, 2, 1, 2], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["is_blind"].tolist() == [False, True, False, True] - - def test_augment_policyengine_person_inputs_uses_reported_ssi_for_takeup_only( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "ssi": [500.0, 0.0, 200.0], - "ssi_reported": [0.0, 100.0, 0.0], - "age": [70, 45, 34], - "sex": [1, 2, 1], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["ssi"].tolist() == [500.0, 0.0, 200.0] - assert augmented["ssi_reported"].tolist() == [0.0, 100.0, 0.0] - assert augmented["takes_up_ssi_if_eligible"].tolist() == [ - False, - True, - False, - ] - - def test_augment_policyengine_person_inputs_normalizes_explicit_ssi_takeup( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "takes_up_ssi_if_eligible": [1, 0, None, 2], - "ssi_reported": [0.0, 100.0, 100.0, 0.0], - "age": [70, 45, 34, 60], - "sex": [1, 2, 1, 2], - } - ) - - augmented 
= pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["takes_up_ssi_if_eligible"].tolist() == [ - True, - False, - False, - True, - ] - - def test_calibrate_policyengine_ssi_takeup_uses_reported_amounts_by_age( - self, - monkeypatch, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - policyengine_dataset_year=2024, - policyengine_calibration_target_profile="pe_native_broad", - ) - ) - persons = pd.DataFrame( - { - "person_id": [1, 2, 3, 4], - "household_id": [10, 20, 30, 40], - "age": [70, 70, 40, 40], - "weight": [1.0, 1.0, 1.0, 1.0], - "ssi": [100.0, 0.0, 100.0, 0.0], - "takes_up_ssi_if_eligible": [True, False, True, False], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20, 30, 40], - "household_weight": [1.0, 1.0, 1.0, 1.0], - } - ), - persons=persons, - tax_units=pd.DataFrame({"tax_unit_id": [1], "household_id": [10]}), - spm_units=pd.DataFrame({"spm_unit_id": [1], "household_id": [10]}), - families=pd.DataFrame({"family_id": [1], "household_id": [10]}), - ) - - def fake_materialize(tables_arg, **kwargs): - assert kwargs["variables"] == ("ssi",) - assert tables_arg.persons["takes_up_ssi_if_eligible"].all() - materialized_persons = tables_arg.persons.copy() - materialized_persons["ssi"] = [80.0, 20.0, 20.0, 80.0] - return PolicyEngineUSVariableMaterializationResult( - tables=PolicyEngineUSEntityTableBundle( - households=tables_arg.households, - persons=materialized_persons, - tax_units=tables_arg.tax_units, - spm_units=tables_arg.spm_units, - families=tables_arg.families, - marital_units=tables_arg.marital_units, - ), - bindings={ - "ssi": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="ssi", - ) - }, - materialized_variables=("ssi",), - ) - - monkeypatch.setattr( - us_pipeline_module, - "materialize_policyengine_us_variables_safely", - fake_materialize, - ) - - updated_tables, summary = ( - 
pipeline._calibrate_policyengine_ssi_takeup_from_reported_amounts( - tables, - target_period=2024, - ) - ) - - assert updated_tables.persons["takes_up_ssi_if_eligible"].tolist() == [ - True, - True, - True, - True, - ] - assert summary["enabled"] is True - assert summary["reported_amount"] == 200.0 - assert summary["selected_amount"] == 200.0 - - def test_augment_policyengine_person_inputs_materializes_agi_parity_inputs(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "estate_income": [22.0], - "farm_operations_income": [120.0], - "farm_rent_income": [35.0], - "health_savings_account_ald": [20.0], - "self_employed_health_insurance_ald": [15.0], - "self_employed_pension_contribution_ald": [10.0], - "age": [45], - "sex": [1], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["estate_income"].tolist() == [22.0] - assert augmented["farm_operations_income"].tolist() == [120.0] - assert augmented["farm_rent_income"].tolist() == [35.0] - assert augmented["health_savings_account_ald"].tolist() == [20.0] - assert augmented["self_employed_health_insurance_ald"].tolist() == [15.0] - assert augmented["self_employed_pension_contribution_ald"].tolist() == [10.0] - - def test_augment_policyengine_person_inputs_materializes_export_support_aliases( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50], - "sex": [1, 2], - "w2_wages_from_qualified_business": [1_000.0, 0.0], - "unadjusted_basis_qualified_property": [10_000.0, 0.0], - "business_is_sstb": [1, 0], - "sstb_self_employment_income": [300.0, 0.0], - "sstb_w2_wages_from_qualified_business": [200.0, 0.0], - "sstb_unadjusted_basis_qualified_property": [2_000.0, 0.0], - "self_employment_income_would_be_qualified": [1, 0], - "sstb_self_employment_income_would_be_qualified": [1, 0], - "qualified_reit_and_ptp_income": [75.0, 0.0], - "qualified_bdc_income": [25.0, 0.0], - 
"deductible_mortgage_interest": [900.0, 0.0], - "investment_income_elected_form_4952": [40.0, 0.0], - "health_insurance_premiums_without_medicare_part_b": [120.0, 0.0], - "hours_worked": [37.5, 0.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["w2_wages_from_qualified_business"].tolist() == [1_000.0, 0.0] - assert augmented["unadjusted_basis_qualified_property"].tolist() == [ - 10_000.0, - 0.0, - ] - assert augmented["business_is_sstb"].tolist() == [True, False] - assert augmented["sstb_self_employment_income_before_lsr"].tolist() == [ - 300.0, - 0.0, - ] - assert augmented["sstb_w2_wages_from_qualified_business"].tolist() == [ - 200.0, - 0.0, - ] - assert augmented["sstb_unadjusted_basis_qualified_property"].tolist() == [ - 2_000.0, - 0.0, - ] - assert augmented["self_employment_income_would_be_qualified"].tolist() == [ - True, - False, - ] - assert augmented["sstb_self_employment_income_would_be_qualified"].tolist() == [ - True, - False, - ] - assert augmented["qualified_reit_and_ptp_income"].tolist() == [75.0, 0.0] - assert augmented["qualified_bdc_income"].tolist() == [25.0, 0.0] - assert augmented["home_mortgage_interest"].tolist() == [900.0, 0.0] - assert augmented["investment_interest_expense"].tolist() == [40.0, 0.0] - assert augmented["other_health_insurance_premiums"].tolist() == [120.0, 0.0] - assert augmented["weekly_hours_worked_before_lsr"].tolist() == [37.5, 0.0] - - def test_augment_policyengine_person_inputs_coalesces_sparse_source_aliases_by_row( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50, 55], - "sex": [1, 2, 1], - "income": [60_000.0, 75_000.0, 0.0], - "employment_income_before_lsr": [0.0, 70_000.0, 0.0], - "wage_income": [50_000.0, 80_000.0, 0.0], - "self_employment_income_before_lsr": [0.0, 200.0, -300.0], - "self_employment_income": [500.0, 999.0, 50.0], - "taxable_interest_income": [0.0, 20.0, 0.0], - 
"interest_income": [100.0, 999.0, 0.0], - "ordinary_dividend_income": [0.0, 30.0, 0.0], - "dividend_income": [80.0, 999.0, 0.0], - "qualified_dividend_income": [0.0, 5.0, 0.0], - "non_qualified_dividend_income": [0.0, 25.0, 0.0], - "tax_exempt_pension_income": [40.0, 0.0, 0.0], - "long_term_capital_gains_before_response": [0.0, 60.0, -10.0], - "long_term_capital_gains": [40.0, 999.0, 0.0], - "capital_gains": [999.0, 999.0, 25.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["employment_income_before_lsr"].tolist() == [ - 50_000.0, - 70_000.0, - 0.0, - ] - assert augmented["self_employment_income_before_lsr"].tolist() == [ - 500.0, - 200.0, - -300.0, - ] - assert augmented["taxable_interest_income"].tolist() == [100.0, 20.0, 0.0] - assert augmented["ordinary_dividend_income"].tolist() == [80.0, 30.0, 0.0] - assert augmented["dividend_income"].tolist() == [80.0, 30.0, 0.0] - assert augmented["long_term_capital_gains_before_response"].tolist() == [ - 40.0, - 60.0, - -10.0, - ] - assert augmented["tax_exempt_private_pension_income"].tolist() == [ - 40.0, - 0.0, - 0.0, - ] - - def test_augment_policyengine_person_inputs_preserves_tax_exempt_interest_split( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 50, 55], - "sex": [1, 2, 1], - "interest_income": [100.0, 50.0, 75.0], - "taxable_interest_income": [0.0, 20.0, 0.0], - "tax_exempt_interest_income": [100.0, 30.0, 0.0], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["taxable_interest_income"].tolist() == [0.0, 20.0, 75.0] - assert augmented["tax_exempt_interest_income"].tolist() == [ - 100.0, - 30.0, - 0.0, - ] - assert ( - augmented["taxable_interest_income"] - + augmented["tax_exempt_interest_income"] - ).tolist() == [100.0, 50.0, 75.0] - - def test_attach_policyengine_tax_unit_source_inputs_derives_mortgage_structure( - self, - ): - pipeline = 
USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - tax_units = pd.DataFrame( - { - "tax_unit_id": [1, 2], - "deductible_mortgage_interest": [600.0, 0.0], - "interest_deduction": [700.0, 0.0], - "scf_mortgage_debt": [8_000.0, 0.0], - } - ) - - augmented = pipeline._attach_policyengine_tax_unit_source_inputs(tax_units) - - assert augmented["first_home_mortgage_interest"].tolist() == [600.0, 0.0] - assert augmented["interest_deduction"].tolist() == [700.0, 0.0] - assert augmented["first_home_mortgage_balance"].tolist() == [10_000.0, 0.0] - assert augmented["first_home_mortgage_origination_year"].tolist() == [2014, 0] - - def test_build_policyengine_households_preserves_vehicle_exports(self): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "household_id": [10, 10, 20], - "weight": [1.0, 1.0, 2.0], - "household_vehicles_owned": [2.0, 2.0, 1.0], - "household_vehicles_value": [12_000.0, 12_000.0, 6_000.0], - } - ) - - households = pipeline._build_policyengine_households(persons) - - assert households["household_vehicles_owned"].tolist() == [2.0, 1.0] - assert households["household_vehicles_value"].tolist() == [12_000.0, 6_000.0] - - def test_augment_policyengine_person_inputs_derives_marital_status_flags_from_cps_codes( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 52, 38], - "sex": [1, 2, 1], - "marital_status": [6, 4, 7], - } - ) - - augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["is_separated"].tolist() == [True, False, False] - assert augmented["is_surviving_spouse"].tolist() == [False, True, False] - - def test_augment_policyengine_person_inputs_derives_marital_status_flags_from_filing_status_code( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - persons = pd.DataFrame( - { - "age": [45, 52, 38], - "sex": [1, 2, 1], - "filing_status_code": [3, 5, 1], - } - ) - 
- augmented = pipeline._augment_policyengine_person_inputs(persons) - - assert augmented["is_separated"].tolist() == [True, False, False] - assert augmented["is_surviving_spouse"].tolist() == [False, True, False] - - def test_calibrate_policyengine_tables_from_db(self, persons, households, tmp_path): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - household_weights = calibrated_tables.households.set_index("household_id")[ - "household_weight" - ] - california_weight = calibrated_tables.households.loc[ - calibrated_tables.households["state_fips"] == 6, - "household_weight", - ].sum() - - assert summary["backend"] == "policyengine_db_entropy" - assert summary["n_constraints"] == 2 - assert summary["max_error"] < 1e-6 - assert summary["weight_collapse_suspected"] is False - assert summary["household_weight_diagnostics"]["total_weight"] == pytest.approx( - 450.0, - rel=1e-6, - ) - assert ( - summary["household_weight_diagnostics"]["positive_count"] - == summary["household_weight_diagnostics"]["row_count"] - ) - assert household_weights.sum() == pytest.approx(450.0, rel=1e-6) - assert california_weight == pytest.approx(225.0, rel=1e-6) - assert calibrated_persons.loc[ - calibrated_persons["state_fips"] == 6, "weight" - ].iloc[0] == pytest.approx(225.0, rel=1e-6) - - def test_calibrate_policyengine_tables_residualizes_and_appends_forbes_spine( - self, - persons, - 
households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - forbes_path = tmp_path / "forbes.jsonl" - forbes_path.write_text( - json.dumps( - { - "forbes_unit_id": "forbes-1", - "name": "Example Founder", - "rank": 1, - "state_fips": 6, - "net_worth": 10_000_000_000.0, - "weight": 1.0, - } - ) - + "\n" - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - forbes_fixed_spine_records_path=forbes_path, - forbes_fixed_spine_snapshot_id="forbes-test-2024", - forbes_fixed_spine_replicates_per_unit=2, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - contributions = { - contribution["target_name"]: contribution - for contribution in summary["fixed_spine"]["residualization"][ - "contributions" - ] - } - for table in ( - calibrated_tables.households, - calibrated_tables.persons, - calibrated_tables.tax_units, - ): - assert table is not None - assert not any(column.startswith("forbes_") for column in table.columns) - - assert summary["fixed_spine"]["enabled"] is True - assert summary["fixed_spine"]["record_metadata_rows"] == 2 - assert ( - summary["fixed_spine"]["source_metadata"]["snapshot_id"] - == "forbes-test-2024" - ) - assert summary["fixed_spine"]["residualization"]["supported_target_count"] == 2 - assert contributions["policyengine_us_target_1"]["contribution"] == ( - pytest.approx(1.0) - ) - assert contributions["policyengine_us_target_2"]["contribution"] == ( - pytest.approx(1.0) - ) - assert len(calibrated_tables.households) == 
len(tables.households) + 2 - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - california_weight = calibrated_tables.households.loc[ - calibrated_tables.households["state_fips"].eq(6), - "household_weight", - ].sum() - assert california_weight == pytest.approx(225.0, rel=1e-6) - assert calibrated_persons["weight"].sum() == pytest.approx(899.0, rel=1e-6) - - def test_calibrate_policyengine_tables_none_backend_preserves_original_weights( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="none", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - original_weights = tables.households["household_weight"].astype(float).copy() - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert summary["backend"] == "policyengine_db_none" - assert summary["converged"] is True - assert summary["max_error"] == 0.0 - assert summary["mean_error"] == 0.0 - assert summary["weight_collapse_suspected"] is False - calibrated_weights = calibrated_tables.households["household_weight"].astype( - float - ) - assert calibrated_weights.tolist() == pytest.approx(original_weights.tolist()) - - def test_calibrate_policyengine_tables_from_db_with_sparse_backend( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="sparse", - target_sparsity=0.0, - 
policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["backend"] == "policyengine_db_sparse" - assert summary["n_constraints"] == 2 - assert summary["max_error"] < 1e-5 - assert summary["converged"] is True - assert summary["sparsity"] == pytest.approx(0.0, abs=1e-9) - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-5, - ) - - def test_synthesize_seed_backend_preserves_seed_support(self, persons, households): - config = USMicroplexBuildConfig( - synthesis_backend="seed", - n_synthetic=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households) - - synthetic, synthesizer, metadata = pipeline.synthesize(seed) - - assert synthesizer is None - assert metadata["backend"] == "seed" - assert metadata["n_seed_records"] == len(seed) - assert len(synthetic) == len(seed) - assert synthetic["household_id"].nunique() == seed["household_id"].nunique() - assert synthetic["weight"].tolist() == pytest.approx(seed["hh_weight"].tolist()) - - def test_calibrate_policyengine_tables_can_prune_to_household_budget( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - class StubSparseSelector: - def __init__(self, **_kwargs): - pass - - def fit_transform( - self, - frame, - *_args, - weight_col, - linear_constraints=None, - **_kwargs, - ): - result = frame.copy() - result[weight_col] = np.array([10.0, 8.0, 0.0]) - self._constraints = tuple(linear_constraints or 
()) - return result - - def validate(self, _frame): - return { - "max_error": 0.1, - "mean_error": 0.05, - "converged": True, - "sparsity": 1 / 3, - "linear_errors": { - constraint.name: { - "actual": float(constraint.target), - "target": float(constraint.target), - "relative_error": 0.0, - } - for constraint in getattr(self, "_constraints", ()) - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.SparseCalibrator", - StubSparseSelector, - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - policyengine_selection_household_budget=2, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert len(calibrated_tables.households) == 2 - assert set(calibrated_tables.households["household_id"]) == {1, 2} - assert set(calibrated_persons["household_id"]) == {1, 2} - assert summary["selection"]["applied"] is True - assert summary["selection"]["selected_household_count"] == 2 - assert summary["selection"]["selector_positive_selected_count"] == 2 - assert "pre_selection" in summary["feasibility_filter"] - - def test_calibrate_policyengine_tables_from_db_can_use_pe_native_selection_backend( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - def _fake_optimize(**kwargs): - assert kwargs["max_iter"] == 777 - assert kwargs["tol"] == pytest.approx(1e-7) - assert kwargs["l2_penalty"] == pytest.approx(1e-5) - output_path = kwargs["output_dataset_path"] - with h5py.File(output_path, "w") as handle: - 
household_id_group = handle.create_group("household_id") - household_id_group.create_dataset("2024", data=np.asarray([1, 2, 3])) - household_weight_group = handle.create_group("household_weight") - household_weight_group.create_dataset( - "2024", - data=np.asarray([3.0, 2.0, 0.0], dtype=np.float32), - ) - return SimpleNamespace( - to_dict=lambda: { - "metric": "enhanced_cps_native_loss_weight_optimization", - "initial_loss": 0.9, - "optimized_loss": 0.7, - "converged": True, - "iterations": 12, - "positive_household_count": 2, - "target_names": ["nation/foo"], - } - ) - - monkeypatch.setattr( - "microplex_us.pipelines.us.optimize_policyengine_us_native_loss_dataset", - _fake_optimize, - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=2, - policyengine_selection_max_iter=777, - policyengine_selection_tol=1e-7, - policyengine_selection_l2_penalty=1e-5, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert len(calibrated_tables.households) == 2 - assert set(calibrated_tables.households["household_id"]) == {1, 2} - assert set(calibrated_persons["household_id"]) == {1, 2} - assert summary["selection"]["applied"] is True - assert summary["selection"]["backend"] == "pe_native_loss" - assert summary["selection"]["selected_household_count"] == 2 - assert summary["selection"]["selector_positive_selected_count"] == 2 - assert summary["selection"]["pe_native_optimization"]["optimized_loss"] == 0.7 - assert 
"target_names" not in summary["selection"]["pe_native_optimization"] - - def test_calibrate_policyengine_tables_pe_native_selection_can_preallocate_state_floor( - self, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - def _unexpected_optimize(**_kwargs): - raise AssertionError( - "PE-native optimizer should not run when state floor fills budget" - ) - - monkeypatch.setattr( - "microplex_us.pipelines.us.optimize_policyengine_us_native_loss_dataset", - _unexpected_optimize, - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=2, - policyengine_selection_state_floor=1, - ) - ) - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2, 3], - "household_weight": [10.0, 4.0, 8.0], - "state_fips": [6, 6, 36], - } - ), - persons=pd.DataFrame( - { - "person_id": [101, 102, 103], - "household_id": [1, 2, 3], - "weight": [10.0, 4.0, 8.0], - "state_fips": [6, 6, 36], - "age": [35, 28, 52], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [11, 12, 13], - "household_id": [1, 2, 3], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [21, 22, 23], - "household_id": [1, 2, 3], - } - ), - families=pd.DataFrame( - { - "family_id": [31, 32, 33], - "household_id": [1, 2, 3], - } - ), - marital_units=pd.DataFrame( - { - "marital_unit_id": [41, 42, 43], - "household_id": [1, 2, 3], - } - ), - ) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert set(calibrated_tables.households["household_id"]) == {1, 3} - assert set(calibrated_persons["household_id"]) == {1, 3} - assert 
summary["selection"]["backend"] == "pe_native_loss" - assert summary["selection"]["state_floor"]["applied"] is True - assert summary["selection"]["state_floor"]["selected_household_count"] == 2 - assert summary["selection"]["state_floor"]["state_count"] == 2 - assert summary["selection"]["pe_native_optimization"]["budget"] == 0 - - def test_selection_optimizer_kwargs_passes_target_total_weight(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=100, - policyengine_selection_target_total_weight=150_000_000.0, - ) - ) - kwargs = pipeline._policyengine_selection_optimizer_kwargs(requested_budget=100) - assert kwargs["target_total_weight"] == 150_000_000.0 - - def test_selection_optimizer_kwargs_omits_target_total_weight_when_none(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - policyengine_selection_backend="pe_native_loss", - policyengine_selection_household_budget=100, - ) - ) - kwargs = pipeline._policyengine_selection_optimizer_kwargs(requested_budget=100) - assert "target_total_weight" not in kwargs - - def test_calibrate_policyengine_tables_from_db_with_hardconcrete_backend( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - seen_constraints = {} - - class StubHardConcreteCalibrator: - def __init__(self, **_kwargs): - self._constraints = () - - def fit_transform( - self, - frame, - *_args, - weight_col, - linear_constraints=None, - **_kwargs, - ): - self._constraints = tuple(linear_constraints or ()) - seen_constraints["count"] = len(self._constraints) - return frame.copy() - - def validate(self, _frame): - return { - "max_error": 0.0, - "mean_error": 0.0, - "converged": True, - "sparsity": 0.25, - "linear_errors": { - constraint.name: { - "actual": float(constraint.target), - "target": float(constraint.target), - 
"relative_error": 0.0, - } - for constraint in self._constraints - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.HardConcreteCalibrator", - StubHardConcreteCalibrator, - ) - config = USMicroplexBuildConfig( - calibration_backend="hardconcrete", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert seen_constraints["count"] == 2 - assert summary["backend"] == "policyengine_db_hardconcrete" - assert summary["n_constraints"] == 2 - assert summary["converged"] is True - assert summary["sparsity"] == pytest.approx(0.25) - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - - def test_calibrate_policyengine_tables_from_db_with_pe_l0_backend( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - seen_constraints = {} - - class StubPolicyEngineL0Calibrator: - def __init__(self, **_kwargs): - self._constraints = () - - def fit_transform( - self, - frame, - *_args, - weight_col, - linear_constraints=None, - **_kwargs, - ): - self._constraints = tuple(linear_constraints or ()) - seen_constraints["count"] = len(self._constraints) - result = frame.copy() - result[weight_col] = result[weight_col].astype(float) - return result - - def validate(self, _frame): - return { - "max_error": 0.0, - "mean_error": 0.0, - "converged": True, - "sparsity": 0.1, - "linear_errors": { - constraint.name: { - "actual": float(constraint.target), - "target": float(constraint.target), - "relative_error": 
0.0, - } - for constraint in self._constraints - }, - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.PolicyEngineL0Calibrator", - StubPolicyEngineL0Calibrator, - ) - config = USMicroplexBuildConfig( - calibration_backend="pe_l0", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert seen_constraints["count"] == 2 - assert summary["backend"] == "policyengine_db_pe_l0" - assert summary["n_constraints"] == 2 - assert summary["converged"] is True - assert summary["sparsity"] == pytest.approx(0.1) - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - - def test_calibrate_policyengine_tables_flags_weight_collapse( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - class CollapsingCalibrator: - def __init__(self, method, **_kwargs): - self.method = method - - def fit_transform( - self, - frame, - *_args, - weight_col, - **_kwargs, - ): - collapsed = frame.copy() - collapsed[weight_col] = 1e-10 - return collapsed - - def validate(self, _frame): - return { - "max_error": 1.0, - "mean_error": 1.0, - "converged": False, - "linear_errors": {}, - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.Calibrator", - CollapsingCalibrator, - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - 
policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - _, calibrated_persons, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["weight_collapse_suspected"] is True - assert ( - summary["household_weight_diagnostics"]["tiny_count"] - == summary["household_weight_diagnostics"]["row_count"] - ) - assert summary["household_weight_diagnostics"]["total_weight"] == pytest.approx( - summary["household_weight_diagnostics"]["row_count"] * 1e-10 - ) - assert summary["person_weight_diagnostics"]["tiny_count"] == len( - calibrated_persons - ) - - def test_calibrate_policyengine_tables_can_rescale_back_to_input_weight_sum( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - class ShrinkingCalibrator: - def __init__(self, method, **_kwargs): - self.method = method - - def fit_transform( - self, - frame, - *_args, - weight_col, - **_kwargs, - ): - shrunk = frame.copy() - shrunk[weight_col] = shrunk[weight_col].astype(float) * 0.25 - return shrunk - - def validate(self, frame): - values = frame["household_weight"].astype(float).to_numpy() - return { - "max_error": 0.0, - "mean_error": 0.0, - "converged": True, - "linear_errors": {}, - "sparsity": 0.0, - "validated_weight_sum": float(values.sum()), - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.Calibrator", - ShrinkingCalibrator, - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - policyengine_calibration_rescale_to_input_weight_sum=True, - ) - pipeline = USMicroplexPipeline(config) 
- seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert summary["input_household_weight_sum"] == pytest.approx(450.0, rel=1e-6) - assert summary["pre_rescale_household_weight_sum"] == pytest.approx( - 112.5, rel=1e-6 - ) - assert summary["post_rescale_household_weight_sum"] == pytest.approx( - 450.0, rel=1e-6 - ) - assert summary["weight_sum_rescaled"] is True - assert summary["weight_sum_rescale_mode"] == "input_weight_sum" - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - assert calibrated_persons["weight"].sum() == pytest.approx(900.0, rel=1e-6) - - def test_calibrate_policyengine_tables_can_rescale_to_target_weight_sum( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - - class ShrinkingCalibrator: - def __init__(self, method, **_kwargs): - self.method = method - - def fit_transform( - self, - frame, - *_args, - weight_col, - **_kwargs, - ): - shrunk = frame.copy() - shrunk[weight_col] = shrunk[weight_col].astype(float) * 0.25 - return shrunk - - def validate(self, frame): - values = frame["household_weight"].astype(float).to_numpy() - return { - "max_error": 0.0, - "mean_error": 0.0, - "converged": True, - "linear_errors": {}, - "sparsity": 0.0, - "validated_weight_sum": float(values.sum()), - } - - monkeypatch.setattr( - "microplex_us.pipelines.us.Calibrator", - ShrinkingCalibrator, - ) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - 
policyengine_calibration_target_total_weight=1_000.0, - policyengine_calibration_rescale_to_target_total_weight=True, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert summary["input_household_weight_sum"] == pytest.approx(450.0, rel=1e-6) - assert summary["pre_rescale_household_weight_sum"] == pytest.approx( - 112.5, rel=1e-6 - ) - assert summary["post_rescale_household_weight_sum"] == pytest.approx( - 1_000.0, rel=1e-6 - ) - assert summary["weight_sum_rescaled"] is True - assert summary["weight_sum_rescale_mode"] == "target_total_weight" - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 1_000.0, - rel=1e-6, - ) - assert calibrated_persons["weight"].sum() == pytest.approx(2_000.0, rel=1e-6) - - def test_summarize_weight_diagnostics_flags_low_effective_sample_ratio(self): - summary = _summarize_weight_diagnostics([100.0, 100.0] + [1e-10] * 10) - - assert summary["tiny_share"] < 0.95 - assert summary["effective_sample_ratio"] < 0.25 - assert summary["collapse_suspected"] is True - - def test_select_feasible_policyengine_calibration_constraints_caps_budget(self): - targets = [ - TargetSpec( - name="national_count", - entity=EntityType.HOUSEHOLD, - value=100.0, - period=2024, - aggregation=TargetAggregation.COUNT, - metadata={"geo_level": "national"}, - ), - TargetSpec( - name="state_count", - entity=EntityType.HOUSEHOLD, - value=50.0, - period=2024, - aggregation=TargetAggregation.COUNT, - metadata={"geo_level": "state"}, - ), - TargetSpec( - name="state_sum", - entity=EntityType.HOUSEHOLD, - value=25.0, - period=2024, - measure="snap", - aggregation=TargetAggregation.SUM, - metadata={"geo_level": "state"}, - ), - ] - constraints = ( - 
SimpleNamespace(coefficients=np.array([1.0, 1.0])), - SimpleNamespace(coefficients=np.array([1.0, 0.0])), - SimpleNamespace(coefficients=np.array([1.0, 1.0])), - ) - - selected_targets, selected_constraints, summary = ( - _select_feasible_policyengine_calibration_constraints( - targets, - constraints, - household_count=2, - max_constraints=None, - max_constraints_per_household=1.0, - min_active_households=1, - ) - ) - - assert [target.name for target in selected_targets] == [ - "national_count", - "state_count", - ] - assert len(selected_constraints) == 2 - assert summary["feasibility_filter_applied"] is True - assert summary["requested_max_constraints"] == 2 - assert summary["n_constraints_before_feasibility_filter"] == 3 - assert summary["n_constraints_after_feasibility_filter"] == 2 - assert summary["n_constraints_dropped_over_capacity"] == 1 - assert summary["constraint_drop_share"] == pytest.approx(1 / 3) - assert summary["warning_messages"] - - def test_select_feasible_policyengine_calibration_constraints_drops_low_support_rows( - self, - ): - targets = [ - TargetSpec( - name="dense_state_count", - entity=EntityType.HOUSEHOLD, - value=50.0, - period=2024, - aggregation=TargetAggregation.COUNT, - metadata={"geo_level": "state"}, - ), - TargetSpec( - name="thin_state_count", - entity=EntityType.HOUSEHOLD, - value=25.0, - period=2024, - aggregation=TargetAggregation.COUNT, - metadata={"geo_level": "state"}, - ), - ] - constraints = ( - SimpleNamespace(coefficients=np.array([1.0, 1.0, 1.0, 1.0, 1.0])), - SimpleNamespace(coefficients=np.array([0.0, 0.0, 0.0, 0.0, 1.0])), - ) - - selected_targets, _, summary = ( - _select_feasible_policyengine_calibration_constraints( - targets, - constraints, - household_count=5, - max_constraints=None, - max_constraints_per_household=None, - min_active_households=5, - ) - ) - - assert [target.name for target in selected_targets] == ["dense_state_count"] - assert summary["n_constraints_dropped_low_support"] == 1 - assert 
summary["n_constraints_after_feasibility_filter"] == 1 - - def test_normalize_microcalibrate_constraints_flips_negative_targets(self): - constraints = ( - LinearConstraint( - name="signed_loss", - coefficients=np.array([-4.0, 1.0, 0.0]), - target=-12.0, - ), - LinearConstraint( - name="positive_amount", - coefficients=np.array([2.0, 0.0, 3.0]), - target=20.0, - ), - ) - - normalized, summary = _normalize_policyengine_constraints_for_microcalibrate( - constraints - ) - - assert normalized[0].name == "signed_loss" - assert normalized[0].target == pytest.approx(12.0) - np.testing.assert_allclose(normalized[0].coefficients, [4.0, -1.0, -0.0]) - assert normalized[1].name == "positive_amount" - assert normalized[1].target == pytest.approx(20.0) - np.testing.assert_allclose(normalized[1].coefficients, [2.0, 0.0, 3.0]) - assert summary == { - "sign_flipped_constraint_count": 1, - "sign_flipped_constraint_names": ["signed_loss"], - "sign_flipped_constraint_names_truncated": False, - } - - def test_calibrate_policyengine_tables_applies_feasibility_constraint_budget( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_max_constraints_per_household=0.5, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_constraints"] == 1 - assert summary["feasibility_filter"]["feasibility_filter_applied"] is True - assert 
summary["feasibility_filter"]["requested_max_constraints"] == 1 - assert ( - summary["feasibility_filter"]["n_constraints_before_feasibility_filter"] - == 2 - ) - assert ( - summary["feasibility_filter"]["n_constraints_after_feasibility_filter"] == 1 - ) - assert summary["target_plan"]["stage_counts"] == { - "solve_now": 1, - "solve_later": 1, - "audit_only": 0, - } - assert summary["target_plan"]["reason_counts"]["constraint_capacity"] == 1 - assert summary["oracle_loss"]["full_oracle"]["target_count"] == 2 - assert summary["oracle_loss"]["full_oracle"]["supported_target_count"] == 2 - assert summary["oracle_loss"]["active_solve"]["target_count"] == 1 - assert summary["oracle_loss"]["active_solve"][ - "mean_abs_relative_error" - ] == pytest.approx( - 0.0, - abs=1e-12, - ) - assert summary["oracle_loss"]["active_solve"][ - "capped_mean_abs_relative_error" - ] == pytest.approx(0.0, abs=1e-12) - assert summary["oracle_loss"]["deferred"]["target_count"] == 1 - assert ( - summary["oracle_loss"]["deferred"]["family_summaries"]["household_count"][ - "target_count" - ] - == 1 - ) - assert summary["oracle_loss"]["deferred"]["family_summaries"][ - "household_count" - ]["loss_share"] == pytest.approx(1.0, rel=1e-9) - assert summary["oracle_loss"]["deferred"]["family_summaries"][ - "household_count" - ]["sum_abs_relative_error"] == pytest.approx( - summary["oracle_loss"]["deferred"]["mean_abs_relative_error"], - rel=1e-9, - ) - assert summary["oracle_loss"]["deferred"]["family_ranking"][0]["group"] == ( - "household_count" - ) - assert summary["oracle_loss"]["deferred"]["family_ranking"][0][ - "capped_sum_abs_relative_error" - ] == pytest.approx( - summary["oracle_loss"]["deferred"]["family_ranking"][0][ - "sum_abs_relative_error" - ], - rel=1e-9, - ) - assert ( - summary["oracle_loss"]["full_oracle"]["geography_summaries"]["unspecified"][ - "target_count" - ] - == 2 - ) - assert summary["oracle_loss"]["full_oracle"]["geography_ranking"][0][ - "group" - ] == ("unspecified") - 
assert ( - summary["oracle_loss"]["full_oracle"]["mean_abs_relative_error"] - > summary["oracle_loss"]["active_solve"]["mean_abs_relative_error"] - ) - assert summary["full_oracle_mean_abs_relative_error"] == pytest.approx( - summary["oracle_loss"]["full_oracle"]["mean_abs_relative_error"], - rel=1e-9, - ) - assert summary["full_oracle_capped_mean_abs_relative_error"] == pytest.approx( - summary["oracle_loss"]["full_oracle"]["capped_mean_abs_relative_error"], - rel=1e-9, - ) - assert summary["active_solve_mean_abs_relative_error"] == pytest.approx( - summary["oracle_loss"]["active_solve"]["mean_abs_relative_error"], - rel=1e-9, - ) - assert summary["active_solve_capped_mean_abs_relative_error"] == pytest.approx( - summary["oracle_loss"]["active_solve"]["capped_mean_abs_relative_error"], - rel=1e-9, - ) - assert summary["oracle_relative_error_cap"] == pytest.approx(10.0) - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - - def test_calibrate_policyengine_tables_warns_when_many_constraints_are_dropped( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_max_constraints_per_household=0.5, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - with pytest.warns( - UserWarning, - match="Calibration feasibility filter dropped", - ): - _, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["warnings"] - - def 
test_calibrate_policyengine_tables_runs_deferred_low_support_stage( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=2, - policyengine_calibration_deferred_stage_min_active_households=(1,), - policyengine_calibration_deferred_stage_top_family_count=None, - policyengine_calibration_deferred_stage_top_geography_count=None, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - _, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_constraints"] == 2 - assert summary["n_supported_targets"] == 2 - assert summary["n_calibration_stages_applied"] == 2 - assert summary["final_calibration_stage_index"] == 2 - assert summary["deferred_stage_support_schedule"] == [1] - assert summary["target_plan"]["stage_counts"] == { - "solve_now": 2, - "solve_later": 0, - "audit_only": 0, - } - assert summary["target_plan"]["reason_counts"]["selected_stage_1"] == 1 - assert summary["target_plan"]["reason_counts"]["selected_stage_2"] == 1 - assert summary["oracle_loss"]["deferred"]["target_count"] == 0 - assert summary["oracle_loss"]["active_solve"]["target_count"] == 2 - assert len(summary["calibration_stages"]) == 2 - assert summary["calibration_stages"][1]["kind"] == "deferred" - assert summary["calibration_stages"][1]["status"] == "applied" - assert summary["calibration_stages"][1]["min_active_households"] == 1 - assert summary["calibration_stages"][1]["selected_target_count"] == 1 - assert any( - entry["stage"] == "solve_now" and entry["reason"] == "selected_stage_2" 
- for entry in summary["target_ledger"] - ) - - def test_calibrate_policyengine_tables_skips_deferred_stage_below_trigger_threshold( - self, - persons, - households, - tmp_path, - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=3, - policyengine_calibration_deferred_stage_min_active_households=(2, 1), - policyengine_calibration_deferred_stage_top_family_count=None, - policyengine_calibration_deferred_stage_top_geography_count=None, - policyengine_calibration_deferred_stage_min_full_oracle_capped_mean_abs_relative_error=100.0, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - _, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_constraints"] == 1 - assert summary["n_calibration_stages_applied"] == 1 - assert summary["final_calibration_stage_index"] == 1 - assert summary["deferred_stage_support_schedule"] == [2, 1] - assert summary["target_plan"]["stage_counts"] == { - "solve_now": 1, - "solve_later": 1, - "audit_only": 0, - } - assert len(summary["calibration_stages"]) == 3 - assert summary["calibration_stages"][1]["status"] == "skipped" - assert summary["calibration_stages"][1]["skip_reason"] == ( - "trigger_metric_below_threshold" - ) - assert summary["calibration_stages"][1]["trigger_threshold"] == pytest.approx( - 100.0 - ) - assert summary["calibration_stages"][2]["status"] == "skipped" - assert summary["calibration_stages"][2]["skip_reason"] == ( - "trigger_metric_below_threshold" - ) - assert summary["calibration_stages"][2]["trigger_threshold"] == pytest.approx( - 100.0 - ) - - 
def test_calibrate_policyengine_tables_marks_materialization_failures_audit_only( - self, - persons, - households, - tmp_path, - ): - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__( - self, - entity: FakeEntity, - formulas: dict[str, object] | None = None, - ): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "state_fips": FakeVariable(FakeEntity("household")), - "income_tax": FakeVariable( - FakeEntity("person"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - _ = dataset, dataset_year, kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert map_to is None - if variable == "income_tax": - raise RuntimeError("missing test parameter") - raise KeyError(variable) - - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db_with_unsupported_target(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - policyengine_simulation_cls=FakeSimulation, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_loaded_targets"] == 2 - assert summary["n_supported_targets"] == 1 - assert summary["n_unsupported_targets"] == 0 - assert summary["n_constraints"] == 1 - assert summary["target_plan"]["stage_counts"] == { - "solve_now": 1, - "solve_later": 0, - "audit_only": 1, - } - assert 
summary["target_plan"]["reason_counts"]["materialization_failure"] == 1 - assert summary["oracle_loss"]["full_oracle"]["target_count"] == 2 - assert summary["oracle_loss"]["full_oracle"]["supported_target_count"] == 1 - assert summary["oracle_loss"]["full_oracle"]["unsupported_target_count"] == 1 - assert summary["oracle_loss"]["audit_only"]["target_count"] == 1 - assert summary["oracle_loss"]["audit_only"]["supported_target_count"] == 0 - assert summary["oracle_loss"]["audit_only"]["unsupported_target_count"] == 1 - assert summary["oracle_loss"]["full_oracle"][ - "unsupported_target_error_penalty" - ] == pytest.approx(10.0) - assert summary["oracle_loss"]["full_oracle"]["mean_abs_relative_error"] == ( - pytest.approx(5.0) - ) - assert summary["oracle_loss"]["full_oracle"][ - "capped_mean_abs_relative_error" - ] == pytest.approx(5.0) - assert summary["oracle_loss"]["audit_only"]["mean_abs_relative_error"] == ( - pytest.approx(10.0) - ) - assert summary["oracle_loss"]["audit_only"][ - "capped_mean_abs_relative_error" - ] == pytest.approx(10.0) - assert ( - summary["oracle_loss"]["audit_only"]["family_summaries"]["income_tax"][ - "unsupported_target_count" - ] - == 1 - ) - assert summary["oracle_loss"]["audit_only"]["family_summaries"]["income_tax"][ - "sum_abs_relative_error" - ] == pytest.approx(10.0) - assert summary["oracle_loss"]["audit_only"]["family_summaries"]["income_tax"][ - "capped_sum_abs_relative_error" - ] == pytest.approx(10.0) - assert summary["oracle_loss"]["audit_only"]["family_ranking"][0]["group"] == ( - "income_tax" - ) - assert summary["oracle_loss"]["full_oracle"]["family_summaries"]["income_tax"][ - "supported_target_rate" - ] == pytest.approx(0.0, abs=1e-12) - assert ( - summary["oracle_loss"]["full_oracle"]["geography_summaries"]["unspecified"][ - "unsupported_target_count" - ] - == 1 - ) - assert summary["oracle_loss"]["full_oracle"]["geography_ranking"][0][ - "group" - ] == ("unspecified") - assert any( - entry["stage"] == "audit_only" - 
and entry["reason"] == "materialization_failure" - for entry in summary["target_ledger"] - ) - assert calibrated_tables.households["household_weight"].sum() == pytest.approx( - 450.0, - rel=1e-6, - ) - - def test_policyengine_target_provider_returns_canonical_specs( - self, persons, households, tmp_path - ): - db_path = tmp_path / "policyengine_targets.db" - _create_policyengine_calibration_db(db_path) - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("household_count",), - policyengine_target_period=2024, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - provider = pipeline.config.policyengine_targets_db - assert provider is not None - bindings = pipeline._infer_policyengine_variable_bindings(tables) - - from microplex_us.policyengine.us import PolicyEngineUSDBTargetProvider - - targets = PolicyEngineUSDBTargetProvider(provider).load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "variables": ["household_count"], - "reform_id": 0, - "entity_overrides": { - variable: binding.entity - for variable, binding in bindings.items() - }, - }, - ) - ) - - assert targets.targets - assert all(isinstance(target, TargetSpec) for target in targets.targets) - - def test_calibrate_policyengine_tables_from_db_with_simulated_variable( - self, persons, households, tmp_path, monkeypatch - ): - db_path = tmp_path / "policyengine_targets.db" - conn = sqlite3.connect(db_path) - national_constraints: tuple[PolicyEngineUSConstraint, ...] 
= () - snap_positive_constraints = (PolicyEngineUSConstraint("snap", ">", "0"),) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - [ - ( - 1, - compute_policyengine_us_definition_hash(national_constraints), - None, - ), - ( - 2, - compute_policyengine_us_definition_hash( - snap_positive_constraints, - parent_stratum_id=1, - ), - 1, - ), - ], - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - (2, "snap", ">", "0"), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - (1, "snap", 2024, 1, 0, 200.0, 1, None, "test", "national"), - (2, "household_count", 2024, 2, 0, 2.0, 1, None, "test", "positive"), - ], - ) - conn.commit() - conn.close() - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - self.dataset = dataset - self.dataset_year = dataset_year - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert self.dataset_year == 2024 - assert map_to is None - if variable == "snap": - return [100.0, 0.0, 0.0] - raise KeyError(variable) - - captured_direct_overrides: list[tuple[str, ...]] = [] - original_materialize = ( - us_pipeline_module.materialize_policyengine_us_variables_safely - ) - - def spy_materialize(*args, **kwargs): - captured_direct_overrides.append( - tuple(kwargs.get("direct_override_variables", ())) - ) - return original_materialize(*args, **kwargs) - - monkeypatch.setattr( - us_pipeline_module, - "materialize_policyengine_us_variables_safely", - spy_materialize, - ) - - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("snap", "household_count"), - policyengine_target_period=2024, - policyengine_dataset_year=2024, - policyengine_simulation_cls=FakeSimulation, - policyengine_direct_override_variables=("pre_tax_contributions",), - policyengine_calibration_min_active_households=1, - ) - pipeline = 
USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight", "income": "employment_income"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - tables.households["snap"] = 999.0 - - calibrated_tables, calibrated_persons, summary = ( - pipeline.calibrate_policyengine_tables(tables) - ) - - assert summary["backend"] == "policyengine_db_entropy" - assert captured_direct_overrides == [("pre_tax_contributions",)] - assert summary["n_constraints"] == 2 - assert summary["materialized_variables"] == ["snap"] - assert summary["max_error"] < 1e-6 - positive_weight = calibrated_tables.households.loc[ - calibrated_tables.households["snap"] > 0, - "household_weight", - ].sum() - assert ( - calibrated_tables.households["snap"] - * calibrated_tables.households["household_weight"] - ).sum() == pytest.approx( - 200.0, - rel=1e-6, - ) - assert positive_weight == pytest.approx(2.0, rel=1e-6) - positive_household_id = int( - calibrated_tables.households.loc[ - calibrated_tables.households["snap"] > 0, - "household_id", - ].iloc[0] - ) - assert calibrated_persons.loc[ - calibrated_persons["household_id"] == positive_household_id, "weight" - ].iloc[0] == pytest.approx(2.0, rel=1e-6) - - def test_calibrate_policyengine_tables_skips_failed_materialized_variables( - self, persons, households, tmp_path - ): - db_path = tmp_path / "policyengine_targets.db" - conn = sqlite3.connect(db_path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL 
DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - (1, compute_policyengine_us_definition_hash(()), None), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - (1, "snap", 2024, 1, 0, 200.0, 1, None, "test", "national"), - ( - 2, - "adjusted_gross_income", - 2024, - 1, - 0, - 1_000.0, - 1, - None, - "test", - "agi", - ), - ], - ) - conn.commit() - conn.close() - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - "adjusted_gross_income": FakeVariable( - FakeEntity("tax_unit"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - self.dataset = dataset - self.dataset_year = dataset_year - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert self.dataset_year == 2024 - assert map_to is None - if variable == "snap": - return [100.0, 0.0, 0.0] - if variable == "adjusted_gross_income": - raise RuntimeError("invalid state metadata") - raise KeyError(variable) - - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("snap", "adjusted_gross_income"), - policyengine_target_period=2024, - 
policyengine_dataset_year=2024, - policyengine_simulation_cls=FakeSimulation, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight", "income": "employment_income"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_loaded_targets"] == 2 - assert summary["n_supported_targets"] == 1 - assert summary["n_constraints"] == 1 - assert summary["materialized_variables"] == ["snap"] - assert summary["materialization_failures"] == { - "adjusted_gross_income": "RuntimeError: invalid state metadata" - } - - def test_calibrate_policyengine_tables_uses_calibration_target_filters( - self, persons, households, tmp_path - ): - db_path = tmp_path / "policyengine_targets.db" - conn = sqlite3.connect(db_path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - (1, compute_policyengine_us_definition_hash(()), None), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - (1, "snap", 2024, 1, 0, 200.0, 1, None, "test", "national"), - ( - 2, - "adjusted_gross_income", - 2024, - 1, - 0, - 1_000.0, - 1, - None, - "test", - "agi", - ), - ], - ) - conn.commit() - conn.close() - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - "adjusted_gross_income": FakeVariable( - FakeEntity("tax_unit"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - self.dataset = dataset - self.dataset_year = dataset_year - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert self.dataset_year == 2024 - assert map_to is None - if variable == "snap": - return [100.0, 0.0, 0.0] - if variable == "adjusted_gross_income": - raise RuntimeError("invalid state metadata") - raise KeyError(variable) - - config = USMicroplexBuildConfig( - calibration_backend="entropy", - policyengine_targets_db=str(db_path), - policyengine_target_variables=("snap", "adjusted_gross_income"), - policyengine_calibration_target_variables=("snap",), - policyengine_target_period=2024, - policyengine_dataset_year=2024, - policyengine_simulation_cls=FakeSimulation, - policyengine_calibration_min_active_households=1, - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households).rename( - columns={"hh_weight": "weight", "income": "employment_income"} - ) - tables = pipeline.build_policyengine_entity_tables(seed) - - calibrated_tables, _, summary = 
pipeline.calibrate_policyengine_tables(tables) - - assert summary["n_loaded_targets"] == 1 - assert summary["n_supported_targets"] == 1 - assert summary["n_constraints"] == 1 - assert summary["target_variables"] == ["snap"] - assert summary["materialized_variables"] == ["snap"] - assert summary["materialization_failures"] == {} - assert ( - calibrated_tables.households["snap"] - * calibrated_tables.households["household_weight"] - ).sum() == pytest.approx(200.0, rel=1e-6) - assert ( - calibrated_tables.households["snap"] - * calibrated_tables.households["household_weight"] - ).sum() == pytest.approx(200.0, rel=1e-6) - - def test_build_policyengine_target_query_includes_named_target_profile(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - policyengine_target_profile="pe_native_broad", - ) - ) - - query = pipeline._build_policyengine_target_query({}, period=2024) - - assert query.provider_filters["target_profile"] == "pe_native_broad" - assert query.provider_filters["target_cells"] - assert { - cell["geo_level"] for cell in query.provider_filters["target_cells"] - } <= {"national", "state"} - - def test_build_policyengine_target_query_prefers_calibration_profile_override(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - policyengine_target_profile="pe_native_broad", - policyengine_calibration_target_profile="pe_native_broad", - policyengine_calibration_target_variables=("snap",), - ) - ) - - query = pipeline._build_policyengine_target_query( - {}, - period=2024, - for_calibration=True, - ) - - assert query.provider_filters["target_profile"] == "pe_native_broad" - assert query.provider_filters["variables"] == ["snap"] - assert query.provider_filters["target_cells"] - - def test_load_inputs_from_directory(self, persons, households, tmp_path): - households.rename(columns={"hh_weight": "household_weight"}).to_parquet( - tmp_path / "cps_asec_households.parquet", - index=False, - ) - persons.to_parquet(tmp_path / 
"cps_asec_persons.parquet", index=False) - - config = USMicroplexBuildConfig( - n_synthetic=8, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - pipeline = USMicroplexPipeline(config) - result = pipeline.build_from_data_dir(tmp_path) - - assert result.synthetic_data["household_id"].nunique() == 8 - assert len(result.synthetic_data) > 8 - assert result.seed_data["hh_weight"].sum() == pytest.approx(900.0) - - def test_build_weight_calibrator_respects_iteration_and_tolerance_config(self): - config = USMicroplexBuildConfig( - calibration_backend="entropy", - calibration_tol=1e-4, - calibration_max_iter=777, - ) - pipeline = USMicroplexPipeline(config) - - calibrator = pipeline._build_weight_calibrator() - - assert calibrator.tol == pytest.approx(1e-4) - assert calibrator.max_iter == 777 - - def test_build_from_data_dir_can_prefer_cached_cps_asec_source( - self, - persons, - households, - tmp_path, - monkeypatch, - ): - households.rename(columns={"hh_weight": "household_weight"}).to_parquet( - tmp_path / "cps_asec_households.parquet", - index=False, - ) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - cache_dir = tmp_path / "cache" - cache_dir.mkdir() - (cache_dir / "cps_asec_2023_processed.parquet").write_text("stub") - - class FakeCachedProvider: - def __init__(self, *, year, cache_dir, download): - self.year = year - self.cache_dir = cache_dir - self.download = download - self.descriptor = SourceDescriptor( - name="cps_asec", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - ), - ), - ) - - class FakeParquetProvider: - def __init__(self, *, data_dir): - self.data_dir = data_dir - self.descriptor = SourceDescriptor( - name="cps_asec_parquet", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( 
- EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips",), - ), - ), - ) - - monkeypatch.setattr( - "microplex_us.data_sources.cps.CPSASECSourceProvider", - FakeCachedProvider, - ) - monkeypatch.setattr( - "microplex_us.data_sources.cps.CPSASECParquetSourceProvider", - FakeParquetProvider, - ) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - prefer_cached_cps_asec_source=True, - cps_asec_cache_dir=str(cache_dir), - cps_asec_source_year=2023, - ) - ) - chosen: dict[str, object] = {} - - def fake_build_from_source_provider(provider): - chosen["provider"] = provider - return "cached" - - monkeypatch.setattr( - pipeline, "build_from_source_provider", fake_build_from_source_provider - ) - - result = pipeline.build_from_data_dir(tmp_path) - - assert result == "cached" - assert chosen["provider"].descriptor.name == "cps_asec" - - def test_build_from_source_provider(self, persons, households): - provider_households = households.rename( - columns={ - "household_id": "hh_id", - "hh_weight": "household_weight", - } - ) - provider_persons = persons.rename( - columns={ - "person_id": "person_key", - "household_id": "hh_id", - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="test_cps", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="hh_id", - weight_column="household_weight", - variable_names=tuple( - column - for column in provider_households.columns - if column != "hh_id" - ), - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=tuple( - column - for column in provider_persons.columns - if column not in {"person_key", "hh_id"} - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: provider_households, - EntityType.PERSON: provider_persons, - }, - relationships=( - EntityRelationship( - 
parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="hh_id", - child_key="hh_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - provider = StaticSourceProvider(frame) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=8, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_source_provider(provider) - - assert result.synthetic_data["household_id"].nunique() == 8 - assert len(result.synthetic_data) > 8 - assert result.source_frame is not None - assert result.source_frame.source.name == "test_cps" - assert result.fusion_plan is not None - assert result.fusion_plan.source_names == ("test_cps",) - assert result.seed_data["hh_weight"].sum() == pytest.approx(900.0) - assert {"person_id", "household_id"}.issubset(result.seed_data.columns) - - def test_build_from_source_provider_requires_household_person_relationship( - self, persons, households - ): - frame = ObservationFrame( - source=SourceDescriptor( - name="test_cps", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "hh_weight", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=(), - ) - provider = StaticSourceProvider(frame) - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - - with pytest.raises( - ValueError, - match="one-to-many household-to-person relationship", - ): - pipeline.build_from_source_provider(provider) - - def test_build_from_frames_prefers_scaffold_with_valid_geography(self): - cps_households = pd.DataFrame( - 
{ - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [50.0, 75.0, 80.0], - "state_fips": [0, 0, 0], - "tenure": [1, 2, 1], - "extra_household_var": [1.0, 2.0, 3.0], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [51, 34, 28], - "sex": [1, 2, 1], - "education": [4, 3, 2], - "employment_status": [1, 1, 0], - "income": [80_000.0, 40_000.0, 20_000.0], - "extra_person_var": [9.0, 8.0, 7.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - 
variable_names=("state_fips", "tenure", "extra_household_var"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "extra_person_var", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_frames([cps_frame, donor_frame]) - - assert result.source_frame is not None - assert result.source_frame.source.name == "cps_like" - assert result.seed_data["state_fips"].tolist() == [6, 36] - - def test_select_scaffold_prefers_cps_when_puf_support_clone_enabled(self): - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - } - ) - acs_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [90.0, 110.0, 130.0], - "state_fips": [6, 36, 48], - "tenure": [1, 2, 1], - "rent": [1_000.0, 1_500.0, 900.0], - "real_estate_taxes": [0.0, 2_000.0, 3_000.0], - } - ) - acs_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 74_000.0], - "extra_person_var": [9.0, 
8.0, 7.0], - } - ) - - def frame( - name: str, - households: pd.DataFrame, - persons: pd.DataFrame, - household_variables: tuple[str, ...], - person_variables: tuple[str, ...], - ) -> ObservationFrame: - return ObservationFrame( - source=SourceDescriptor( - name=name, - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=household_variables, - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=person_variables, - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - cps_frame = frame( - "cps_asec_2025", - cps_households, - cps_persons, - ("state_fips", "tenure"), - ( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ) - acs_frame = frame( - "acs_2022", - acs_households, - acs_persons, - ("state_fips", "tenure", "rent", "real_estate_taxes"), - ( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "extra_person_var", - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - puf_support_clone_enabled=True, - synthesis_backend="seed", - calibration_backend="entropy", - ) - ) - source_inputs = [ - pipeline.prepare_source_input(cps_frame), - pipeline.prepare_source_input(acs_frame), - ] - - selected = pipeline._select_scaffold_source(source_inputs) - - assert selected.frame.source.name == "cps_asec_2025" - - def test_build_from_frames_prefers_scaffold_with_state_program_proxies(self): - proxy_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - 
"state_fips": [6, 36], - "tenure": [1, 2], - } - ) - proxy_persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - "age": [45, 19], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [60_000.0, 12_000.0], - "has_medicaid": [1, 0], - "public_assistance": [0.0, 250.0], - "ssi": [0.0, 0.0], - "social_security": [0.0, 0.0], - } - ) - wider_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [90.0, 110.0], - "state_fips": [6, 36], - "tenure": [1, 2], - "extra_household_var": [1.0, 2.0], - } - ) - wider_persons = pd.DataFrame( - { - "person_id": [1001, 1002], - "household_id": [101, 102], - "age": [44, 21], - "sex": [1, 2], - "education": [3, 2], - "employment_status": [1, 0], - "income": [58_000.0, 13_000.0], - "extra_person_var": [9.0, 8.0], - "another_extra_var": [5.0, 6.0], - } - ) - - proxy_frame = ObservationFrame( - source=SourceDescriptor( - name="proxy_rich_cps", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "has_medicaid", - "public_assistance", - "ssi", - "social_security", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: proxy_households, - EntityType.PERSON: proxy_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - wider_frame = ObservationFrame( - source=SourceDescriptor( - name="wider_but_proxy_poor", - shareability=Shareability.PUBLIC, - 
time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure", "extra_household_var"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "extra_person_var", - "another_extra_var", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: wider_households, - EntityType.PERSON: wider_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_frames([proxy_frame, wider_frame]) - - assert result.source_frame is not None - assert result.source_frame.source.name == "proxy_rich_cps" - assert result.synthesis_metadata["state_program_support_proxies"][ - "available" - ] == [ - "has_medicaid", - "public_assistance", - "social_security", - "ssi", - ] - assert result.synthesis_metadata["condition_vars"] == [ - "age", - "sex", - "education", - "employment_status", - "state_fips", - "tenure", - "has_medicaid", - ] - assert "has_medicaid" not in result.synthesis_metadata["target_vars"] - assert "public_assistance" in result.synthesis_metadata["target_vars"] - assert "ssi" in result.synthesis_metadata["target_vars"] - assert "social_security" in result.synthesis_metadata["target_vars"] - - def test_build_from_source_provider_promotes_state_program_proxies_to_conditions( - self, - ): - households = pd.DataFrame( - { - "household_key": [1, 2, 3], - "household_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - 
"tenure": [1, 2, 1], - } - ) - persons = pd.DataFrame( - { - "person_key": [10, 11, 12], - "household_key": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - "has_medicaid": [1, 0, 1], - "public_assistance": [0.0, 250.0, 0.0], - "ssi": [0.0, 0.0, 900.0], - "social_security": [0.0, 0.0, 1200.0], - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="proxy_rich_single_source", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_key", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=( - "age", - "sex", - "education", - "employment_status", - "income", - "has_medicaid", - "public_assistance", - "ssi", - "social_security", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_key", - child_key="household_key", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_source_provider(StaticSourceProvider(frame)) - - assert result.synthesis_metadata["condition_vars"] == [ - "age", - "sex", - "education", - "employment_status", - "state_fips", - "tenure", - "has_medicaid", - ] - assert result.synthesis_metadata["target_vars"] == [ - "income", - "public_assistance", - "ssi", - "social_security", - ] - - def test_build_from_frames_skips_non_numeric_donor_imputation_targets(self): - cps_households = pd.DataFrame( - { - 
"household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 61], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 41_000.0], - "taxable_interest_income": [100.0, 50.0, 25.0], - "all_zero_income": [0.0, 0.0, 0.0], - "filing_status": ["SINGLE", "JOINT", "SINGLE"], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( 
- EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - "all_zero_income", - "filing_status", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "taxable_interest_income" in integration["seed_data"].columns - assert "all_zero_income" not in integration["seed_data"].columns - assert "filing_status" not in integration["seed_data"].columns - assert integration["integrated_variables"] == ["taxable_interest_income"] - - def test_integrate_donor_sources_restricts_puf_to_authoritative_variables(self): - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - 
donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [0, 0, 0], - "tenure": [0, 0, 0], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 61], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 41_000.0], - "employment_income": [55_000.0, 12_500.0, 39_000.0], - "taxable_interest_income": [0.0, 25.0, 100.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "employment_income", - "taxable_interest_income", - ), - ), 
- ), - variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "tenure": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "employment_status": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "employment_income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert "taxable_interest_income" in integration["integrated_variables"] - assert "employment_income" not in integration["integrated_variables"] - assert "taxable_interest_income" in integration["seed_data"].columns - assert "employment_income" not in integration["seed_data"].columns - - def test_integrate_donor_sources_respects_excluded_variables(self, monkeypatch): - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = condition_vars, kwargs - self.target_vars = tuple(target_vars) - - def fit(self, *args, 
**kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0] * len(result) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [0, 0, 0], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 61], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 41_000.0], - "taxable_interest_income": [0.0, 25.0, 100.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - 
cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "tenure": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "employment_status": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_excluded_variables=("taxable_interest_income",), - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - 
scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert integration["integrated_variables"] == [] - assert "taxable_interest_income" not in integration["seed_data"].columns - - def test_default_build_config_excludes_filing_status_code_from_donor_imputation( - self, - ): - config = USMicroplexBuildConfig() - - assert "filing_status_code" in config.donor_imputer_excluded_variables - - def test_build_config_can_opt_back_into_filing_status_code_donor_imputation(self): - config = USMicroplexBuildConfig(donor_imputer_excluded_variables=()) - - assert "filing_status_code" not in config.donor_imputer_excluded_variables - - def test_integrate_donor_sources_drops_constant_donor_conditions(self, monkeypatch): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = kwargs - self.target_vars = tuple(target_vars) - self.condition_vars = tuple(condition_vars) - captured.append(self.condition_vars) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0] * len(result) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [0, 0, 0], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 61], - "sex": 
[1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 41_000.0], - "taxable_interest_income": [0.0, 25.0, 100.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "tenure": SourceVariableCapability( - authoritative=False, - usable_as_condition=True, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "employment_status": SourceVariableCapability( - 
authoritative=False, - usable_as_condition=False, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured - assert "state_fips" not in captured[0] - assert "tenure" in captured[0] - - def test_integrate_donor_sources_selects_top_correlated_condition_vars( - self, - monkeypatch, - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0, 20.0, 30.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [25, 45, 65], - "sex": [1, 2, 1], - "education": [2, 2, 2], - "employment_status": [1, 1, 1], - 
"income": [30_000.0, 40_000.0, 50_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [24, 44, 64], - "sex": [1, 1, 1], - "education": [2, 2, 2], - "employment_status": [1, 1, 1], - "income": [10_000.0, 80_000.0, 20_000.0], - "taxable_interest_income": [5.0, 15.0, 25.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - 
variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=True, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_max_condition_vars=1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured == [("age",)] - - def test_augment_donor_condition_frame_for_targets_derives_pe_style_puf_predictors( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - frame = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "1:3"], - "household_id": [1, 1, 1], - "tax_unit_id": ["1001", "1001", "1001"], - "person_number": [1, 2, 3], - "spouse_person_number": [2, 1, 0], - "family_relationship": [1, 2, 3], - "age": [45, 43, 12], - "sex": [1, 2, 2], - } - ) - - result = pipeline._augment_donor_condition_frame_for_targets( - frame, - ("taxable_interest_income",), - ) - - assert result["is_male"].tolist() == [1.0, 0.0, 0.0] - assert result["tax_unit_is_joint"].tolist() == [1.0, 1.0, 1.0] - assert result["tax_unit_count_dependents"].tolist() == [1.0, 1.0, 1.0] - assert result["is_tax_unit_head"].tolist() == [1.0, 0.0, 
0.0] - assert result["is_tax_unit_spouse"].tolist() == [0.0, 1.0, 0.0] - assert result["is_tax_unit_dependent"].tolist() == [0.0, 0.0, 1.0] - - def test_resolve_preferred_donor_condition_vars_uses_available_block_predictors( - self, - ): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - donor_frame = pd.DataFrame( - { - "age": [30, 45, 70], - "is_male": [1.0, 0.0, 1.0], - "income": [20_000.0, 80_000.0, 250_000.0], - "tax_unit_is_joint": [0.0, 1.0, 1.0], - } - ) - current_frame = pd.DataFrame( - { - "age": [28, 50, 72], - "is_male": [0.0, 1.0, 1.0], - "income": [25_000.0, 90_000.0, 300_000.0], - "tax_unit_is_joint": [0.0, 0.0, 1.0], - } - ) - - assert pipeline._resolve_preferred_donor_condition_vars( - donor_frame=donor_frame, - current_frame=current_frame, - donor_block=("dividend_income", "qualified_dividend_share"), - ) == ["age", "is_male", "tax_unit_is_joint"] - - def test_resolve_challenger_shared_condition_vars_uses_source_native_puf_overlap( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - donor_imputer_condition_selection="pe_plus_puf_native_challenger" - ) - ) - donor_frame = pd.DataFrame( - { - "age": [30, 45, 70], - "self_employment_income": [0.0, 15_000.0, 0.0], - "rental_income": [2_000.0, 0.0, 5_000.0], - "social_security_retirement": [0.0, 0.0, 20_000.0], - "alimony_income": [0.0, 3_000.0, 0.0], - } - ) - current_frame = pd.DataFrame( - { - "age": [28, 50, 72], - "self_employment_income": [0.0, 12_000.0, 0.0], - "rental_income": [1_500.0, 0.0, 4_000.0], - "social_security_retirement": [0.0, 0.0, 18_000.0], - "alimony_income": [0.0, 2_500.0, 0.0], - } - ) - - assert pipeline._resolve_challenger_shared_condition_vars( - donor_frame=donor_frame, - current_frame=current_frame, - shared_vars=[ - "age", - "self_employment_income", - "rental_income", - "social_security_retirement", - "alimony_income", - ], - donor_block=("taxable_interest_income",), - donor_source_name="irs_soi_puf_2024", - ) == [ - 
"self_employment_income", - "rental_income", - "social_security_retirement", - ] - - def test_select_donor_condition_vars_keeps_all_shared_distinct_from_pe_presets( - self, - ): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - donor_imputer_condition_selection="all_shared", - donor_imputer_max_condition_vars=1, - ) - ) - donor_frame = pd.DataFrame( - { - "age": [30, 45, 70], - "is_male": [1.0, 0.0, 1.0], - "tax_unit_is_joint": [0.0, 1.0, 1.0], - "education": [1.0, 2.0, 3.0], - } - ) - current_frame = donor_frame.copy() - shared_vars = ["age", "is_male", "tax_unit_is_joint", "education"] - - assert ( - pipeline._select_donor_condition_vars( - donor_frame, - current_frame, - shared_vars, - ("taxable_interest_income",), - ) - == shared_vars - ) - - def test_integrate_donor_sources_uses_pe_style_puf_predictors_for_generic_irs_vars( - self, - monkeypatch, - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0, 20.0, 0.0, 25.0, 15.0, 0.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 90.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "1:3", "2:1", "3:1", "3:2"], - "household_id": [1, 1, 1, 2, 3, 3], - "age": [45, 43, 12, 61, 38, 10], - "sex": [1, 2, 2, 1, 2, 1], - "education": [2, 2, 1, 2, 2, 1], - "employment_status": [1, 1, 0, 1, 1, 0], - "income": [80_000.0, 50_000.0, 0.0, 70_000.0, 55_000.0, 0.0], - "tax_unit_id": ["1001", "1001", "1001", "2001", "3001", "3001"], - "person_number": [1, 2, 3, 1, 1, 2], - 
"spouse_person_number": [2, 1, 0, 0, 0, 0], - "family_relationship": [1, 2, 3, 1, 1, 3], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 110.0, 95.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": ["101:1", "101:2", "101:3", "102:1", "103:1", "103:2"], - "household_id": [101, 101, 101, 102, 103, 103], - "age": [46, 42, 11, 60, 39, 9], - "sex": [1, 2, 2, 1, 2, 1], - "education": [2, 2, 1, 2, 2, 1], - "employment_status": [1, 1, 0, 1, 1, 0], - "income": [70_000.0, 45_000.0, 0.0, 68_000.0, 52_000.0, 0.0], - "tax_unit_id": ["2101", "2101", "2101", "2201", "2301", "2301"], - "person_number": [1, 2, 3, 1, 1, 2], - "spouse_person_number": [2, 1, 0, 0, 0, 0], - "is_head": [1, 0, 0, 1, 1, 0], - "is_spouse": [0, 1, 0, 0, 0, 0], - "is_dependent": [0, 0, 1, 0, 0, 1], - "taxable_interest_income": [5.0, 10.0, 0.0, 12.0, 8.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "tax_unit_id", - "person_number", - "spouse_person_number", - "family_relationship", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - 
name="irs_soi_puf_2024", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "tax_unit_id", - "person_number", - "spouse_person_number", - "is_head", - "is_spouse", - "is_dependent", - "taxable_interest_income", - ), - ), - ), - variable_capabilities={ - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=True, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_condition_selection="pe_prespecified", - donor_imputer_max_condition_vars=1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured == [ - ( - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ) - ] - assert integration["conditioning_diagnostics"] == [ - { - 
"donor_source": "irs_soi_puf_2024", - "model_variables": ["taxable_interest_income"], - "restored_variables": ["taxable_interest_income"], - "condition_selection": "pe_prespecified", - "used_condition_surface": False, - "raw_shared_vars": [ - "age", - "education", - "employment_status", - "income", - "person_number", - "sex", - "spouse_person_number", - "state_fips", - "tenure", - ], - "shared_vars_after_model_exclusion": [ - "age", - "education", - "employment_status", - "income", - "person_number", - "sex", - "spouse_person_number", - "state_fips", - "tenure", - ], - "projection_applied": False, - "entity_compatible_shared_vars": [], - "shared_vars_for_block": [ - "age", - "education", - "employment_status", - "income", - "person_number", - "sex", - "spouse_person_number", - "state_fips", - "tenure", - ], - "selected_condition_vars": [ - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ], - "requested_supplemental_shared_condition_vars": [], - "requested_challenger_shared_condition_vars": [], - "raw_supplemental_shared_condition_var_status": [], - "raw_challenger_shared_condition_var_status": [], - "supplemental_shared_condition_var_status": [], - "challenger_shared_condition_var_status": [], - "dropped_shared_vars": [ - "education", - "employment_status", - "income", - "person_number", - "sex", - "spouse_person_number", - "state_fips", - "tenure", - ], - } - ] - - def test_integrate_donor_sources_pe_plus_puf_native_challenger_widens_pe_surface( - self, - monkeypatch, - caplog, - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0, 20.0, 0.0, 
25.0, 15.0, 0.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 90.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "1:3", "2:1", "3:1", "3:2"], - "household_id": [1, 1, 1, 2, 3, 3], - "age": [45, 43, 12, 61, 38, 10], - "sex": [1, 2, 2, 1, 2, 1], - "education": [2, 2, 1, 2, 2, 1], - "employment_status": [1, 1, 0, 1, 1, 0], - "income": [80_000.0, 50_000.0, 0.0, 70_000.0, 55_000.0, 0.0], - "self_employment_income": [0.0, 2_000.0, 0.0, 0.0, 4_000.0, 0.0], - "rental_income": [500.0, 0.0, 0.0, 0.0, 1_500.0, 0.0], - "social_security_retirement": [0.0, 0.0, 0.0, 20_000.0, 0.0, 0.0], - "tax_unit_id": ["1001", "1001", "1001", "2001", "3001", "3001"], - "person_number": [1, 2, 3, 1, 1, 2], - "spouse_person_number": [2, 1, 0, 0, 0, 0], - "family_relationship": [1, 2, 3, 1, 1, 3], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 110.0, 95.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": ["101:1", "101:2", "101:3", "102:1", "103:1", "103:2"], - "household_id": [101, 101, 101, 102, 103, 103], - "age": [46, 42, 11, 60, 39, 9], - "sex": [1, 2, 2, 1, 2, 1], - "education": [2, 2, 1, 2, 2, 1], - "employment_status": [1, 1, 0, 1, 1, 0], - "income": [70_000.0, 45_000.0, 0.0, 68_000.0, 52_000.0, 0.0], - "self_employment_income": [0.0, 1_000.0, 0.0, 0.0, 3_500.0, 0.0], - "rental_income": [400.0, 0.0, 0.0, 0.0, 1_200.0, 0.0], - "social_security_retirement": [0.0, 0.0, 0.0, 18_000.0, 0.0, 0.0], - "tax_unit_id": ["2101", "2101", "2101", "2201", "2301", "2301"], - "person_number": [1, 2, 3, 1, 1, 2], - "spouse_person_number": [2, 1, 0, 0, 0, 0], - "is_head": [1, 0, 0, 1, 1, 0], - "is_spouse": [0, 1, 0, 0, 0, 0], - "is_dependent": [0, 0, 1, 0, 0, 1], 
- "taxable_interest_income": [5.0, 10.0, 0.0, 12.0, 8.0, 0.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_2024", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - weight_column="hh_weight", - key_column="household_id", - variable_names=("state_fips", "tenure"), - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "age", - "sex", - "education", - "employment_status", - "income", - "self_employment_income", - "rental_income", - "social_security_retirement", - "tax_unit_id", - "person_number", - "spouse_person_number", - "family_relationship", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="irs_soi_puf_2024", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - weight_column="hh_weight", - key_column="household_id", - variable_names=("state_fips", "tenure"), - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "age", - "sex", - "education", - "employment_status", - "income", - "self_employment_income", - "rental_income", - "social_security_retirement", - "tax_unit_id", - "person_number", - "spouse_person_number", - "is_head", - "is_spouse", - "is_dependent", - "taxable_interest_income", - ), - ), - ), - variable_capabilities={ - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "employment_status": 
SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_condition_selection="pe_plus_puf_native_challenger", - donor_imputer_max_condition_vars=1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - caplog.set_level(logging.INFO, logger="microplex_us.pipelines.us") - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured == [ - ( - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - "self_employment_income", - "rental_income", - "social_security_retirement", - ) - ] - diagnostics = integration["conditioning_diagnostics"][0] - assert diagnostics["condition_selection"] == "pe_plus_puf_native_challenger" - assert diagnostics["used_condition_surface"] is False - assert diagnostics["requested_challenger_shared_condition_vars"] == [ - "self_employment_income", - "rental_income", - "social_security_retirement", - ] - assert diagnostics["selected_condition_vars"] == list(captured[0]) - assert diagnostics["raw_challenger_shared_condition_var_status"] == [ - { - "variable": "self_employment_income", - "selected": True, - 
"in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "rental_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "social_security_retirement", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - ] - assert diagnostics["challenger_shared_condition_var_status"] == [ - { - "variable": "self_employment_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "rental_income", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - { - "variable": "social_security_retirement", - "selected": True, - "in_shared_overlap": True, - "reason": "selected", - }, - ] - log_messages = [record.getMessage() for record in caplog.records] - assert any( - "US microplex donor integration: source ready" in message - and "donor_source=irs_soi_puf_2024" in message - and "blocks=1" in message - for message in log_messages - ) - assert any( - "US microplex donor integration: block run" in message - and "block=taxable_interest_income" in message - and "condition_vars=10" in message - for message in log_messages - ) - assert any( - "US microplex donor integration: block complete" in message - and "integrated_vars=1" in message - for message in log_messages - ) - - def test_integrate_donor_sources_uses_pe_prespecified_acs_predictors( - self, - monkeypatch, - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["rent"] = [1_200.0, 900.0, 600.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - 
"state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "age": [45, 14, 67], - "sex": [1, 2, 2], - "is_head": [1, 0, 1], - "employment_income": [60_000.0, 0.0, 10_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "income": [65_000.0, 0.0, 45_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 101, 102], - "age": [44, 12, 68], - "sex": [1, 2, 2], - "is_head": [1, 0, 1], - "employment_income": [58_000.0, 0.0, 12_000.0], - "self_employment_income": [4_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 22_000.0], - "taxable_pension_income": [0.0, 0.0, 16_000.0], - "income": [62_000.0, 0.0, 50_000.0], - "rent": [1_100.0, 0.0, 950.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "is_head", - "employment_income", - "self_employment_income", - "gross_social_security", - "taxable_pension_income", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - 
), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="acs_2022", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "is_head", - "employment_income", - "self_employment_income", - "gross_social_security", - "taxable_pension_income", - "income", - "rent", - ), - ), - ), - variable_capabilities={ - "rent": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_condition_selection="pe_prespecified", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured == [ - ( - "is_household_head", - "age", - "is_male", - "tenure_type", - "employment_income", - "self_employment_income", - "social_security", - "pension_income", - "household_size", - "state_fips", - ) - ] - - def test_integrate_donor_sources_pe_prespecified_falls_back_for_unmapped_sources( - self, - monkeypatch, - ): - captured: list[tuple[str, ...]] = [] - - class FakeSynthesizer: 
- def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured.append(tuple(condition_vars)) - - def fit(self, *args, **kwargs): - _ = args, kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [10.0, 20.0, 30.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [25, 45, 65], - "sex": [1, 2, 1], - "education": [2, 2, 2], - "employment_status": [1, 1, 1], - "income": [30_000.0, 40_000.0, 50_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [24, 44, 64], - "sex": [1, 1, 1], - "education": [2, 2, 2], - "employment_status": [1, 1, 1], - "income": [10_000.0, 80_000.0, 20_000.0], - "taxable_interest_income": [5.0, 15.0, 25.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - 
parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.RESTRICTED, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=True, - ), - "taxable_interest_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_condition_selection="pe_prespecified", - donor_imputer_max_condition_vars=1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured == 
[("age",)] - - def test_integrate_donor_sources_keeps_person_native_irs_blocks_on_person_rows_when_ids_present( - self, - monkeypatch, - ): - captured_conditions: list[tuple[str, ...]] = [] - captured_fit_rows: list[int] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured_conditions.append(tuple(condition_vars)) - - def fit(self, frame, *args, **kwargs): - _ = args, kwargs - captured_fit_rows.append(len(frame)) - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = np.zeros(len(result), dtype=float) - result.loc[result.index[-1], "taxable_interest_income"] = 100.0 - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "2:1"], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 100, 200], - "age": [45, 43, 19], - "sex": [1, 2, 1], - "education": [3, 3, 2], - "employment_status": [1, 1, 1], - "income": [60_000.0, 15_000.0, 12_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": ["101:1", "101:2", "102:1"], - "household_id": [101, 101, 102], - "tax_unit_id": [900, 900, 901], - "age": [44, 42, 21], - "sex": [1, 2, 1], - "education": [3, 3, 2], - "employment_status": [1, 1, 1], - "income": [58_000.0, 14_000.0, 13_000.0], - "taxable_interest_income": [0.0, 0.0, 100.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - 
key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "tax_unit_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "tax_unit_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - 
donor_inputs=[donor_input], - ) - - assert len(captured_conditions) == 1 - assert {"age", "income", "state_fips", "tenure"}.issubset( - set(captured_conditions[0]) - ) - assert captured_fit_rows == [3] - assert integration["seed_data"]["taxable_interest_income"].tolist() == [ - 0.0, - 0.0, - 100.0, - ] - - def test_integrate_donor_sources_allows_person_conditions_for_labor_tax_unit_blocks( - self, - monkeypatch, - ): - captured_conditions: list[tuple[str, ...]] = [] - captured_fit_rows: list[int] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured_conditions.append(tuple(condition_vars)) - - def fit(self, frame, *args, **kwargs): - _ = args, kwargs - captured_fit_rows.append(len(frame)) - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["self_employment_income"] = np.linspace( - 0.0, - 90.0, - num=len(result), - dtype=float, - ) - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "2:1", "3:1"], - "household_id": [1, 1, 2, 3], - "tax_unit_id": [100, 100, 200, 300], - "age": [25, 23, 45, 65], - "sex": [1, 1, 1, 1], - "education": [2, 2, 2, 2], - "employment_status": [1, 1, 1, 1], - "income": [20_000.0, 5_000.0, 50_000.0, 90_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": ["101:1", "101:2", "102:1", "103:1"], - "household_id": [101, 101, 102, 103], - "tax_unit_id": [900, 900, 901, 902], - "age": [24, 22, 44, 64], - "sex": [1, 1, 1, 1], - "education": [2, 2, 2, 2], - "employment_status": [1, 1, 1, 
1], - "income": [18_000.0, 4_000.0, 52_000.0, 92_000.0], - "self_employment_income": [0.0, 0.0, 20.0, 100.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "tax_unit_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "tax_unit_id", - "age", - "sex", - "education", - "employment_status", - "income", - "self_employment_income", - ), - ), - ), - variable_capabilities={ - "state_fips": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "income": SourceVariableCapability( - authoritative=False, - usable_as_condition=False, - ), - "self_employment_income": SourceVariableCapability( - authoritative=True, - usable_as_condition=True, - ), - }, - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - 
EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - donor_imputer_max_condition_vars=1, - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert captured_conditions == [("age",)] - assert captured_fit_rows == [4] - - def test_project_frame_to_entity_uses_variable_projection_aggregation(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - frame = pd.DataFrame( - { - "tax_unit_id": [100, 100, 200], - "age": [25, 45, 65], - "income": [20_000.0, 5_000.0, 90_000.0], - "tenure": [1, 1, 2], - } - ) - - projected = pipeline._project_frame_to_entity( - frame, - entity=EntityType.TAX_UNIT, - variables={"age", "income", "tenure"}, - ) - - assert projected["tax_unit_id"].tolist() == [100, 200] - assert projected["age"].tolist() == [45, 65] - assert projected["income"].tolist() == [25_000.0, 90_000.0] - assert projected["tenure"].tolist() == [1, 2] - - def test_integrate_donor_sources_projects_spm_unit_native_blocks_when_ids_missing( - self, - monkeypatch, - ): - captured_conditions: list[tuple[str, ...]] = [] - captured_fit_rows: list[int] = [] - - class FakeSynthesizer: - def __init__(self, *, target_vars, condition_vars, **kwargs): - _ = target_vars, kwargs - captured_conditions.append(tuple(condition_vars)) - - def fit(self, frame, *args, **kwargs): - _ = args, 
kwargs - captured_fit_rows.append(len(frame)) - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["snap"] = [120.0, 0.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - cps_households = pd.DataFrame( - { - "household_id": [1, 2], - "hh_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2", "2:1"], - "household_id": [1, 1, 2], - "relationship_to_head": [0, 2, 0], - "age": [40, 10, 55], - "sex": [1, 2, 1], - "education": [3, 1, 4], - "employment_status": [1, 0, 1], - "income": [40_000.0, 0.0, 35_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102], - "hh_weight": [80.0, 90.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": ["101:1", "101:2", "102:1"], - "household_id": [101, 101, 102], - "relationship_to_head": [0, 2, 0], - "age": [42, 11, 57], - "sex": [1, 2, 1], - "education": [3, 1, 4], - "employment_status": [1, 0, 1], - "income": [38_000.0, 0.0, 34_000.0], - "snap": [120.0, 120.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "relationship_to_head", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - 
child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="spm_donor", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "relationship_to_head", - "age", - "sex", - "education", - "employment_status", - "income", - "snap", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - monkeypatch.setattr( - pipeline, - "build_policyengine_entity_tables", - lambda _population: pytest.fail( - "SPM-only donor projection should not build full entity tables" - ), - ) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert len(captured_conditions) == 1 - assert {"age", "income", "state_fips", "tenure"}.issubset( - set(captured_conditions[0]) - ) - assert captured_fit_rows == [2] - assert "spm_unit_id" in integration["seed_data"].columns - assert integration["seed_data"]["snap"].tolist() == [120.0, 120.0, 0.0] - - def 
test_strip_generated_entity_ids_drops_helper_ids_missing_from_scaffold(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_households = pd.DataFrame( - { - "household_id": [1], - "hh_weight": [100.0], - "state_fips": [6], - "tenure": [1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2"], - "household_id": [1, 1], - "age": [40, 10], - "sex": [1, 2], - "education": [3, 1], - "employment_status": [1, 0], - "income": [40_000.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - cps_input = pipeline.prepare_source_input(cps_frame) - frame = cps_persons.assign( - tax_unit_id=[100, 100], - family_id=[10, 10], - spm_unit_id=[20, 20], - marital_unit_id=[30, 31], - ) - - stripped = pipeline._strip_generated_entity_ids( - frame, - scaffold_input=cps_input, - ) - - assert "tax_unit_id" not in stripped.columns - assert "family_id" not in stripped.columns - assert "spm_unit_id" not in stripped.columns - assert "marital_unit_id" not in stripped.columns - assert stripped["person_id"].tolist() == ["1:1", "1:2"] - - def 
test_strip_generated_entity_ids_preserves_observed_scaffold_ids(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_households = pd.DataFrame( - { - "household_id": [1], - "hh_weight": [100.0], - "state_fips": [6], - "tenure": [1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2"], - "household_id": [1, 1], - "tax_unit_id": [100, 100], - "family_id": [10, 10], - "age": [40, 10], - "sex": [1, 2], - "education": [3, 1], - "employment_status": [1, 0], - "income": [40_000.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "tax_unit_id", - "family_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - cps_input = pipeline.prepare_source_input(cps_frame) - frame = cps_persons.assign( - spm_unit_id=[20, 20], - marital_unit_id=[30, 31], - ) - - stripped = pipeline._strip_generated_entity_ids( - frame, - scaffold_input=cps_input, - ) - - assert stripped["tax_unit_id"].tolist() == [100, 100] - assert stripped["family_id"].tolist() == [10, 10] - assert "spm_unit_id" not in stripped.columns - assert "marital_unit_id" not in stripped.columns - - def 
test_build_from_frames_drops_generated_entity_ids_before_stage5( - self, - monkeypatch, - ): - cps_households = pd.DataFrame( - { - "household_id": [1], - "hh_weight": [100.0], - "state_fips": [6], - "tenure": [1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": ["1:1", "1:2"], - "household_id": [1, 1], - "relationship_to_head": [0, 2], - "age": [40, 10], - "sex": [1, 2], - "education": [3, 1], - "employment_status": [1, 0], - "income": [40_000.0, 0.0], - } - ) - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "relationship_to_head", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="seed", - calibration_backend="entropy", - ) - ) - original_prepare_seed_data_from_source = pipeline.prepare_seed_data_from_source - captured_integrate_seed_columns: list[str] = [] - captured_seed_columns: list[str] = [] - - def fake_prepare_seed_data_from_source(source_input): - seed_data = original_prepare_seed_data_from_source(source_input) - return seed_data.assign( - tax_unit_id=[100, 100], - spm_unit_id=[200, 200], - marital_unit_id=[300, 301], - ) - - def fake_integrate(seed_data, *, scaffold_input, donor_inputs): - _ = 
scaffold_input, donor_inputs - captured_integrate_seed_columns[:] = seed_data.columns.tolist() - return { - "seed_data": seed_data, - "integrated_variables": [], - "conditioning_diagnostics": [ - { - "donor_source": "test_donor", - "model_variables": ["income"], - "selected_condition_vars": ["age"], - } - ], - } - - def fake_synthesize(seed_data, synthesis_variables=None): - _ = synthesis_variables - captured_seed_columns[:] = seed_data.columns.tolist() - synthetic = seed_data.copy() - synthetic["weight"] = synthetic["hh_weight"].astype(float) - return synthetic, None, {} - - def fake_calibrate(synthetic_data, targets): - _ = targets - return synthetic_data, {} - - monkeypatch.setattr( - pipeline, - "prepare_seed_data_from_source", - fake_prepare_seed_data_from_source, - ) - monkeypatch.setattr(pipeline, "_integrate_donor_sources", fake_integrate) - monkeypatch.setattr(pipeline, "synthesize", fake_synthesize) - monkeypatch.setattr(pipeline, "calibrate", fake_calibrate) - - result = pipeline.build_from_frames([cps_frame]) - - assert "tax_unit_id" not in captured_integrate_seed_columns - assert "spm_unit_id" not in captured_integrate_seed_columns - assert "marital_unit_id" not in captured_integrate_seed_columns - assert result.scaffold_seed_data is not None - assert "tax_unit_id" not in result.scaffold_seed_data.columns - assert "spm_unit_id" not in result.scaffold_seed_data.columns - assert "marital_unit_id" not in result.scaffold_seed_data.columns - assert "tax_unit_id" not in captured_seed_columns - assert "spm_unit_id" not in captured_seed_columns - assert "marital_unit_id" not in captured_seed_columns - assert "tax_unit_id" not in result.seed_data.columns - assert "spm_unit_id" not in result.seed_data.columns - assert "marital_unit_id" not in result.seed_data.columns - assert result.synthesis_metadata["donor_conditioning_diagnostics"] == [ - { - "donor_source": "test_donor", - "model_variables": ["income"], - "selected_condition_vars": ["age"], - } - ] - - def 
test_build_from_frames_rank_matches_generated_donor_values( - self, - monkeypatch, - ): - cps_households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "hh_weight": [100.0, 120.0, 140.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - cps_persons = pd.DataFrame( - { - "person_id": [10, 20, 30], - "household_id": [1, 2, 3], - "age": [45, 19, 62], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - donor_households = pd.DataFrame( - { - "household_id": [101, 102, 103], - "hh_weight": [80.0, 90.0, 110.0], - "state_fips": [6, 36, 12], - "tenure": [1, 2, 1], - } - ) - donor_persons = pd.DataFrame( - { - "person_id": [1001, 1002, 1003], - "household_id": [101, 102, 103], - "age": [44, 21, 61], - "sex": [1, 2, 1], - "education": [3, 2, 4], - "employment_status": [1, 0, 1], - "income": [58_000.0, 13_000.0, 41_000.0], - "taxable_interest_income": [0.0, 0.0, 100.0], - } - ) - - cps_frame = ObservationFrame( - source=SourceDescriptor( - name="cps_like", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: cps_households, - EntityType.PERSON: cps_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - donor_frame = ObservationFrame( - source=SourceDescriptor( - name="tax_donor", - shareability=Shareability.PUBLIC, - 
time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_id", - variable_names=("state_fips", "tenure"), - weight_column="hh_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_id", - variable_names=( - "household_id", - "age", - "sex", - "education", - "employment_status", - "income", - "taxable_interest_income", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: donor_households, - EntityType.PERSON: donor_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_id", - child_key="household_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - - class FakeSynthesizer: - def __init__(self, *args, **kwargs): - _ = args - _ = kwargs - - def fit(self, *args, **kwargs): - _ = args - _ = kwargs - - def generate(self, frame, seed=None): - _ = seed - result = frame.copy() - result["taxable_interest_income"] = [1e12, -1e12, 500.0] - return result - - monkeypatch.setattr("microplex_us.pipelines.us.Synthesizer", FakeSynthesizer) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - cps_input = pipeline.prepare_source_input(cps_frame) - donor_input = pipeline.prepare_source_input(donor_frame) - seed_data = pipeline.prepare_seed_data_from_source(cps_input) - - integration = pipeline._integrate_donor_sources( - seed_data, - scaffold_input=cps_input, - donor_inputs=[donor_input], - ) - - assert integration["seed_data"]["taxable_interest_income"].tolist() == [ - 100.0, - 0.0, - 0.0, - ] - - def test_rank_match_donor_values_preserves_zero_inflated_positive_support(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - scores = pd.Series([0.1, 0.2, 
0.9, 1.0], dtype=float) - donor_values = pd.Series([0.0, 0.0, 10.0, 20.0], dtype=float) - donor_weights = pd.Series([1.0, 1.0, 1.0, 1.0], dtype=float) - - matched = pipeline._rank_match_donor_values( - scores, - donor_values=donor_values, - donor_weights=donor_weights, - rng=np.random.default_rng(42), - ) - - assert matched.tolist() == [0.0, 0.0, 10.0, 20.0] - - def test_rank_match_donor_values_respects_weighted_positive_rate(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=5, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - scores = pd.Series([0.1, 0.2, 0.3, 0.9, 1.0], dtype=float) - donor_values = pd.Series([0.0, 0.0, 10.0], dtype=float) - donor_weights = pd.Series([4.0, 4.0, 2.0], dtype=float) - - matched = pipeline._rank_match_donor_values( - scores, - donor_values=donor_values, - donor_weights=donor_weights, - rng=np.random.default_rng(42), - ) - - assert (matched > 0).sum() == 1 - assert matched.iloc[-1] > 0.0 - assert matched.iloc[:-1].eq(0.0).all() - - def test_build_from_source_provider_defaults_missing_optional_variables(self): - households = pd.DataFrame( - { - "household_key": [1, 2], - "household_weight": [125.0, 175.0], - "region_code": [1, 2], - } - ) - persons = pd.DataFrame( - { - "person_key": [10, 11], - "household_key": [1, 2], - "age": [45, 19], - "income": [60_000.0, 12_000.0], - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="sparse_provider", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_key", - variable_names=("region_code",), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=("age", "income"), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - 
parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_key", - child_key="household_key", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - provider = StaticSourceProvider(frame) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_source_provider(provider) - - assert result.seed_data["tenure"].eq(0).all() - assert result.seed_data["employment_status"].eq(0).all() - assert set(result.seed_data["state"]) == {"UNK"} - assert result.seed_data["hh_weight"].sum() == pytest.approx(300.0) - - def test_build_from_source_provider_prefers_household_scoped_merge_columns(self): - households = pd.DataFrame( - { - "household_key": [1, 2], - "household_weight": [125.0, 175.0], - "state_fips": [6, 36], - "tenure": [1, 2], - } - ) - persons = pd.DataFrame( - { - "person_key": [10, 11], - "household_key": [1, 2], - "age": [45, 19], - "income": [60_000.0, 12_000.0], - "state_fips": [99, 99], - "tenure": [9, 9], - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="overlapping_columns", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_key", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=("age", "income", "state_fips", "tenure"), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_key", - child_key="household_key", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - provider = StaticSourceProvider(frame) - pipeline = 
USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - result = pipeline.build_from_source_provider(provider) - - assert result.seed_data["state_fips"].tolist() == [6, 36] - assert result.seed_data["tenure"].tolist() == [1, 2] - - def test_synthesizer_uses_observed_source_coverage(self): - households = pd.DataFrame( - { - "household_key": [1, 2, 3], - "household_weight": [100.0, 120.0, 140.0], - "region_code": [1, 2, 3], - } - ) - persons = pd.DataFrame( - { - "person_key": [10, 11, 12], - "household_key": [1, 2, 3], - "age": [45, 19, 62], - "income": [60_000.0, 12_000.0, 40_000.0], - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="sparse_provider", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_key", - variable_names=("region_code",), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=("age", "income"), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_key", - child_key="household_key", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - provider = StaticSourceProvider(frame) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=3, - synthesis_backend="synthesizer", - calibration_backend="entropy", - synthesizer_epochs=2, - synthesizer_n_layers=2, - synthesizer_hidden_dim=8, - random_seed=5, - ) - ) - - result = pipeline.build_from_source_provider(provider) - - assert result.synthesis_metadata["condition_vars"] == ["age"] - assert result.synthesis_metadata["target_vars"] == ["income"] - assert result.synthesizer is not None - - 
def test_synthesizer_handles_state_program_proxy_condition_vars(self): - households = pd.DataFrame( - { - "household_key": [1, 2, 3, 4], - "household_weight": [100.0, 120.0, 140.0, 160.0], - "state_fips": [6, 6, 36, 36], - "tenure": [1, 2, 1, 2], - } - ) - persons = pd.DataFrame( - { - "person_key": [10, 11, 12, 13], - "household_key": [1, 2, 3, 4], - "age": [45, 19, 62, 35], - "sex": [1, 2, 1, 2], - "education": [4, 2, 3, 1], - "employment_status": [1, 0, 1, 1], - "income": [60_000.0, 12_000.0, 40_000.0, 22_000.0], - "has_medicaid": [1.0, 0.0, 0.0, 1.0], - "public_assistance": [0.0, 150.0, 0.0, 0.0], - "ssi": [0.0, 0.0, 0.0, 0.0], - "social_security": [0.0, 0.0, 900.0, 0.0], - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name="state_program_proxy_provider", - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="household_key", - variable_names=("state_fips", "tenure"), - weight_column="household_weight", - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=( - "age", - "sex", - "education", - "employment_status", - "income", - "has_medicaid", - "public_assistance", - "ssi", - "social_security", - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="household_key", - child_key="household_key", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - provider = StaticSourceProvider(frame) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="synthesizer", - calibration_backend="entropy", - synthesizer_epochs=2, - synthesizer_n_layers=2, - synthesizer_hidden_dim=8, - random_seed=7, - ) - ) - - result = pipeline.build_from_source_provider(provider) - - assert 
result.synthesizer is not None - assert result.synthesis_metadata["condition_vars"] == [ - "age", - "sex", - "education", - "employment_status", - "state_fips", - "tenure", - "has_medicaid", - ] - assert len(result.synthetic_data) == 4 - - def test_constant_has_medicaid_is_not_auto_promoted_to_condition_var(self): - frame = pd.DataFrame( - { - "age": [25, 40, 55, 32], - "sex": [1, 2, 1, 2], - "education": [2, 3, 4, 1], - "employment_status": [1, 1, 0, 1], - "state_fips": [6, 6, 36, 36], - "tenure": [1, 2, 1, 2], - "income": [50_000.0, 30_000.0, 20_000.0, 80_000.0], - "has_medicaid": [0.0, 0.0, 0.0, 0.0], - "weight": [1.0, 1.0, 1.0, 1.0], - } - ) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=4, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - - condition_vars = pipeline._resolve_synthesis_condition_vars( - frame.columns, - observed_frame=frame, - ) - - assert "has_medicaid" not in condition_vars - - def test_ensure_target_support_handles_bool_destination_columns(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - ) - synthetic_data = pd.DataFrame( - { - "person_id": [0, 1], - "household_id": [0, 1], - "state_fips": [6, 36], - "tenure": [1, 2], - "age": [40, 50], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 1], - "income": [40_000.0, 60_000.0], - "has_medicaid": pd.Series([False, False], dtype=bool), - "weight": [1.0, 1.0], - } - ) - seed_data = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [10, 20], - "state_fips": [6, 36], - "tenure": [1, 2], - "age": [41, 51], - "sex": [1, 2], - "education": [3, 4], - "employment_status": [1, 1], - "income": [42_000.0, 61_000.0], - "has_medicaid": [1.0, 0.0], - "weight": [1.0, 1.0], - } - ) - targets = USMicroplexTargets( - marginal={"has_medicaid": ["1.0"]}, - continuous={}, - ) - - result = pipeline.ensure_target_support(synthetic_data, 
seed_data, targets) - - assert pd.to_numeric(result["has_medicaid"], errors="coerce").max() == 1.0 - - def test_build_from_missing_directory_raises(self, tmp_path): - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - - with pytest.raises(FileNotFoundError, match="CPS ASEC data files not found"): - pipeline.build_from_data_dir(tmp_path) - - -class TestUSMicroplexBuildResult: - """Test build result helpers.""" - - @pytest.fixture - def result(self): - config = USMicroplexBuildConfig( - n_synthetic=3, - synthesis_backend="bootstrap", - calibration_backend="entropy", - ) - seed = pd.DataFrame({"income": [1.0], "hh_weight": [1.0]}) - synthetic = pd.DataFrame({"income": [1.0, 2.0, 3.0], "weight": [1.0, 1.0, 1.0]}) - calibrated = synthetic.copy() - calibrated["weight"] = [0.0, 2.0, 3.0] - - return USMicroplexBuildResult( - config=config, - seed_data=seed, - synthetic_data=synthetic, - calibrated_data=calibrated, - targets=USMicroplexTargets(marginal={}, continuous={"income": 6.0}), - calibration_summary={"max_error": 0.0, "mean_error": 0.0}, - synthesis_metadata={"backend": "bootstrap"}, - synthesizer=None, - policyengine_tables=None, - ) - - def test_nonzero_weight_count(self, result): - assert result.n_nonzero_weights == 2 - - def test_total_weighted_population(self, result): - assert result.total_weighted_population == 5.0 diff --git a/tests/pipelines/test_us_aotc_eligibility_inputs.py b/tests/pipelines/test_us_aotc_eligibility_inputs.py deleted file mode 100644 index 9b7d34c0..00000000 --- a/tests/pipelines/test_us_aotc_eligibility_inputs.py +++ /dev/null @@ -1,396 +0,0 @@ -"""Tests for the AOTC eligibility-input construction in the US pipeline. - -Exercises ``USMicroplexPipeline._construct_aotc_eligibility_inputs`` (and its -call site inside ``build_policyengine_entity_tables``), which mirrors the -enhanced-CPS baseline ``ExtendedCPS._impute_aotc_eligibility_inputs`` -(``PolicyEngine/policyengine-us-data``, unmerged branch -``codex/fix-aotc-eligibility``). 
-""" - -import pandas as pd -import pytest - -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline -from microplex_us.policyengine.us import ( - POLICYENGINE_US_EXPORT_DEFAULTS, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - build_policyengine_us_export_variable_maps, - build_policyengine_us_time_period_arrays, -) - -AOTC_TRUE_FLAG_COLUMNS = ( - "is_pursuing_credential_for_american_opportunity_credit", - "attends_eligible_educational_institution_for_american_opportunity_credit", - "is_enrolled_at_least_half_time_for_american_opportunity_credit", - "has_american_opportunity_credit_1098_t_or_exception", - "has_american_opportunity_credit_institution_ein", -) -AOTC_FALSE_FLAG_COLUMNS = ( - "has_completed_first_four_years_of_postsecondary_education", - "has_felony_drug_conviction", -) -AOTC_PRIOR_YEARS_COLUMN = "american_opportunity_credit_claimed_prior_years" -ALL_AOTC_COLUMNS = ( - AOTC_TRUE_FLAG_COLUMNS + AOTC_FALSE_FLAG_COLUMNS + (AOTC_PRIOR_YEARS_COLUMN,) -) - - -def _pipeline(year: int = 2024) -> USMicroplexPipeline: - return USMicroplexPipeline(USMicroplexBuildConfig(policyengine_dataset_year=year)) - - -def test_all_eight_aotc_columns_are_safe_export_variables(): - for column in ALL_AOTC_COLUMNS: - assert column in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - - -def test_all_eight_aotc_columns_have_false_or_zero_defaults(): - for column in AOTC_TRUE_FLAG_COLUMNS + AOTC_FALSE_FLAG_COLUMNS: - assert POLICYENGINE_US_EXPORT_DEFAULTS[column] is False - assert POLICYENGINE_US_EXPORT_DEFAULTS[AOTC_PRIOR_YEARS_COLUMN] == 0 - - -def test_fallback_marks_tuition_holders_when_no_credit_signal(): - """No credit column -> eCPS fallback aotc_student = tuition > 0. - - This path needs no PolicyEngine-US parameters (no back-solve runs). 
- """ - pipeline = _pipeline() - persons = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "tax_unit_id": [100, 100, 200], - "age": [45, 19, 50], - "income": [60_000.0, 0.0, 40_000.0], - "qualified_tuition_expenses": [0.0, 3_500.0, 0.0], - "relationship_to_head": [0, 2, 0], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - by_id = result.set_index("person_id") - - # Student (person 2, positive tuition) gets the five factual flags. - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(by_id.loc[2, column]) is True - for column in AOTC_FALSE_FLAG_COLUMNS: - assert bool(by_id.loc[2, column]) is False - assert int(by_id.loc[2, AOTC_PRIOR_YEARS_COLUMN]) == 0 - - # Non-students (persons 1, 3) keep defaults. - for person_id in (1, 3): - for column in AOTC_TRUE_FLAG_COLUMNS + AOTC_FALSE_FLAG_COLUMNS: - assert bool(by_id.loc[person_id, column]) is False - assert int(by_id.loc[person_id, AOTC_PRIOR_YEARS_COLUMN]) == 0 - - -def test_no_signal_at_all_leaves_frame_unchanged(): - """Neither a credit nor a tuition column -> nothing to construct.""" - pipeline = _pipeline() - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [40, 38], - "income": [50_000.0, 45_000.0], - "relationship_to_head": [0, 1], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - - # The construction returns early; no AOTC columns are added here. The - # export layer supplies the contract-required columns from defaults. 
- for column in ALL_AOTC_COLUMNS: - assert column not in result.columns - - -def test_fallback_clamps_existing_prior_years_to_three(): - pipeline = _pipeline() - persons = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "tax_unit_id": [100], - "age": [20], - "income": [0.0], - "qualified_tuition_expenses": [2_000.0], - AOTC_PRIOR_YEARS_COLUMN: [7], - "relationship_to_head": [0], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - assert int(result.set_index("person_id").loc[1, AOTC_PRIOR_YEARS_COLUMN]) == 3 - - -def test_credit_signal_with_zero_positive_credit_marks_nobody(): - """Credit column present but no positive value -> eCPS early return.""" - pipeline = _pipeline() - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [45, 19], - "income": [60_000.0, 0.0], - "qualified_tuition_expenses": [0.0, 3_000.0], - "american_opportunity_credit": [0.0, 0.0], - "is_full_time_college_student": [False, True], - "relationship_to_head": [0, 2], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - # When a credit signal exists but is all-zero, the credit-driven path - # returns before writing inputs (it does NOT fall back to tuition>0). - for column in ALL_AOTC_COLUMNS: - assert column not in result.columns - - -class TestCreditDrivenConstruction: - """Credit-driven back-solve; needs PolicyEngine-US parameters.""" - - @pytest.fixture(autouse=True) - def _require_policyengine_us(self): - pytest.importorskip("policyengine_us") - - def test_dependent_student_selected_and_tuition_backsolved(self): - pipeline = _pipeline(2024) - # Parent filer + full-time college dependent; $2,500 tax-unit credit - # broadcast across members (PUF tax-unit column on the person frame). 
- persons = pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 10], - "tax_unit_id": [100, 100, 100], - "age": [50, 19, 16], - "income": [80_000.0, 0.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0, 1.0], - "is_full_time_college_student": [False, True, False], - "qualified_tuition_expenses": [0.0, 4_000.0, 0.0], - "american_opportunity_credit": [2_500.0, 2_500.0, 2_500.0], - "relationship_to_head": [0, 2, 2], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - by_id = result.set_index("person_id") - - # The college dependent is the selected student. - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(by_id.loc[2, column]) is True - for column in AOTC_FALSE_FLAG_COLUMNS: - assert bool(by_id.loc[2, column]) is False - assert int(by_id.loc[2, AOTC_PRIOR_YEARS_COLUMN]) in range(0, 4) - - # Person 2 already reports $4,000 tuition; eCPS flags the member and - # preserves the reported tuition (no rewrite). - assert by_id.loc[2, "qualified_tuition_expenses"] == pytest.approx(4_000.0) - - # Parent and minor are not students. - for person_id in (1, 3): - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(by_id.loc[person_id, column]) is False - - def test_existing_positive_tuition_is_preserved(self): - pipeline = _pipeline(2024) - # Single filer-student who already reports positive tuition. eCPS flags - # the member but leaves the reported tuition untouched -- no back-solve, - # no overwrite -- even when the credit would imply a smaller base. 
- persons = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "tax_unit_id": [100], - "age": [28], - "income": [30_000.0], - "is_tax_unit_dependent": [0.0], - "is_full_time_college_student": [True], - "qualified_tuition_expenses": [2_000.0], - "american_opportunity_credit": [1_250.0], - "relationship_to_head": [0], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - row = result.set_index("person_id").loc[1] - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(row[column]) is True - # Reported tuition is preserved, not overwritten to the $1,250 the - # credit would otherwise back-solve to. - assert row["qualified_tuition_expenses"] == pytest.approx(2_000.0) - - def test_all_tuition_positive_members_are_flagged(self): - pipeline = _pipeline(2024) - # Two members both reporting positive tuition in one credit-positive - # tax unit. eCPS flags BOTH (it does not stop after a single student) - # and leaves both reported tuition values untouched. - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [20, 22], - "income": [0.0, 0.0], - "is_tax_unit_dependent": [1.0, 1.0], - "is_full_time_college_student": [True, True], - "qualified_tuition_expenses": [3_000.0, 3_000.0], - "american_opportunity_credit": [2_500.0, 2_500.0], - "relationship_to_head": [2, 2], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - by_id = result.set_index("person_id") - for person_id in (1, 2): - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(by_id.loc[person_id, column]) is True - assert by_id.loc[ - person_id, "qualified_tuition_expenses" - ] == pytest.approx(3_000.0) - - def test_no_tuition_partial_credit_backsolves_to_smaller_expenses(self): - pipeline = _pipeline(2024) - # No member reports tuition; a $1,250 credit back-solves to $1,250 of - # qualified expenses (inside the 100% first bracket) on the selected - # full-time student. 
- persons = pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "tax_unit_id": [100], - "age": [28], - "income": [30_000.0], - "is_tax_unit_dependent": [0.0], - "is_full_time_college_student": [True], - "qualified_tuition_expenses": [0.0], - "american_opportunity_credit": [1_250.0], - "relationship_to_head": [0], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - row = result.set_index("person_id").loc[1] - for column in AOTC_TRUE_FLAG_COLUMNS: - assert bool(row[column]) is True - assert row["qualified_tuition_expenses"] == pytest.approx(1_250.0) - - def test_full_time_student_selected_when_no_member_has_tuition(self): - pipeline = _pipeline(2024) - # Credit present, nobody has positive tuition: selection falls to the - # full-time college student (second priority group in eCPS). - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [50, 20], - "income": [70_000.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "is_full_time_college_student": [False, True], - "qualified_tuition_expenses": [0.0, 0.0], - "american_opportunity_credit": [2_500.0, 2_500.0], - "relationship_to_head": [0, 2], - } - ) - - result = pipeline._construct_aotc_eligibility_inputs(persons) - by_id = result.set_index("person_id") - assert ( - bool(by_id.loc[2, "is_pursuing_credential_for_american_opportunity_credit"]) - is True - ) - assert ( - bool(by_id.loc[1, "is_pursuing_credential_for_american_opportunity_credit"]) - is False - ) - # The student's tuition is set to the credit-implied $4,000. 
- assert by_id.loc[2, "qualified_tuition_expenses"] == pytest.approx(4_000.0) - - def test_export_includes_all_eight_columns_with_real_values(self): - pipeline = _pipeline(2024) - tbs = pipeline._resolve_policyengine_tax_benefit_system() - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [50, 19], - "sex": [1, 2], - "income": [80_000.0, 0.0], - "is_tax_unit_dependent": [0.0, 1.0], - "is_full_time_college_student": [False, True], - "qualified_tuition_expenses": [0.0, 4_000.0], - "american_opportunity_credit": [2_500.0, 2_500.0], - "relationship_to_head": [0, 2], - } - ) - - tables = pipeline.build_policyengine_entity_tables(persons) - export_maps = build_policyengine_us_export_variable_maps( - tables, tax_benefit_system=tbs - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map=export_maps["household"], - person_variable_map=export_maps["person"], - tax_unit_variable_map=export_maps["tax_unit"], - spm_unit_variable_map=export_maps["spm_unit"], - family_variable_map=export_maps["family"], - ) - - for column in ALL_AOTC_COLUMNS: - assert column in arrays, column - - # The dependent student (second person row) has the True flags. - for column in AOTC_TRUE_FLAG_COLUMNS: - assert arrays[column]["2024"].tolist() == [False, True] - for column in AOTC_FALSE_FLAG_COLUMNS: - assert arrays[column]["2024"].tolist() == [False, False] - assert arrays[AOTC_PRIOR_YEARS_COLUMN]["2024"].tolist() == [0, 0] - - # american_opportunity_credit is a PUF calculated output and must not - # be exported (PolicyEngine-US recomputes it from these inputs). 
- assert "american_opportunity_credit" not in arrays - - -def test_no_signal_export_falls_back_to_defaults(): - """With no AOTC signal, the contract-required columns still export.""" - pytest.importorskip("policyengine_us") - pipeline = _pipeline(2024) - tbs = pipeline._resolve_policyengine_tax_benefit_system() - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "age": [40, 38], - "sex": [1, 2], - "income": [50_000.0, 45_000.0], - "is_tax_unit_dependent": [0.0, 0.0], - "relationship_to_head": [0, 1], - } - ) - - tables = pipeline.build_policyengine_entity_tables(persons) - export_maps = build_policyengine_us_export_variable_maps( - tables, tax_benefit_system=tbs - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map=export_maps["household"], - person_variable_map=export_maps["person"], - tax_unit_variable_map=export_maps["tax_unit"], - spm_unit_variable_map=export_maps["spm_unit"], - family_variable_map=export_maps["family"], - ) - - for column in AOTC_TRUE_FLAG_COLUMNS + AOTC_FALSE_FLAG_COLUMNS: - assert column in arrays - assert arrays[column]["2024"].tolist() == [False, False] - assert arrays[AOTC_PRIOR_YEARS_COLUMN]["2024"].tolist() == [0, 0] diff --git a/tests/pipelines/test_us_microunit_delegation.py b/tests/pipelines/test_us_microunit_delegation.py deleted file mode 100644 index db89544b..00000000 --- a/tests/pipelines/test_us_microunit_delegation.py +++ /dev/null @@ -1,549 +0,0 @@ -"""Tests for delegating PolicyEngine tax-unit reconstruction to ``microunit``. - -These exercise the reconstruction-from-scratch path added in issue #113, where -``USMicroplexPipeline._build_policyengine_tax_units_from_role_flags`` delegates to -:func:`microunit.construct_tax_units` when the person frame carries the raw CPS -columns ``microunit`` requires. The authoritative-ID path (#112) and the legacy -role-flag fallback are validated by ``test_us_entities.py``. 
- -``microunit`` ships in the ``policyengine`` optional dependency group, so these -tests skip when it is not installed. -""" - -from __future__ import annotations - -import pandas as pd -import pytest - -from microplex_us.pipelines.us import USMicroplexPipeline - -pytest.importorskip("microunit", reason="microunit is in the policyengine extra") - - -@pytest.fixture -def pipeline() -> USMicroplexPipeline: - return USMicroplexPipeline() - - -def _unit_field(tax_units: pd.DataFrame, unit_id: int, field: str): - """Read one tax unit's field robustly. - - ``tax_units`` has list-valued columns (``filer_ids`` etc.); ``DataFrame.iterrows`` - coerces a row to a single-dtype Series and mangles those, so we filter to the - single matching row and read the column's first (only) value instead. - """ - rows = tax_units.loc[tax_units["tax_unit_id"] == unit_id] - assert len(rows) == 1, f"expected exactly one unit {unit_id}, found {len(rows)}" - return rows[field].iloc[0] - - -def _cps_person_frame() -> pd.DataFrame: - """A two-household CPS-like frame microunit can consume directly. - - Household 10 is a married couple (lines 1 and 2) with one own child - (line 3, parent pointers to lines 1 and 2). Household 20 is a single adult. - The columns are the raw CPS ASEC names microunit expects; ``person_id`` and - ``household_id`` are microplex's own keys used to map results back. 
- """ - return pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 10, - "relationship_to_head": 0, - "age": 40, - "income": 50000.0, - "PH_SEQ": 10, - "A_LINENO": 1, - "A_AGE": 40, - "A_MARITL": 1, - "A_SPOUSE": 2, - "PEPAR1": 0, - "PEPAR2": 0, - "A_EXPRRP": 1, - "WSAL_VAL": 50000, - }, - { - "person_id": 2, - "household_id": 10, - "relationship_to_head": 1, - "age": 38, - "income": 30000.0, - "PH_SEQ": 10, - "A_LINENO": 2, - "A_AGE": 38, - "A_MARITL": 1, - "A_SPOUSE": 1, - "PEPAR1": 0, - "PEPAR2": 0, - "A_EXPRRP": 3, - "WSAL_VAL": 30000, - }, - { - "person_id": 3, - "household_id": 10, - "relationship_to_head": 2, - "age": 10, - "income": 0.0, - "PH_SEQ": 10, - "A_LINENO": 3, - "A_AGE": 10, - "A_MARITL": 7, - "A_SPOUSE": 0, - "PEPAR1": 1, - "PEPAR2": 2, - "A_EXPRRP": 5, - "WSAL_VAL": 0, - }, - { - "person_id": 4, - "household_id": 20, - "relationship_to_head": 0, - "age": 25, - "income": 45000.0, - "PH_SEQ": 20, - "A_LINENO": 1, - "A_AGE": 25, - "A_MARITL": 7, - "A_SPOUSE": 0, - "PEPAR1": 0, - "PEPAR2": 0, - "A_EXPRRP": 1, - "WSAL_VAL": 45000, - }, - ] - ) - - -def test_microunit_path_groups_married_couple_with_child(pipeline): - persons = _cps_person_frame() - - result = pipeline._build_policyengine_tax_units_via_microunit(persons) - - assert result is not None, "microunit path must activate when CPS columns exist" - tax_units, person_rows, households = result - - # Two units: the married-with-child family and the single adult. - assert households == {10, 20} - assert person_rows["tax_unit_id"].nunique() == 2 - assert len(tax_units) == 2 - - # The three household-10 members share one unit; the single adult is alone. 
- unit_by_person = dict( - zip(person_rows["person_id"], person_rows["tax_unit_id"], strict=True) - ) - assert unit_by_person[1] == unit_by_person[2] == unit_by_person[3] - assert unit_by_person[4] != unit_by_person[1] - - family_id = int(unit_by_person[1]) - single_id = int(unit_by_person[4]) - - assert _unit_field(tax_units, family_id, "filing_status") == "JOINT" - assert int(_unit_field(tax_units, family_id, "n_dependents")) == 1 - assert sorted(_unit_field(tax_units, family_id, "filer_ids")) == [1, 2] - assert list(_unit_field(tax_units, family_id, "dependent_ids")) == [3] - - assert _unit_field(tax_units, single_id, "filing_status") == "SINGLE" - assert int(_unit_field(tax_units, single_id, "n_dependents")) == 0 - assert list(_unit_field(tax_units, single_id, "filer_ids")) == [4] - assert list(_unit_field(tax_units, single_id, "dependent_ids")) == [] - - # The temporary role column must not leak into the returned person frame. - assert "_microunit_role" not in person_rows.columns - - -def test_microunit_path_honors_start_tax_unit_id(pipeline): - persons = _cps_person_frame() - - result = pipeline._build_policyengine_tax_units_via_microunit( - persons, start_tax_unit_id=100 - ) - - assert result is not None - _, person_rows, _ = result - assert int(person_rows["tax_unit_id"].min()) >= 100 - - -def test_role_flags_entry_delegates_to_microunit_when_cps_columns_present(pipeline): - persons = _cps_person_frame() - - via_entry = pipeline._build_policyengine_tax_units_from_role_flags(persons) - via_direct = pipeline._build_policyengine_tax_units_via_microunit(persons) - - assert via_entry is not None - assert via_direct is not None - _, entry_persons, entry_households = via_entry - _, direct_persons, direct_households = via_direct - - # The entry method must route through microunit (identical assignments), - # not the legacy role-flag reconstruction, when CPS columns are present. 
- assert entry_households == direct_households - pd.testing.assert_series_equal( - entry_persons.set_index("person_id")["tax_unit_id"], - direct_persons.set_index("person_id")["tax_unit_id"], - ) - - -def test_role_flags_entry_falls_back_without_cps_columns(pipeline): - # No CPS columns: only microplex role flags. microunit must NOT be used; - # the legacy role-flag reconstruction handles it (behavior-preserving). - persons = pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 10, - "relationship_to_head": 0, - "age": 40, - "income": 50000.0, - "is_tax_unit_head": 1, - "is_tax_unit_spouse": 0, - "is_tax_unit_dependent": 0, - }, - { - "person_id": 2, - "household_id": 10, - "relationship_to_head": 1, - "age": 38, - "income": 30000.0, - "is_tax_unit_head": 0, - "is_tax_unit_spouse": 1, - "is_tax_unit_dependent": 0, - }, - ] - ) - - # The dedicated microunit adapter declines (missing CPS columns)... - assert pipeline._build_policyengine_tax_units_via_microunit(persons) is None - - # ...but the role-flag entry still succeeds via the legacy path. - result = pipeline._build_policyengine_tax_units_from_role_flags(persons) - assert result is not None - tax_units, person_rows, households = result - assert households == {10} - assert person_rows["tax_unit_id"].nunique() == 1 - assert tax_units["filing_status"].iloc[0] == "JOINT" - - -def test_microunit_path_preserves_alignment_under_unsorted_input(pipeline): - """microunit groups internally by PH_SEQ but returns per-person results in - input row order; the delegation maps TAX_ID/role back positionally, so the - mapping must stay correct even when input rows are not pre-sorted by - PH_SEQ/A_LINENO. - - Regression guard: if a future microunit change stopped preserving input row - order, the positional mapping would misassign people to units/roles and this - test would fail while the pre-sorted fixture test still passed. 
- """ - persons = _cps_person_frame() - # Input order != PH_SEQ order: single adult (HH 20) first, then the HH 10 - # members out of A_LINENO order (2, 1, 3). - shuffled = persons.iloc[[3, 1, 0, 2]].reset_index(drop=True) - - result = pipeline._build_policyengine_tax_units_via_microunit(shuffled) - assert result is not None - tax_units, person_rows, _ = result - - unit_by_person = dict( - zip(person_rows["person_id"], person_rows["tax_unit_id"], strict=True) - ) - # Couple + child still share one unit; the single adult stays separate. - assert unit_by_person[1] == unit_by_person[2] == unit_by_person[3] - assert unit_by_person[4] != unit_by_person[1] - - # Roles/filing must stay correctly aligned, not just the grouping. - family_id = int(unit_by_person[1]) - assert _unit_field(tax_units, family_id, "filing_status") == "JOINT" - assert sorted(_unit_field(tax_units, family_id, "filer_ids")) == [1, 2] - assert list(_unit_field(tax_units, family_id, "dependent_ids")) == [3] - - -def _normalized_person_frame() -> pd.DataFrame: - """A purely NORMALIZED microplex frame (no raw CPS columns). - - The same population as ``_cps_person_frame`` expressed only in microplex's - materialization columns: a married couple with one child (HH 10) and a - single adult (HH 20). Used to exercise the prototype adapter (#115) that - synthesizes microunit's CPS contract from these columns. 
- """ - return pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 10, - "relationship_to_head": 0, - "age": 40, - "income": 50000.0, - }, - { - "person_id": 2, - "household_id": 10, - "relationship_to_head": 1, - "age": 38, - "income": 30000.0, - }, - { - "person_id": 3, - "household_id": 10, - "relationship_to_head": 2, - "age": 10, - "income": 0.0, - }, - { - "person_id": 4, - "household_id": 20, - "relationship_to_head": 0, - "age": 25, - "income": 45000.0, - }, - ] - ) - - -def test_normalized_adapter_is_off_by_default(pipeline): - persons = _normalized_person_frame() - # No raw CPS columns and the adapter is not enabled -> declines (the default - # behavior is unchanged; the legacy role-flag path handles such frames). - assert pipeline._build_policyengine_tax_units_via_microunit(persons) is None - - -def test_normalized_adapter_activates_delegation_when_enabled(pipeline): - persons = _normalized_person_frame() - - result = pipeline._build_policyengine_tax_units_via_microunit( - persons, allow_normalized_adapter=True - ) - - assert result is not None, ( - "adapter must let the delegation fire from a normalized frame" - ) - tax_units, person_rows, households = result - assert households == {10, 20} - - unit_by_person = dict( - zip(person_rows["person_id"], person_rows["tax_unit_id"], strict=True) - ) - # Couple + child collapse into one unit; the single adult stays separate. - assert unit_by_person[1] == unit_by_person[2] == unit_by_person[3] - assert unit_by_person[4] != unit_by_person[1] - - family_id = int(unit_by_person[1]) - assert _unit_field(tax_units, family_id, "filing_status") == "JOINT" - assert int(_unit_field(tax_units, family_id, "n_dependents")) == 1 - - -def test_normalized_adapter_builds_microunit_cps_contract(pipeline): - persons = _normalized_person_frame() - frame = pipeline._microunit_cps_frame_from_normalized(persons) - assert frame is not None - # All eight microunit-required CPS columns are present. 
- for col in ( - "PH_SEQ", - "A_LINENO", - "A_AGE", - "A_MARITL", - "A_SPOUSE", - "PEPAR1", - "PEPAR2", - "A_EXPRRP", - ): - assert col in frame.columns, f"missing {col}" - by_pid = frame.set_index("person_id") - # Couple are married (A_MARITL == 1) and point at each other's line numbers. - assert int(by_pid.loc[1, "A_MARITL"]) == 1 - assert int(by_pid.loc[1, "A_SPOUSE"]) == int(by_pid.loc[2, "A_LINENO"]) - assert int(by_pid.loc[2, "A_SPOUSE"]) == int(by_pid.loc[1, "A_LINENO"]) - # The child's parent pointers reference the head and spouse line numbers. - assert int(by_pid.loc[3, "PEPAR1"]) == int(by_pid.loc[1, "A_LINENO"]) - assert int(by_pid.loc[3, "PEPAR2"]) == int(by_pid.loc[2, "A_LINENO"]) - # The single adult is never-married with no spouse/parent pointers. - assert int(by_pid.loc[4, "A_MARITL"]) == 7 - assert int(by_pid.loc[4, "A_SPOUSE"]) == 0 - - -def test_microunit_path_fails_safe_to_none(pipeline, monkeypatch): - # If microunit raises on a household it cannot resolve, the delegation must - # fall back (return None) rather than crash materialization. - import microunit - - def _boom(*args, **kwargs): - raise ValueError("synthetic microunit failure") - - monkeypatch.setattr(microunit, "construct_tax_units", _boom) - persons = _cps_person_frame() # raw CPS columns present -> reaches microunit - assert pipeline._build_policyengine_tax_units_via_microunit(persons) is None - - -def test_normalized_adapter_promotes_a_head_when_none_marked(pipeline): - # A household where nobody is marked head (all "other") must still get a - # single reference person so microunit can construct it. 
- persons = pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 30, - "relationship_to_head": 3, - "age": 70, - "income": 20000.0, - }, - { - "person_id": 2, - "household_id": 30, - "relationship_to_head": 3, - "age": 35, - "income": 15000.0, - }, - ] - ) - frame = pipeline._microunit_cps_frame_from_normalized(persons) - assert frame is not None - # Exactly one reference person (A_EXPRRP == 1) in the household. - assert int((frame["A_EXPRRP"] == 1).sum()) == 1 - # And the delegation must not raise (fail-safe covers microunit edge cases). - pipeline._build_policyengine_tax_units_via_microunit( - persons, allow_normalized_adapter=True - ) - - -def _cps_fields_frame() -> pd.DataFrame: - """A frame in microplex's real CPS-derived fields (no relationship_to_head): - person_number = within-household line, spouse_person_number = spouse line, - family_relationship = CPS A_FAMREL. Couple+child (HH 10), single (HH 20).""" - return pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 10, - "age": 40, - "income": 50000.0, - "person_number": 1, - "spouse_person_number": 2, - "family_relationship": 1, - }, - { - "person_id": 2, - "household_id": 10, - "age": 38, - "income": 30000.0, - "person_number": 2, - "spouse_person_number": 1, - "family_relationship": 2, - }, - { - "person_id": 3, - "household_id": 10, - "age": 10, - "income": 0.0, - "person_number": 3, - "spouse_person_number": 0, - "family_relationship": 3, - }, - { - "person_id": 4, - "household_id": 20, - "age": 25, - "income": 45000.0, - "person_number": 1, - "spouse_person_number": 0, - "family_relationship": 0, - }, - ] - ) - - -def test_high_fidelity_adapter_uses_real_pointers(pipeline): - persons = _cps_fields_frame() - # The normalized adapter must dispatch to the high-fidelity path when the real - # CPS-derived fields are present (no relationship_to_head needed). 
- frame = pipeline._microunit_cps_frame_from_normalized(persons) - assert frame is not None - by = frame.set_index("person_id") - # Real line/spouse pointers, not heuristics. - assert int(by.loc[1, "A_LINENO"]) == 1 and int(by.loc[2, "A_LINENO"]) == 2 - assert int(by.loc[1, "A_SPOUSE"]) == 2 and int(by.loc[2, "A_SPOUSE"]) == 1 - # A_EXPRRP from family relationship: ref=1, spouse=3, own child=5. - assert int(by.loc[1, "A_EXPRRP"]) == 1 - assert int(by.loc[2, "A_EXPRRP"]) == 3 - assert int(by.loc[3, "A_EXPRRP"]) == 5 - # Child's parent pointers reference the head (line 1) and spouse (line 2). - assert int(by.loc[3, "PEPAR1"]) == 1 and int(by.loc[3, "PEPAR2"]) == 2 - - -def test_high_fidelity_adapter_activates_delegation(pipeline): - persons = _cps_fields_frame() - result = pipeline._build_policyengine_tax_units_via_microunit( - persons, allow_normalized_adapter=True - ) - assert result is not None - tax_units, person_rows, households = result - unit_by_person = dict( - zip(person_rows["person_id"], person_rows["tax_unit_id"], strict=True) - ) - # Couple + child form one unit; single adult separate. 
- assert unit_by_person[1] == unit_by_person[2] == unit_by_person[3] - assert unit_by_person[4] != unit_by_person[1] - family_id = int(unit_by_person[1]) - assert _unit_field(tax_units, family_id, "filing_status") == "JOINT" - - -def _zero_based_cps_fields_frame() -> pd.DataFrame: - """Same couple+child household as ``_cps_fields_frame`` but with - family_relationship in the optimizer's 0-based coding (0=head, 1=spouse, - 2=child) instead of CPS A_FAMREL 1-based (1=ref, 2=spouse, 3=child).""" - return pd.DataFrame( - [ - { - "person_id": 1, - "household_id": 10, - "age": 40, - "income": 50000.0, - "person_number": 1, - "spouse_person_number": 2, - "family_relationship": 0, - }, - { - "person_id": 2, - "household_id": 10, - "age": 38, - "income": 30000.0, - "person_number": 2, - "spouse_person_number": 1, - "family_relationship": 1, - }, - { - "person_id": 3, - "household_id": 10, - "age": 10, - "income": 0.0, - "person_number": 3, - "spouse_person_number": 0, - "family_relationship": 2, - }, - ] - ) - - -def test_high_fidelity_adapter_normalizes_zero_based_family_relationship(pipeline): - # A 0-based family_relationship frame must map to the SAME A_EXPRRP / parent - # pointers as the 1-based CPS A_FAMREL coding. Without per-household scheme - # normalization the child (code 2) is mis-read as a spouse and loses parent - # pointers, so microunit mis-partitions the household. - frame = pipeline._microunit_cps_frame_from_normalized( - _zero_based_cps_fields_frame() - ) - assert frame is not None - by = frame.set_index("person_id") - assert int(by.loc[1, "A_EXPRRP"]) == 1 # head - assert int(by.loc[2, "A_EXPRRP"]) == 3 # spouse, not other-relative - assert int(by.loc[3, "A_EXPRRP"]) == 5 # own child, not spouse - assert int(by.loc[3, "PEPAR1"]) == 1 and int(by.loc[3, "PEPAR2"]) == 2 - - # And the partition matches the 1-based frame: couple + child = one unit. 
- result = pipeline._build_policyengine_tax_units_via_microunit( - _zero_based_cps_fields_frame(), allow_normalized_adapter=True - ) - assert result is not None - _, person_rows, _ = result - unit_by_person = dict( - zip(person_rows["person_id"], person_rows["tax_unit_id"], strict=True) - ) - assert unit_by_person[1] == unit_by_person[2] == unit_by_person[3] diff --git a/tests/pipelines/test_us_spm_preservation.py b/tests/pipelines/test_us_spm_preservation.py deleted file mode 100644 index 002343d4..00000000 --- a/tests/pipelines/test_us_spm_preservation.py +++ /dev/null @@ -1,103 +0,0 @@ -"""SPM-unit-id preservation (#113). - -The authoritative SPM unit ids carried by the source records are eCPS-quality -(~1.04 units/household). Synthesis can leave some records without an id; the old -all-or-nothing logic discarded the whole column on any missing id, collapsing to -one SPM unit per household (~1.00). These tests pin the preserve-present-fill- -missing behavior that keeps the authoritative structure. -""" - -from __future__ import annotations - -import numpy as np -import pandas as pd - -from microplex_us.pipelines.us import USMicroplexPipeline - - -def _pipe() -> USMicroplexPipeline: - return USMicroplexPipeline() - - -def test_preserve_present_keeps_distinct_units(): - # Two SPM units in one household, all ids present -> both preserved. - persons = pd.DataFrame( - {"household_id": [1, 1, 1, 1], "spm_unit_id": [10, 10, 11, 11]} - ) - out = _pipe()._preserve_present_group_ids(persons, "spm_unit_id") - assert out.nunique() == 2 - assert out.iloc[0] == out.iloc[1] - assert out.iloc[2] == out.iloc[3] - assert out.iloc[0] != out.iloc[2] - - -def test_missing_rows_fold_into_present_unit_not_new(): - # A missing id in a household that has present ids must NOT fabricate a unit. 
- persons = pd.DataFrame( - {"household_id": [1, 1, 1], "spm_unit_id": [10.0, 10.0, np.nan]} - ) - out = _pipe()._preserve_present_group_ids(persons, "spm_unit_id") - assert out.nunique() == 1 # folded into the present unit, not split to 2 - - -def test_fully_missing_household_gets_one_fallback_when_others_present(): - persons = pd.DataFrame( - {"household_id": [1, 1, 2, 2], "spm_unit_id": [10.0, 10.0, np.nan, np.nan]} - ) - out = _pipe()._preserve_present_group_ids(persons, "spm_unit_id") - assert out.iloc[0] == out.iloc[1] # hh1 preserved - assert out.iloc[2] == out.iloc[3] # hh2 -> one fallback unit - assert out.iloc[0] != out.iloc[2] - - -def test_all_missing_returns_none(): - persons = pd.DataFrame({"household_id": [1, 1], "spm_unit_id": [np.nan, np.nan]}) - # Entirely empty column -> None so the caller regenerates from scratch. - assert _pipe()._preserve_present_group_ids(persons, "spm_unit_id") is None - - -def test_assign_family_and_spm_preserves_partial_spm(): - # End to end: a partially-missing SPM column keeps the present structure - # rather than collapsing to one unit per household. - persons = pd.DataFrame( - { - "person_id": [1, 2, 3, 4, 5], - "household_id": [1, 1, 1, 2, 2], - "relationship_to_head": [0, 1, 2, 0, 2], - "spm_unit_id": [10.0, 10.0, 11.0, np.nan, np.nan], - } - ) - out = _pipe()._assign_family_and_spm_units(persons) - per_hh = out.groupby("household_id")["spm_unit_id"].nunique() - assert int(per_hh.loc[1]) == 2 # two present units preserved - assert int(per_hh.loc[2]) == 1 # fully-missing household -> one fallback - - -def test_missing_row_folds_without_merging_distinct_present_units(): - # The trickiest folding case: a household with TWO present SPM units AND a - # missing row. The missing row must fold into one existing unit WITHOUT - # merging the two genuinely-distinct present units or fabricating a third. 
- persons = pd.DataFrame( - { - "household_id": [1, 1, 1, 1, 1], - "spm_unit_id": [10.0, 10.0, 11.0, 11.0, np.nan], - } - ) - out = _pipe()._preserve_present_group_ids(persons, "spm_unit_id") - assert out.nunique() == 2 # the two present units stay distinct - assert out.iloc[0] == out.iloc[1] # unit 10 - assert out.iloc[2] == out.iloc[3] # unit 11 - assert out.iloc[0] != out.iloc[2] - assert out.iloc[4] == out.iloc[0] # missing row folded into the first unit - - -def test_preserve_present_aligns_under_non_default_index(): - # A non-default / shuffled index must not misalign the missing-row assignment. - persons = pd.DataFrame( - {"household_id": [1, 1, 2, 2], "spm_unit_id": [10.0, np.nan, np.nan, 20.0]}, - index=[100, 5, 42, 7], - ) - out = _pipe()._preserve_present_group_ids(persons, "spm_unit_id") - assert out.loc[100] == out.loc[5] # hh1: present 10 + missing fold together - assert out.loc[42] == out.loc[7] # hh2: missing + present 20 fold together - assert out.loc[100] != out.loc[42] # distinct households stay distinct diff --git a/tests/pipelines/test_version_benchmark.py b/tests/pipelines/test_version_benchmark.py deleted file mode 100644 index 16d9a6f0..00000000 --- a/tests/pipelines/test_version_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Tests for the canonical US version-bump benchmark CLI.""" - -from __future__ import annotations - -from pathlib import Path -from types import SimpleNamespace - -from microplex_us.pipelines.version_benchmark import ( - _resolve_site_snapshot_path, - main, -) - - -def test_resolve_site_snapshot_path_defaults_to_artifacts_root(tmp_path) -> None: - output_root = tmp_path / "artifacts" / "live_run" - resolved = _resolve_site_snapshot_path( - output_root=output_root, - site_snapshot_path=None, - ) - assert resolved == (tmp_path / "artifacts" / "site_snapshot_us.json").resolve() - - -def test_main_writes_default_site_snapshot(monkeypatch, tmp_path) -> None: - recorded_snapshot_paths: list[Path] = [] - recorded_build_kwargs: 
dict[str, object] = {} - - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.CPSASECParquetSourceProvider", - lambda data_dir: ("cps", data_dir), - ) - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.build_and_save_versioned_us_microplex_from_source_providers", - lambda **_kwargs: ( - recorded_build_kwargs.update(_kwargs) - or SimpleNamespace( - artifact_paths=SimpleNamespace( - output_dir=tmp_path / "artifacts" / "live_run" / "run-1", - run_registry=tmp_path / "artifacts" / "live_run" / "run_registry.jsonl", - ), - current_entry=SimpleNamespace( - candidate_enhanced_cps_native_loss=0.2, - baseline_enhanced_cps_native_loss=0.3, - enhanced_cps_native_loss_delta=-0.1, - ), - ) - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.write_us_microplex_site_snapshot", - lambda artifact_dir, output_path: recorded_snapshot_paths.append(Path(output_path)), - ) - - main( - [ - "--output-root", - str(tmp_path / "artifacts" / "live_run"), - "--cps-parquet-dir", - str(tmp_path / "cps"), - "--baseline-dataset", - str(tmp_path / "baseline.h5"), - "--targets-db", - str(tmp_path / "targets.duckdb"), - ] - ) - - assert recorded_snapshot_paths == [ - (tmp_path / "artifacts" / "site_snapshot_us.json").resolve() - ] - assert recorded_build_kwargs["frontier_metric"] == "enhanced_cps_native_loss_delta" - assert recorded_build_kwargs["require_policyengine_native_score"] is True - config = recorded_build_kwargs["config"] - assert config.policyengine_target_profile == "pe_native_broad" - assert config.policyengine_calibration_target_profile == "pe_native_broad" - - -def test_main_can_require_beating_pe_native_loss(monkeypatch, tmp_path) -> None: - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.CPSASECParquetSourceProvider", - lambda data_dir: ("cps", data_dir), - ) - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.build_and_save_versioned_us_microplex_from_source_providers", - lambda **_kwargs: 
SimpleNamespace( - artifact_paths=SimpleNamespace( - output_dir=tmp_path / "artifacts" / "live_run" / "run-1", - run_registry=tmp_path / "artifacts" / "live_run" / "run_registry.jsonl", - ), - current_entry=SimpleNamespace( - candidate_enhanced_cps_native_loss=0.2, - baseline_enhanced_cps_native_loss=0.1, - enhanced_cps_native_loss_delta=0.1, - ), - ), - ) - monkeypatch.setattr( - "microplex_us.pipelines.version_benchmark.write_us_microplex_site_snapshot", - lambda *_args, **_kwargs: None, - ) - - try: - main( - [ - "--output-root", - str(tmp_path / "artifacts" / "live_run"), - "--cps-parquet-dir", - str(tmp_path / "cps"), - "--baseline-dataset", - str(tmp_path / "baseline.h5"), - "--targets-db", - str(tmp_path / "targets.duckdb"), - "--require-beat-pe-native-loss", - ] - ) - except SystemExit as exc: - assert str(exc) == ( - "US version-bump benchmark did not beat PE on PE-native enhanced-CPS " - "loss: candidate=0.200000, baseline=0.100000, delta=0.100000" - ) - else: - raise AssertionError("expected SystemExit when native loss does not beat PE") diff --git a/tests/pipelines/test_versioned_artifacts.py b/tests/pipelines/test_versioned_artifacts.py deleted file mode 100644 index e281888c..00000000 --- a/tests/pipelines/test_versioned_artifacts.py +++ /dev/null @@ -1,996 +0,0 @@ -"""Tests for versioned US microplex artifact saving and frontier lookup.""" - -import json -import sqlite3 -from pathlib import Path - -import duckdb -import pandas as pd -from microplex.core import ( - EntityObservation, - EntityRelationship, - EntityType, - ObservationFrame, - RelationshipCardinality, - Shareability, - SourceDescriptor, - StaticSourceProvider, - TimeStructure, -) - -from microplex_us.pipelines import ( - build_and_save_versioned_us_microplex, - build_and_save_versioned_us_microplex_from_data_dir, - build_and_save_versioned_us_microplex_from_source_provider, - build_and_save_versioned_us_microplex_from_source_providers, - compare_us_microplex_target_delta_rows, - 
list_us_microplex_target_delta_rows, - rebuild_us_microplex_run_index, - resolve_us_microplex_frontier_artifact_dir, - resolve_us_microplex_run_index_path, - save_versioned_us_microplex_artifacts, - select_us_microplex_frontier_entry, - select_us_microplex_frontier_index_row, -) -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexTargets, -) -from microplex_us.policyengine import ( - PolicyEngineUSEntityTableBundle, - build_policyengine_us_time_period_arrays, - compute_policyengine_us_definition_hash, - write_policyengine_us_time_period_dataset, -) - - -def _write_baseline_dataset( - path: Path, - tables: PolicyEngineUSEntityTableBundle, -) -> Path: - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_fips": "state_fips", "snap": "snap"}, - person_variable_map={"age": "age", "income": "employment_income"}, - tax_unit_variable_map={"filing_status": "filing_status"}, - ) - write_policyengine_us_time_period_dataset(arrays, path) - return path - - -def _create_policyengine_targets_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - CASE - WHEN t.variable = 'snap' THEN 'state' - ELSE 'district' - END AS geo_level, - CASE - WHEN t.variable = 'snap' THEN 
'06' - ELSE '0601' - END AS geographic_id, - CASE - WHEN t.variable = 'snap' THEN 'snap' - WHEN t.variable = 'household_count' THEN 'snap' - ELSE NULL - END AS domain_variable - FROM targets AS t; - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, NULL) - """, - (1, compute_policyengine_us_definition_hash(())), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - (1, "household_count", 2024, 1, 0, 3.0, 1, None, "test", "count"), - (2, "snap", 2024, 1, 0, 250.0, 1, None, "test", "snap"), - ], - ) - conn.commit() - conn.close() - - -def _make_result( - *, - targets_db: Path, - baseline_dataset: Path, - snap_values: tuple[float, float], -) -> USMicroplexBuildResult: - return USMicroplexBuildResult( - config=USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("snap", "household_count"), - ), - seed_data=pd.DataFrame({"income": [10.0], "hh_weight": [1.0]}), - synthetic_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [1.0, 1.0]}), - calibrated_data=pd.DataFrame({"income": [10.0, 20.0], "weight": [0.5, 1.5]}), - targets=USMicroplexTargets( - marginal={"state": {"CA": 2.0}}, - continuous={"income": 30.0}, - ), - calibration_summary={"max_error": 0.01, "mean_error": 0.005}, - synthesis_metadata={"backend": "bootstrap", "source_names": ["cps", "puf"]}, - synthesizer=None, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": list(snap_values), - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 
11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - -def test_save_versioned_us_microplex_artifacts_accepts_path_config_values(tmp_path): - targets_db = tmp_path / "policy_data.db" - baseline_dataset = tmp_path / "baseline.h5" - _create_policyengine_targets_db(targets_db) - _write_baseline_dataset( - baseline_dataset, - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ).policyengine_tables, - ) - - result = _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ) - result.config = USMicroplexBuildConfig( - n_synthetic=2, - synthesis_backend="bootstrap", - calibration_backend="entropy", - policyengine_dataset_year=2024, - policyengine_targets_db=targets_db, - policyengine_baseline_dataset=baseline_dataset, - policyengine_target_variables=("snap", "household_count"), - ) - - artifact_paths = save_versioned_us_microplex_artifacts( - result, tmp_path / "artifacts" - ) - manifest = json.loads(artifact_paths.manifest.read_text()) - - assert manifest["config"]["policyengine_targets_db"] == str(targets_db) - assert manifest["config"]["policyengine_baseline_dataset"] == str(baseline_dataset) - - -def test_save_versioned_us_microplex_artifacts_accepts_stage_runtime_writer_keyword( - tmp_path, -): - targets_db = tmp_path / "policy_data.db" - baseline_dataset = tmp_path / 
"baseline.h5" - _create_policyengine_targets_db(targets_db) - _write_baseline_dataset( - baseline_dataset, - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ).policyengine_tables, - ) - - artifact_paths = save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ), - tmp_path / "artifacts", - stage_runtime_writer=None, - ) - - assert artifact_paths.policyengine_dataset.exists() - assert artifact_paths.manifest.exists() - - -def _make_source_provider( - *, - name: str, - households: pd.DataFrame, - persons: pd.DataFrame, -) -> StaticSourceProvider: - provider_households = households.rename( - columns={ - "household_id": "hh_id", - "hh_weight": "household_weight", - } - ) - provider_persons = persons.rename( - columns={ - "person_id": "person_key", - "household_id": "hh_id", - } - ) - frame = ObservationFrame( - source=SourceDescriptor( - name=name, - shareability=Shareability.PUBLIC, - time_structure=TimeStructure.REPEATED_CROSS_SECTION, - observations=( - EntityObservation( - entity=EntityType.HOUSEHOLD, - key_column="hh_id", - weight_column="household_weight", - variable_names=tuple( - column - for column in provider_households.columns - if column != "hh_id" - ), - ), - EntityObservation( - entity=EntityType.PERSON, - key_column="person_key", - variable_names=tuple( - column - for column in provider_persons.columns - if column not in {"person_key", "hh_id"} - ), - ), - ), - ), - tables={ - EntityType.HOUSEHOLD: provider_households, - EntityType.PERSON: provider_persons, - }, - relationships=( - EntityRelationship( - parent_entity=EntityType.HOUSEHOLD, - child_entity=EntityType.PERSON, - parent_key="hh_id", - child_key="hh_id", - cardinality=RelationshipCardinality.ONE_TO_MANY, - ), - ), - ) - return StaticSourceProvider(frame) - - -def test_save_versioned_us_microplex_artifacts_uses_explicit_version(tmp_path): - 
targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - result = _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ) - - paths = save_versioned_us_microplex_artifacts( - result, - tmp_path / "builds", - version_id="run-1", - ) - - assert paths.version_id == "run-1" - assert paths.output_dir == tmp_path / "builds" / "run-1" - assert paths.run_registry == tmp_path / "builds" / "run_registry.jsonl" - assert paths.run_index_db == tmp_path / "builds" / "run_index.duckdb" - assert paths.stage_manifest == paths.output_dir / "stage_manifest.json" - assert paths.artifact_inventory == ( - paths.output_dir / "stage_artifacts" / "artifact_inventory.json" - ) - assert paths.conditional_readiness == ( - paths.output_dir / "stage_artifacts" / "conditional_readiness.json" - ) - assert paths.source_plan == ( - paths.output_dir / "stage_artifacts" / "03_source_planning" / "source_plan.json" - ) - assert paths.policyengine_entity_tables == ( - 
paths.output_dir - / "stage_artifacts" - / "07_calibration" - / "policyengine_entity_tables" - / "metadata.json" - ) - assert paths.calibration_summary == ( - paths.output_dir - / "stage_artifacts" - / "07_calibration" - / "calibration_summary.json" - ) - assert paths.validation_evidence == ( - paths.output_dir - / "stage_artifacts" - / "09_validation_benchmarking" - / "evidence_manifest.json" - ) - assert paths.source_weight_diagnostics == ( - paths.output_dir / "source_weight_diagnostics.json" - ) - manifest = json.loads(paths.manifest.read_text()) - assert manifest["run_registry"]["artifact_id"] == "run-1" - assert manifest["run_index"]["artifact_id"] == "run-1" - - with duckdb.connect(str(paths.run_index_db), read_only=True) as conn: - assert conn.execute("SELECT COUNT(*) FROM runs").fetchone()[0] == 1 - assert conn.execute("SELECT COUNT(*) FROM slice_metrics").fetchone()[0] >= 1 - assert conn.execute("SELECT COUNT(*) FROM target_metrics").fetchone()[0] == 2 - - -def test_frontier_helpers_select_best_versioned_run(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - 
marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - - root = tmp_path / "builds" - better_result = _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ) - worse_result = _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(60.0, 50.0), - ) - save_versioned_us_microplex_artifacts(better_result, root, version_id="run-1") - save_versioned_us_microplex_artifacts(worse_result, root, version_id="run-2") - - frontier_entry = select_us_microplex_frontier_entry(root) - frontier_dir = resolve_us_microplex_frontier_artifact_dir(root) - - assert frontier_entry is not None - assert frontier_entry.artifact_id == "run-1" - assert frontier_dir == root / "run-1" - assert frontier_entry.candidate_composite_parity_loss is not None - - indexed_frontier = select_us_microplex_frontier_index_row(root) - - assert indexed_frontier is not None - assert indexed_frontier["artifact_id"] == "run-1" - - -def test_build_and_save_versioned_us_microplex_returns_frontier_gap(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - 
families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "hh_weight": [100.0, 150.0, 200.0], - "tenure": [1, 2, 1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12, 13, 14, 15], - "household_id": [1, 1, 2, 2, 3, 3], - "age": [34, 12, 47, 43, 68, 30], - "sex": [1, 2, 2, 1, 1, 2], - "education": [3, 1, 4, 4, 2, 4], - "employment_status": [1, 0, 1, 1, 2, 1], - "income": [55_000.0, 0.0, 72_000.0, 40_000.0, 18_000.0, 65_000.0], - } - ) - - saved = build_and_save_versioned_us_microplex( - persons, - households, - tmp_path / "builds", - version_id="run-build", - config=USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - random_seed=7, - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("snap", "household_count"), - ), - ) - - assert saved.artifact_paths.version_id == "run-build" - assert saved.build_result.policyengine_tables is not None - assert saved.current_entry is not None - assert saved.current_entry.artifact_id == "run-build" - assert saved.frontier_entry is not None - assert saved.frontier_entry.artifact_id == "run-build" - assert saved.frontier_delta == 0.0 - assert saved.frontier_entry.candidate_composite_parity_loss is not None - - -def test_build_and_save_versioned_us_microplex_from_source_provider(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - 
persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "hh_weight": [100.0, 150.0, 200.0], - "tenure": [1, 2, 1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12, 13, 14, 15], - "household_id": [1, 1, 2, 2, 3, 3], - "age": [34, 12, 47, 43, 68, 30], - "sex": [1, 2, 2, 1, 1, 2], - "education": [3, 1, 4, 4, 2, 4], - "employment_status": [1, 0, 1, 1, 2, 1], - "income": [55_000.0, 0.0, 72_000.0, 40_000.0, 18_000.0, 65_000.0], - } - ) - provider = _make_source_provider( - name="test_cps", households=households, persons=persons - ) - - saved = build_and_save_versioned_us_microplex_from_source_provider( - provider, - tmp_path / "provider-builds", - version_id="provider-run", - config=USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - random_seed=7, - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("snap", "household_count"), - ), - ) - - assert saved.artifact_paths.version_id == "provider-run" - assert saved.build_result.source_frame is not None - assert saved.build_result.source_frame.source.name == "test_cps" - assert saved.current_entry is not None - assert 
saved.frontier_delta == 0.0 - - -def test_build_and_save_versioned_us_microplex_from_source_providers(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "hh_weight": [100.0, 150.0, 200.0], - "tenure": [1, 2, 1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12, 13, 14, 15], - "household_id": [1, 1, 2, 2, 3, 3], - "age": [34, 12, 47, 43, 68, 30], - "sex": [1, 2, 2, 1, 1, 2], - "education": [3, 1, 4, 4, 2, 4], - "employment_status": [1, 0, 1, 1, 2, 1], - "income": [55_000.0, 0.0, 72_000.0, 40_000.0, 18_000.0, 65_000.0], - } - ) - providers = [ - _make_source_provider(name="test_cps", households=households, persons=persons), - _make_source_provider(name="test_puf", households=households, persons=persons), - ] - - saved = build_and_save_versioned_us_microplex_from_source_providers( - providers, - tmp_path / "multisource-builds", - version_id="multisource-run", - 
config=USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - random_seed=7, - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("snap", "household_count"), - ), - ) - - assert saved.artifact_paths.version_id == "multisource-run" - assert saved.build_result.fusion_plan is not None - assert saved.build_result.fusion_plan.source_names == ("test_cps", "test_puf") - assert saved.current_entry is not None - assert saved.frontier_delta == 0.0 - manifest = json.loads(saved.artifact_paths.manifest.read_text()) - stage_output_manifests = manifest["stage_output_manifests"] - assert tuple(stage_output_manifests) == ( - "01_run_profile", - "02_source_loading", - "03_source_planning", - "04_seed_scaffold", - "05_donor_integration_synthesis", - "06_policyengine_entities", - "07_calibration", - "08_dataset_assembly", - "09_validation_benchmarking", - ) - for stage_id, manifest_path in stage_output_manifests.items(): - stage_manifest = json.loads( - (saved.artifact_paths.output_dir / manifest_path).read_text() - ) - assert stage_manifest["lifecycleStatus"] in {"complete", "deferred"} - assert stage_manifest["events"] - - -def test_build_and_save_versioned_us_microplex_from_data_dir(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [101, 101, 102], - "spm_unit_id": [201, 201, 202], - "family_id": [301, 301, 302], - "marital_unit_id": [401, 401, 402], - "age": [40.0, 10.0, 30.0], - "income": [30_000.0, 0.0, 
20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [101, 102], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame({"spm_unit_id": [201, 202], "household_id": [1, 2]}), - families=pd.DataFrame({"family_id": [301, 302], "household_id": [1, 2]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [401, 402], "household_id": [1, 2]} - ), - ), - ) - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "household_weight": [100.0, 150.0, 200.0], - "tenure": [1, 2, 1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12, 13, 14, 15], - "household_id": [1, 1, 2, 2, 3, 3], - "age": [34, 12, 47, 43, 68, 30], - "sex": [1, 2, 2, 1, 1, 2], - "education": [3, 1, 4, 4, 2, 4], - "employment_status": [1, 0, 1, 1, 2, 1], - "income": [55_000.0, 0.0, 72_000.0, 40_000.0, 18_000.0, 65_000.0], - } - ) - data_dir = tmp_path / "cps_parquet" - data_dir.mkdir() - households.to_parquet(data_dir / "cps_asec_households.parquet", index=False) - persons.to_parquet(data_dir / "cps_asec_persons.parquet", index=False) - - saved = build_and_save_versioned_us_microplex_from_data_dir( - data_dir, - tmp_path / "data-dir-builds", - version_id="data-dir-run", - config=USMicroplexBuildConfig( - n_synthetic=6, - synthesis_backend="bootstrap", - calibration_backend="entropy", - random_seed=7, - policyengine_dataset_year=2024, - policyengine_targets_db=str(targets_db), - policyengine_baseline_dataset=str(baseline_dataset), - policyengine_target_variables=("snap", "household_count"), - ), - ) - - assert saved.artifact_paths.version_id == "data-dir-run" - assert saved.build_result.source_frame is not None - assert saved.build_result.source_frame.source.name == "cps_asec_parquet" - assert saved.current_entry is not None - assert saved.frontier_delta == 0.0 - - -def test_run_index_target_deltas_are_queryable(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - 
_create_policyengine_targets_db(targets_db) - baseline_seed_result = _make_result( - targets_db=targets_db, - baseline_dataset=tmp_path / "baseline.h5", - snap_values=(75.0, 50.0), - ) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - baseline_seed_result.policyengine_tables, - ) - root = tmp_path / "builds" - save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ), - root, - version_id="run-1", - ) - - rows = list_us_microplex_target_delta_rows( - root, - artifact_id="run-1", - ) - - assert len(rows) == 2 - assert {row["target_value"] for row in rows} == {3.0, 250.0} - assert {row["domain_variable"] for row in rows} == {"snap"} - assert all(row["artifact_id"] == "run-1" for row in rows) - - -def test_run_index_target_deltas_are_comparable_across_runs(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_seed_result = _make_result( - targets_db=targets_db, - baseline_dataset=tmp_path / "baseline.h5", - snap_values=(75.0, 50.0), - ) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - baseline_seed_result.policyengine_tables, - ) - root = tmp_path / "builds" - save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ), - root, - version_id="run-1", - ) - save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(140.0, 50.0), - ), - root, - version_id="run-2", - ) - - rows = compare_us_microplex_target_delta_rows( - root, - artifact_id="run-2", - baseline_artifact_id="run-1", - ) - - assert len(rows) == 2 - assert {row["artifact_id"] for row in rows} == {"run-2"} - assert {row["baseline_artifact_id"] for row in rows} == {"run-1"} - assert {row["domain_variable"] for row in rows} == {"snap"} - - -def 
test_run_index_can_be_rebuilt_from_registry(tmp_path): - targets_db = tmp_path / "policyengine_targets.db" - _create_policyengine_targets_db(targets_db) - baseline_seed_result = _make_result( - targets_db=targets_db, - baseline_dataset=tmp_path / "baseline.h5", - snap_values=(75.0, 50.0), - ) - baseline_dataset = _write_baseline_dataset( - tmp_path / "baseline.h5", - baseline_seed_result.policyengine_tables, - ) - root = tmp_path / "builds" - save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(100.0, 50.0), - ), - root, - version_id="run-1", - ) - save_versioned_us_microplex_artifacts( - _make_result( - targets_db=targets_db, - baseline_dataset=baseline_dataset, - snap_values=(60.0, 50.0), - ), - root, - version_id="run-2", - ) - - index_path = resolve_us_microplex_run_index_path(root) - index_path.unlink() - - rebuilt_path = rebuild_us_microplex_run_index( - root, - registry_path=root / "run_registry.jsonl", - ) - frontier = select_us_microplex_frontier_index_row(rebuilt_path) - - assert rebuilt_path == index_path - assert frontier is not None - assert frontier["artifact_id"] == "run-1" - with duckdb.connect(str(rebuilt_path), read_only=True) as conn: - assert conn.execute("SELECT COUNT(*) FROM runs").fetchone()[0] == 2 - assert conn.execute("SELECT COUNT(*) FROM target_metrics").fetchone()[0] == 4 diff --git a/tests/pipelines/test_zi_qrf_backend.py b/tests/pipelines/test_zi_qrf_backend.py deleted file mode 100644 index f7170212..00000000 --- a/tests/pipelines/test_zi_qrf_backend.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Pin the zi_qrf donor-imputer backend behavior before v8 relies on it. - -v7 (2026-04-18) used `donor_imputer_backend="qrf"` which bypasses the -zero-classifier gate (see `USMicroplexPipeline._build_donor_imputer`: -`zero_inflated_vars` is populated only when `backend == "zi_qrf"`). 
With -an empty whitelist, every QRF predict runs over all 3.37M rows even on -columns that are 99%+ zero, which is the main reason donor integration -took hours per source on v7. - -v8 flips `--donor-imputer-backend zi_qrf`. These tests pin the three -guarantees v8 relies on: - -1. The factory (`_build_donor_imputer`) populates `zero_inflated_vars` - from the `VariableSupportFamily.SUPPORT_SENSITIVE` variables - when `backend == "zi_qrf"`, and leaves it empty otherwise. -2. `ColumnwiseQRFDonorImputer.fit` trains a `RandomForestClassifier` - zero-gate on each whitelisted column whose observed zero fraction - crosses the threshold, and does not train one on dense columns. -3. `ColumnwiseQRFDonorImputer.generate` skips QRF `.predict` on rows - the zero-gate sent to zero — i.e. the QRF is invoked on a strict - subset, which is the wall-clock win. -""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest - -pytest.importorskip("quantile_forest") - -from microplex_us.pipelines.us import ( - ColumnwiseQRFDonorImputer, - USMicroplexBuildConfig, - USMicroplexPipeline, -) - - -def _tiny_problem(n: int = 500, seed: int = 0) -> pd.DataFrame: - """Two-column donor frame: one heavy-zero target, one dense target.""" - rng = np.random.default_rng(seed) - age = rng.integers(18, 80, size=n).astype(float) - is_female = rng.integers(0, 2, size=n).astype(float) - # 97 % zero — only a handful of positive values, like SSI or TANF. - heavy_zero = np.where(rng.random(n) > 0.97, rng.exponential(500, n), 0.0) - # Dense — every row has a positive draw, like age or weight. 
- dense = rng.normal(40_000, 10_000, size=n).clip(0, None) - return pd.DataFrame( - { - "age": age, - "is_female": is_female, - "tanf_reported": heavy_zero, - "employment_income": dense, - } - ) - - -class TestImputerFit: - """Whitelisted + heavy-zero → RF classifier gate; otherwise no gate.""" - - def test_zi_whitelist_produces_zero_classifier(self) -> None: - data = _tiny_problem() - imputer = ColumnwiseQRFDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["tanf_reported", "employment_income"], - n_estimators=25, - zero_inflated_vars={"tanf_reported"}, - zero_threshold=0.05, - ) - imputer.fit(data) - assert "tanf_reported" in imputer._zero_models, ( - "Heavy-zero column in whitelist must get a zero-gate classifier; " - "this is the optimization v8 depends on." - ) - assert "employment_income" not in imputer._zero_models, ( - "Dense column must not get a zero-gate classifier." - ) - - def test_empty_whitelist_means_no_gates(self) -> None: - """v7 configuration: backend='qrf' → no gates ever fitted.""" - data = _tiny_problem() - imputer = ColumnwiseQRFDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["tanf_reported", "employment_income"], - n_estimators=25, - zero_inflated_vars=set(), - zero_threshold=0.05, - ) - imputer.fit(data) - assert imputer._zero_models == {} - - -class TestImputerGenerateSkipsPredict: - """With a zero-gate, the QRF's .predict runs on a strict subset.""" - - def test_generate_calls_qrf_only_on_predicted_positive_rows( - self, monkeypatch: pytest.MonkeyPatch - ) -> None: - data = _tiny_problem(n=800, seed=1) - imputer = ColumnwiseQRFDonorImputer( - condition_vars=["age", "is_female"], - target_vars=["tanf_reported"], - n_estimators=25, - zero_inflated_vars={"tanf_reported"}, - zero_threshold=0.05, - ) - imputer.fit(data) - - qrf_model = imputer._models["tanf_reported"] - call_input_sizes: list[int] = [] - original_predict = qrf_model.predict - - def spy_predict(x_values: np.ndarray, **kwargs): - 
call_input_sizes.append(len(x_values)) - return original_predict(x_values, **kwargs) - - monkeypatch.setattr(qrf_model, "predict", spy_predict) - - # Generate on 10k conditioning rows (much larger than training). - rng = np.random.default_rng(42) - n_generate = 10_000 - conditions = pd.DataFrame( - { - "age": rng.integers(18, 80, size=n_generate).astype(float), - "is_female": rng.integers(0, 2, size=n_generate).astype(float), - } - ) - synthetic = imputer.generate(conditions, seed=42) - - assert len(call_input_sizes) == 1, call_input_sizes - predict_rows = call_input_sizes[0] - # Heavy-zero base rate is ~3 %; ZI-predicted-positive fraction - # should be well below 50 % on unseen data, and definitely - # below n_generate. - assert predict_rows < n_generate, ( - f"QRF predict was called on all {n_generate} rows — the " - f"zero-gate isn't skipping any. call_input_sizes={call_input_sizes}" - ) - assert predict_rows < n_generate * 0.5, ( - f"QRF predict got {predict_rows}/{n_generate} rows; the gate " - "is barely cutting the wall, not matching the 3 % training base rate." - ) - # Non-predicted rows must be exactly zero (not NaN, not drawn). - zero_mass = float((synthetic["tanf_reported"] == 0).mean()) - assert zero_mass > 0.5, ( - f"Synthetic zero mass = {zero_mass:.3f}; gate should leave " - "more than half of rows at exactly 0." - ) - - -class TestBuildDonorImputerFactory: - """The pipeline factory wires zero_inflated_vars only when backend='zi_qrf'.""" - - def _factory(self, backend: str) -> ColumnwiseQRFDonorImputer: - config = USMicroplexBuildConfig( - donor_imputer_backend=backend, - donor_imputer_qrf_n_estimators=25, - ) - pipeline = USMicroplexPipeline(config=config) - # Variables chosen to span support families: - # qualified_dividend_income, taxable_interest_income → SUPPORT_SENSITIVE - # age → BOUNDED_INTEGER - # These are all real PolicyEngine-US variable names with explicit - # semantic specs in microplex_us.variables. 
- target_vars = ( - "qualified_dividend_income", - "taxable_interest_income", - "age", - ) - return pipeline._build_donor_imputer( - condition_vars=["is_female", "cps_race"], - target_vars=target_vars, - ) - - def test_zi_qrf_backend_populates_whitelist(self) -> None: - imputer = self._factory("zi_qrf") - assert isinstance(imputer, ColumnwiseQRFDonorImputer) - assert "qualified_dividend_income" in imputer.zero_inflated_vars - assert "taxable_interest_income" in imputer.zero_inflated_vars - assert "age" not in imputer.zero_inflated_vars - - def test_qrf_backend_leaves_whitelist_empty(self) -> None: - """v7 semantics: pre-v8 default leaves optimization inactive.""" - imputer = self._factory("qrf") - assert isinstance(imputer, ColumnwiseQRFDonorImputer) - assert imputer.zero_inflated_vars == set() diff --git a/tests/policyengine/__init__.py b/tests/policyengine/__init__.py deleted file mode 100644 index 1700cce0..00000000 --- a/tests/policyengine/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""PolicyEngine tests for microplex-us.""" diff --git a/tests/policyengine/test_comparison.py b/tests/policyengine/test_comparison.py deleted file mode 100644 index 86f51133..00000000 --- a/tests/policyengine/test_comparison.py +++ /dev/null @@ -1,1132 +0,0 @@ -"""Tests for PolicyEngine US target comparison helpers.""" - -from __future__ import annotations - -import sqlite3 -from pathlib import Path - -import h5py -import numpy as np -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import ( - FilterOperator, - TargetFilter, - TargetQuery, - TargetSpec, - normalize_metric_payload, -) - -import microplex_us.policyengine.comparison as comparison_module -from microplex_us.policyengine import ( - PolicyEngineUSComparisonCache, - PolicyEngineUSConstraint, - PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSMaterializationError, - PolicyEngineUSTargetComparisonReport, - build_policyengine_us_time_period_arrays, 
- compare_policyengine_us_target_query_to_baseline, - compute_policyengine_us_definition_hash, - evaluate_policyengine_us_target_set, - load_policyengine_us_entity_tables, - write_policyengine_us_time_period_dataset, -) -from microplex_us.policyengine.comparison import ( - PolicyEngineUSTargetEvaluation, - PolicyEngineUSTargetEvaluationReport, -) - - -def _create_snap_targets_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - [ - (1, compute_policyengine_us_definition_hash(()), None), - ( - 2, - compute_policyengine_us_definition_hash( - (PolicyEngineUSConstraint("state_fips", "==", "06"),), - parent_stratum_id=1, - ), - 1, - ), - ], - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - (2, "state_fips", "==", "06"), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - (10, "snap", 2024, 1, 0, 250.0, 1, 0.0, "test", "National SNAP"), - (11, "snap", 2024, 2, 0, 200.0, 1, 0.0, "test", "California SNAP"), - ], - ) - conn.commit() - conn.close() - - -def _sample_tables() -> PolicyEngineUSEntityTableBundle: - return PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": [100.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 100, 200], - "spm_unit_id": [1000, 1000, 2000], - "family_id": [5000, 5000, 6000], - "marital_unit_id": [7000, 7000, 8000], - "age": [40.0, 10.0, 30.0], - "employment_income": [30_000.0, 0.0, 20_000.0], - "employment_income_before_lsr": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000, 2000], - "household_id": [1, 2], - } - ), - families=pd.DataFrame( - { - "family_id": [5000, 6000], - "household_id": [1, 2], - } - ), - marital_units=pd.DataFrame( - { - "marital_unit_id": [7000, 8000], - "household_id": [1, 2], - } - ), - ) - - -def test_load_policyengine_us_entity_tables_round_trips_written_dataset(tmp_path): - tables = _sample_tables() - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_fips": "state_fips", "snap": "snap"}, - person_variable_map={ - "age": "age", - "employment_income": "employment_income", - }, - tax_unit_variable_map={"filing_status": "filing_status"}, - ) - dataset_path = tmp_path / "baseline.h5" - write_policyengine_us_time_period_dataset(arrays, dataset_path) - - loaded = load_policyengine_us_entity_tables( - dataset_path, - period=2024, - variables=("state_fips", "snap", "age", "employment_income", "filing_status"), - ) - - np.testing.assert_array_equal( - 
loaded.households["household_id"].to_numpy(), - np.array([1, 2]), - ) - np.testing.assert_allclose( - loaded.households["household_weight"].to_numpy(dtype=float), - np.array([2.0, 1.0]), - ) - np.testing.assert_array_equal( - loaded.persons["tax_unit_id"].to_numpy(), - np.array([100, 100, 200]), - ) - assert loaded.tax_units is not None - np.testing.assert_array_equal( - loaded.tax_units["household_id"].to_numpy(), - np.array([1, 2]), - ) - assert loaded.tax_units["filing_status"].tolist() == ["JOINT", "SINGLE"] - - -def test_load_policyengine_us_entity_tables_uses_legacy_contract_entities(tmp_path): - tables = _sample_tables() - assert len(tables.households) == len(tables.spm_units) - tables.spm_units["free_school_meals_reported"] = [1.0, 0.0] - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - spm_unit_variable_map={ - "free_school_meals_reported": "free_school_meals_reported" - }, - ) - dataset_path = tmp_path / "baseline_with_equal_household_spm_counts.h5" - write_policyengine_us_time_period_dataset(arrays, dataset_path) - - loaded = load_policyengine_us_entity_tables( - dataset_path, - period=2024, - variables=("free_school_meals_reported",), - ) - - assert "free_school_meals_reported" not in loaded.households.columns - assert loaded.spm_units is not None - assert loaded.spm_units["free_school_meals_reported"].tolist() == [1.0, 0.0] - - -def test_load_policyengine_us_entity_tables_skips_unsupported_arrays_when_loading_all( - tmp_path, -): - tables = _sample_tables() - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_fips": "state_fips"}, - ) - dataset_path = tmp_path / "baseline_with_records.h5" - write_policyengine_us_time_period_dataset(arrays, dataset_path) - with h5py.File(dataset_path, "a") as handle: - group = handle.create_group("record_amount") - group.create_dataset("2024", data=np.array([1.0, 2.0, 3.0, 4.0])) - - loaded = 
load_policyengine_us_entity_tables(dataset_path, period=2024) - - assert "record_amount" not in loaded.households.columns - assert loaded.persons is not None - assert "record_amount" not in loaded.persons.columns - - -def test_evaluate_policyengine_us_target_set_scores_count_sum_and_mean(): - report = evaluate_policyengine_us_target_set( - _sample_tables(), - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="employment_income_before_lsr_total", - entity=EntityType.PERSON, - value=80_000.0, - period=2024, - measure="employment_income_before_lsr", - aggregation="sum", - ), - TargetSpec( - name="ca_mean_age", - entity=EntityType.PERSON, - value=25.0, - period=2024, - measure="age", - aggregation="mean", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - ], - period=2024, - ) - - assert report.supported_target_count == 3 - assert not report.unsupported_targets - assert report.materialized_variables == () - assert report.mean_abs_relative_error == 0.0 - actuals = {evaluation.target.name: evaluation.actual_value for evaluation in report.evaluations} - assert actuals == { - "ca_households": 2.0, - "employment_income_before_lsr_total": 80_000.0, - "ca_mean_age": 25.0, - } - - -def test_policyengine_us_benchmark_metrics_delegate_to_shared_normalization(): - target = TargetSpec( - name="zero_target_snap", - entity=EntityType.HOUSEHOLD, - value=0.0, - period=2024, - measure="snap", - aggregation="sum", - source="test", - metadata={"geographic_level": "national"}, - ) - report = PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=2.5, - ) - ], - ) - - assert report.benchmark_metrics == [ - normalize_metric_payload( - { - "name": "zero_target_snap", - "estimate": 2.5, - "target": 0.0, - "metadata": 
{ - "source": "test", - "entity": EntityType.HOUSEHOLD.value, - "measure": "snap", - "aggregation": "sum", - "geographic_level": "national", - }, - } - ) - ] - - -def test_evaluate_policyengine_us_target_set_materializes_missing_variables(tmp_path): - base_tables = _sample_tables() - tables = PolicyEngineUSEntityTableBundle( - households=base_tables.households.drop(columns=["snap"]), - persons=base_tables.persons, - tax_units=base_tables.tax_units, - spm_units=base_tables.spm_units, - families=base_tables.families, - marital_units=base_tables.marital_units, - ) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable(FakeEntity("household"), formulas={"2024": object()}), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert map_to is None - if variable == "snap": - return np.array([120.0, 50.0]) - raise KeyError(variable) - - report = evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=290.0, - period=2024, - measure="snap", - aggregation="sum", - ) - ], - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - ) - - assert report.materialized_variables == ("snap",) - assert report.supported_target_count == 1 - assert report.evaluations[0].actual_value == 290.0 - assert report.materialization_failures == {} - 
- -def test_evaluate_policyengine_us_target_set_materializes_add_based_variables(tmp_path): - tables = _sample_tables() - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__( - self, - entity: FakeEntity, - *, - adds: list[str] | None = None, - formulas: dict[str, object] | None = None, - ): - self.entity = entity - self.adds = adds or [] - self.subtracts: list[str] = [] - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas and not self.adds - - class FakeTaxBenefitSystem: - variables = { - "employment_income": FakeVariable( - FakeEntity("person"), - adds=["employment_income_before_lsr"], - ), - "employment_income_before_lsr": FakeVariable(FakeEntity("person")), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert variable == "employment_income" - assert period == 2024 - assert map_to is None - return np.array([10.0, 20.0, 30.0]) - - report = evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="employment_income_total", - entity=EntityType.PERSON, - value=90.0, - period=2024, - measure="employment_income", - aggregation="sum", - ) - ], - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - ) - - assert report.materialized_variables == ("employment_income",) - assert report.evaluations[0].actual_value == 90.0 - - -def test_evaluate_policyengine_us_target_set_skips_failed_materializations(tmp_path): - base_tables = _sample_tables() - tables = PolicyEngineUSEntityTableBundle( - households=base_tables.households.drop(columns=["snap"]), - persons=base_tables.persons, - tax_units=base_tables.tax_units, - spm_units=base_tables.spm_units, - families=base_tables.families, - 
marital_units=base_tables.marital_units, - ) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable(FakeEntity("household"), formulas={"2024": object()}), - "income_tax": FakeVariable(FakeEntity("person"), formulas={"2024": object()}), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert map_to is None - if variable == "snap": - return np.array([120.0, 50.0]) - if variable == "income_tax": - raise RuntimeError("missing test parameter") - raise KeyError(variable) - - report = evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=290.0, - period=2024, - measure="snap", - aggregation="sum", - ), - TargetSpec( - name="income_tax_total", - entity=EntityType.PERSON, - value=0.0, - period=2024, - measure="income_tax", - aggregation="sum", - ), - ], - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - ) - - assert report.materialized_variables == ("snap",) - assert report.materialization_failures == { - "income_tax": "RuntimeError: missing test parameter" - } - assert report.supported_target_count == 1 - assert report.evaluations[0].target.name == "snap_total" - assert [target.name for target in report.unsupported_targets] == ["income_tax_total"] - - -def 
test_evaluate_policyengine_us_target_set_marks_district_targets_unsupported_when_district_geography_materializes_to_defaults(): - base_tables = _sample_tables() - tables = PolicyEngineUSEntityTableBundle( - households=base_tables.households, - persons=base_tables.persons, - tax_units=base_tables.tax_units, - spm_units=base_tables.spm_units, - families=base_tables.families, - marital_units=base_tables.marital_units, - ) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "state_fips": FakeVariable(FakeEntity("household")), - "congressional_district_geoid": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - _ = dataset_year, kwargs - - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - assert map_to is None - if variable == "congressional_district_geoid": - return np.array([0.0, 0.0]) - raise KeyError(variable) - - report = evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="district_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=( - TargetFilter( - "congressional_district_geoid", - FilterOperator.EQ, - 601, - ), - ), - ) - ], - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - ) - - assert report.materialized_variables == ("congressional_district_geoid",) - assert report.supported_target_count == 0 - assert len(report.evaluations) == 0 - assert [target.name for target in report.unsupported_targets] == [ - "district_households" - ] - - -def 
test_evaluate_policyengine_us_target_set_supports_district_targets_with_real_district_geography(): - base_tables = _sample_tables() - tables = PolicyEngineUSEntityTableBundle( - households=base_tables.households.assign( - congressional_district_geoid=np.array([601, 3601]) - ), - persons=base_tables.persons, - tax_units=base_tables.tax_units, - spm_units=base_tables.spm_units, - families=base_tables.families, - marital_units=base_tables.marital_units, - ) - - report = evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="district_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=( - TargetFilter( - "congressional_district_geoid", - FilterOperator.EQ, - 601, - ), - ), - ) - ], - period=2024, - ) - - assert report.materialized_variables == () - assert report.supported_target_count == 1 - assert not report.unsupported_targets - assert report.evaluations[0].actual_value == 2.0 - - -def test_evaluate_policyengine_us_target_set_raises_on_strict_materialization_failure( - tmp_path, -): - base_tables = _sample_tables() - tables = PolicyEngineUSEntityTableBundle( - households=base_tables.households.drop(columns=["snap"]), - persons=base_tables.persons, - tax_units=base_tables.tax_units, - spm_units=base_tables.spm_units, - families=base_tables.families, - marital_units=base_tables.marital_units, - ) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable(FakeEntity("household"), formulas={"2024": object()}), - } - - class FailingSimulation: - tax_benefit_system = 
FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert variable == "snap" - assert period == 2024 - assert map_to is None - raise RuntimeError("missing test parameter") - - with pytest.raises(PolicyEngineUSMaterializationError, match="candidate"): - evaluate_policyengine_us_target_set( - tables, - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=290.0, - period=2024, - measure="snap", - aggregation="sum", - ) - ], - period=2024, - dataset_year=2024, - simulation_cls=FailingSimulation, - strict_materialization=True, - ) - - -def test_evaluate_policyengine_us_target_set_supports_person_to_tax_unit_count_filters(): - report = evaluate_policyengine_us_target_set( - _sample_tables(), - [ - TargetSpec( - name="adult_tax_units", - entity=EntityType.TAX_UNIT, - value=1.0, - period=2024, - aggregation="count", - filters=( - TargetFilter("age", FilterOperator.GTE, 18), - ), - ) - ], - period=2024, - ) - - assert report.supported_target_count == 1 - assert len(report.evaluations) == 1 - assert report.evaluations[0].target.name == "adult_tax_units" - assert report.evaluations[0].actual_value == pytest.approx(3.0) - assert report.unsupported_targets == [] - - -def test_evaluate_policyengine_us_target_set_batches_supported_constraint_compilation( - monkeypatch, -): - call_count = 0 - real_compile = ( - comparison_module.compile_supported_policyengine_us_household_linear_constraints - ) - - def record_compile(*args, **kwargs): - nonlocal call_count - call_count += 1 - return real_compile(*args, **kwargs) - - monkeypatch.setattr( - comparison_module, - "compile_supported_policyengine_us_household_linear_constraints", - record_compile, - ) - - report = evaluate_policyengine_us_target_set( - _sample_tables(), - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - 
value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ], - period=2024, - ) - - assert report.supported_target_count == 2 - assert call_count == 1 - - -def test_compare_policyengine_us_target_query_to_baseline(tmp_path): - class EmploymentIncomeProvider: - def load_target_set(self, query=None): - _ = query - return [ - TargetSpec( - name="employment_income_before_lsr_total", - entity=EntityType.PERSON, - value=80_000.0, - period=2024, - measure="employment_income_before_lsr", - aggregation="sum", - ) - ] - - provider = EmploymentIncomeProvider() - - baseline_tables = _sample_tables() - baseline_arrays = build_policyengine_us_time_period_arrays( - baseline_tables, - period=2024, - household_variable_map={"state_fips": "state_fips"}, - person_variable_map={ - "age": "age", - "employment_income_before_lsr": "employment_income_before_lsr", - }, - ) - baseline_path = tmp_path / "enhanced_cps_2024.h5" - write_policyengine_us_time_period_dataset(baseline_arrays, baseline_path) - - base_candidate = _sample_tables() - candidate_tables = PolicyEngineUSEntityTableBundle( - households=base_candidate.households, - persons=base_candidate.persons.assign( - employment_income_before_lsr=np.array([20_000.0, 0.0, 20_000.0]) - ), - tax_units=base_candidate.tax_units, - spm_units=base_candidate.spm_units, - families=base_candidate.families, - marital_units=base_candidate.marital_units, - ) - - report = compare_policyengine_us_target_query_to_baseline( - candidate_tables, - provider, - TargetQuery( - period=2024, - provider_filters={"variables": ["employment_income_before_lsr"]}, - ), - baseline_dataset=baseline_path, - candidate_label="microplex", - baseline_label="enhanced_cps", - ) - - assert isinstance(report, PolicyEngineUSTargetComparisonReport) - assert report.candidate.label == 
"microplex" - assert report.baseline is not None - assert report.baseline.label == "enhanced_cps" - assert report.candidate.mean_abs_relative_error == pytest.approx(0.25) - assert report.baseline.mean_abs_relative_error == 0.0 - assert report.mean_abs_relative_error_delta == pytest.approx(0.25) - - -def test_policyengine_us_comparison_report_uses_common_target_intersection(): - shared_target = TargetSpec( - name="shared", - entity=EntityType.HOUSEHOLD, - value=10.0, - period=2024, - measure="snap", - aggregation="sum", - source="snap", - ) - candidate_only_target = TargetSpec( - name="candidate_only", - entity=EntityType.HOUSEHOLD, - value=10.0, - period=2024, - measure="snap", - aggregation="sum", - source="snap", - ) - baseline_only_target = TargetSpec( - name="baseline_only", - entity=EntityType.HOUSEHOLD, - value=10.0, - period=2024, - measure="snap", - aggregation="sum", - source="snap", - ) - - report = PolicyEngineUSTargetComparisonReport( - candidate=PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation(target=shared_target, actual_value=8.0), - PolicyEngineUSTargetEvaluation(target=candidate_only_target, actual_value=100.0), - ], - ), - baseline=PolicyEngineUSTargetEvaluationReport( - label="baseline", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation(target=shared_target, actual_value=9.0), - PolicyEngineUSTargetEvaluation(target=baseline_only_target, actual_value=0.0), - ], - ), - ) - - assert report.common_target_count == 1 - assert report.mean_abs_relative_error_delta == pytest.approx(0.1) - assert report.target_win_rate == 0.0 - - -def test_compare_policyengine_us_target_query_to_baseline_evaluates_baseline_first( - monkeypatch, - tmp_path, -): - provider_db = tmp_path / "policy_data.db" - _create_snap_targets_db(provider_db) - provider = PolicyEngineUSDBTargetProvider(provider_db) - - baseline_tables = _sample_tables() - baseline_arrays = 
build_policyengine_us_time_period_arrays( - baseline_tables, - period=2024, - household_variable_map={"state_fips": "state_fips", "snap": "snap"}, - person_variable_map={"age": "age"}, - ) - baseline_path = tmp_path / "enhanced_cps_2024.h5" - write_policyengine_us_time_period_dataset(baseline_arrays, baseline_path) - - call_order: list[str] = [] - - def fake_evaluate(*args, label: str, **kwargs): - _ = args, kwargs - call_order.append(label) - return type( - "FakeReport", - (), - { - "label": label, - "period": 2024, - "evaluations": [], - "unsupported_targets": [], - "materialized_variables": (), - "materialization_failures": {}, - "mean_abs_relative_error": 0.0, - "max_abs_relative_error": 0.0, - "supported_target_count": 0, - }, - )() - - monkeypatch.setattr( - "microplex_us.policyengine.comparison.evaluate_policyengine_us_target_set", - fake_evaluate, - ) - - compare_policyengine_us_target_query_to_baseline( - _sample_tables(), - provider, - TargetQuery(period=2024, provider_filters={"variables": ["snap"]}), - baseline_dataset=baseline_path, - candidate_label="microplex", - baseline_label="enhanced_cps", - ) - - assert call_order == ["enhanced_cps", "microplex"] - - -def test_compare_policyengine_us_target_query_to_baseline_reuses_cache(monkeypatch): - class CountingProvider: - def __init__(self): - self.load_calls = 0 - - def load_target_set(self, query=None): - _ = query - self.load_calls += 1 - return [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ) - ] - - provider = CountingProvider() - cache = PolicyEngineUSComparisonCache() - load_counts = {"tables": 0} - eval_counts = {"baseline": 0, "microplex": 0} - - def fake_load_policyengine_us_entity_tables(dataset, period): - _ = dataset - _ = period - load_counts["tables"] += 1 - return _sample_tables() - - def fake_evaluate(*args, label: str, **kwargs): - _ = args, kwargs - eval_counts[label] = eval_counts.get(label, 0) 
+ 1 - return type( - "FakeReport", - (), - { - "label": label, - "period": 2024, - "evaluations": [], - "unsupported_targets": [], - "materialized_variables": (), - "materialization_failures": {}, - "mean_abs_relative_error": 0.0, - "max_abs_relative_error": 0.0, - "supported_target_count": 0, - }, - )() - - monkeypatch.setattr( - "microplex_us.policyengine.comparison.load_policyengine_us_entity_tables", - fake_load_policyengine_us_entity_tables, - ) - monkeypatch.setattr( - "microplex_us.policyengine.comparison.evaluate_policyengine_us_target_set", - fake_evaluate, - ) - - for _ in range(2): - compare_policyengine_us_target_query_to_baseline( - _sample_tables(), - provider, - TargetQuery(period=2024, names=("snap_total",)), - baseline_dataset="/tmp/enhanced_cps_2024.h5", - baseline_label="baseline", - cache=cache, - ) - - assert provider.load_calls == 1 - assert load_counts["tables"] == 1 - assert eval_counts["baseline"] == 1 - assert eval_counts["microplex"] == 2 - - -def test_compare_policyengine_us_target_query_to_baseline_raises_on_strict_materialization_failure( - tmp_path, -): - provider = PolicyEngineUSDBTargetProvider(tmp_path / "policy_data.db") - _create_snap_targets_db(provider.db_path) - - baseline_tables = _sample_tables() - baseline_arrays = build_policyengine_us_time_period_arrays( - PolicyEngineUSEntityTableBundle( - households=baseline_tables.households.drop(columns=["snap"]), - persons=baseline_tables.persons, - tax_units=baseline_tables.tax_units, - spm_units=baseline_tables.spm_units, - families=baseline_tables.families, - marital_units=baseline_tables.marital_units, - ), - period=2024, - household_variable_map={"state_fips": "state_fips"}, - person_variable_map={"age": "age"}, - ) - baseline_path = tmp_path / "baseline_missing_snap.h5" - write_policyengine_us_time_period_dataset(baseline_arrays, baseline_path) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, 
formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable(FakeEntity("household"), formulas={"2024": object()}), - } - - class FailingSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert variable == "snap" - assert period == 2024 - assert map_to is None - raise RuntimeError("snap materialization unavailable") - - with pytest.raises(PolicyEngineUSMaterializationError, match="enhanced_cps"): - compare_policyengine_us_target_query_to_baseline( - _sample_tables(), - provider, - TargetQuery(period=2024, provider_filters={"variables": ["snap"]}), - baseline_dataset=baseline_path, - dataset_year=2024, - simulation_cls=FailingSimulation, - candidate_label="microplex", - baseline_label="enhanced_cps", - strict_materialization=True, - ) diff --git a/tests/policyengine/test_forbidden_export_block.py b/tests/policyengine/test_forbidden_export_block.py deleted file mode 100644 index 7a4f7c2a..00000000 --- a/tests/policyengine/test_forbidden_export_block.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Regression test: the eCPS-contract forbidden columns are excluded from export. - -The contract's ``forbidden`` set (transient ``*_reported`` takeup inputs and the -PUF reported/calculated tax-credit outputs) must never reach the written H5. 
-``resolve_policyengine_excluded_export_variables`` is the single chokepoint that -the materialize/persist path uses to build the ``excluded`` set, so we assert it -excludes every forbidden column that appears in the exported inputs, and that the -exclusion is sourced from the committed contract (single source of truth). -""" - -import json -from pathlib import Path - -from microplex_us.policyengine.us import ( - _contract_forbidden_export_columns, - resolve_policyengine_excluded_export_variables, -) - -CONTRACT_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "microplex_us" - / "pipelines" - / "ecps_export_contract.json" -) - - -def _contract_forbidden() -> set[str]: - return set(json.loads(CONTRACT_PATH.read_text())["forbidden"]) - - -class _NoComputedVariables: - """Tax-benefit-system stub whose variables are all plain inputs. - - Isolates the forbidden-block behaviour from PolicyEngine-computed detection: - with no known variables, ``detect_policyengine_computed_export_variables`` - contributes nothing, so anything excluded comes from the forbidden contract. - """ - - variables: dict = {} - - -def test_helper_matches_committed_contract_forbidden(): - assert _contract_forbidden_export_columns() == _contract_forbidden() - # Non-trivial: the contract forbids the *_reported + PUF tax-credit family. - assert "snap_reported" in _contract_forbidden_export_columns() - assert "general_business_credit" in _contract_forbidden_export_columns() - - -def test_forbidden_columns_present_in_export_are_excluded(): - forbidden = sorted(_contract_forbidden()) - # Mix forbidden columns with legitimate inputs. - exported = forbidden + ["age", "employment_income", "household_weight"] - excluded = resolve_policyengine_excluded_export_variables( - _NoComputedVariables(), exported - ) - # Every forbidden column that was offered for export must be excluded. - assert set(forbidden).issubset(excluded) - # Legitimate inputs must NOT be excluded. 
- assert "age" not in excluded - assert "employment_income" not in excluded - assert "household_weight" not in excluded - - -def test_forbidden_columns_absent_are_not_spuriously_excluded(): - # If no forbidden column is exported, none are added to the excluded set. - exported = ["age", "employment_income", "household_weight"] - excluded = resolve_policyengine_excluded_export_variables( - _NoComputedVariables(), exported - ) - assert excluded.isdisjoint(_contract_forbidden()) - - -def test_forbidden_columns_absent_from_written_h5(tmp_path): - # End-to-end: the forbidden columns must not appear in the written H5, - # not merely in the resolver's return value. This pins the full chain - # resolve -> excluded_variables -> writer. - import numpy as np - - from microplex_us.policyengine.us import ( - write_policyengine_us_time_period_dataset, - ) - - forbidden = sorted(_contract_forbidden()) - exported = forbidden + ["age", "household_weight"] - # data shape: {variable: {period: array}} - data = {name: {"2024": np.zeros(3)} for name in exported} - - excluded = resolve_policyengine_excluded_export_variables( - _NoComputedVariables(), exported - ) - out = write_policyengine_us_time_period_dataset( - data, tmp_path / "export.h5", excluded_variables=excluded - ) - - import h5py - - with h5py.File(out, "r") as handle: - written = set(handle.keys()) - # No forbidden column reaches the H5; legitimate inputs do. 
- assert written.isdisjoint(set(forbidden)) - assert "age" in written - assert "household_weight" in written diff --git a/tests/policyengine/test_harness.py b/tests/policyengine/test_harness.py deleted file mode 100644 index ea341b92..00000000 --- a/tests/policyengine/test_harness.py +++ /dev/null @@ -1,720 +0,0 @@ -"""Tests for the persistent PE-US comparison harness.""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import ( - FilterOperator, - StaticTargetProvider, - TargetFilter, - TargetQuery, - TargetSet, - TargetSpec, -) - -import microplex_us.policyengine.comparison as comparison_module -import microplex_us.policyengine.harness as harness_module -from microplex_us.policyengine import ( - PolicyEngineUSEntityTableBundle, - PolicyEngineUSHarnessRun, - PolicyEngineUSHarnessSlice, - PolicyEngineUSHarnessSliceResult, - PolicyEngineUSMaterializationError, - build_policyengine_us_time_period_arrays, - default_policyengine_us_db_all_target_slices, - default_policyengine_us_db_harness_slices, - default_policyengine_us_db_parity_slices, - evaluate_policyengine_us_harness, - filter_nonempty_policyengine_us_harness_slices, - write_policyengine_us_time_period_dataset, -) -from microplex_us.policyengine.comparison import ( - PolicyEngineUSTargetComparisonReport, - PolicyEngineUSTargetEvaluation, - PolicyEngineUSTargetEvaluationReport, -) - - -def _candidate_tables() -> PolicyEngineUSEntityTableBundle: - return PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [2.0, 1.0], - "state_fips": [6, 36], - "snap": [100.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 100, 200], - "spm_unit_id": [1000, 1000, 2000], - "family_id": [5000, 5000, 6000], - "marital_unit_id": [7000, 7000, 8000], - "age": [40.0, 10.0, 
30.0], - "employment_income": [30_000.0, 0.0, 20_000.0], - "employment_income_before_lsr": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000, 2000], - "household_id": [1, 2], - } - ), - families=pd.DataFrame( - { - "family_id": [5000, 6000], - "household_id": [1, 2], - } - ), - marital_units=pd.DataFrame( - { - "marital_unit_id": [7000, 8000], - "household_id": [1, 2], - } - ), - ) - - -def _baseline_dataset(tmp_path: Path) -> Path: - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - "snap": [75.0, 50.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 100, 200], - "spm_unit_id": [1000, 1000, 2000], - "family_id": [5000, 5000, 6000], - "marital_unit_id": [7000, 7000, 8000], - "age": [40.0, 10.0, 30.0], - "employment_income": [30_000.0, 0.0, 20_000.0], - "employment_income_before_lsr": [30_000.0, 0.0, 20_000.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [1, 2], - "filing_status": ["JOINT", "SINGLE"], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000, 2000], - "household_id": [1, 2], - } - ), - families=pd.DataFrame( - { - "family_id": [5000, 6000], - "household_id": [1, 2], - } - ), - marital_units=pd.DataFrame( - { - "marital_unit_id": [7000, 8000], - "household_id": [1, 2], - } - ), - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_fips": "state_fips", "snap": "snap"}, - person_variable_map={ - "age": "age", - "employment_income_before_lsr": "employment_income_before_lsr", - }, - tax_unit_variable_map={"filing_status": "filing_status"}, - ) - dataset_path = tmp_path / "baseline.h5" - 
write_policyengine_us_time_period_dataset(arrays, dataset_path) - return dataset_path - - -def test_evaluate_policyengine_us_harness_scores_candidate_against_baseline(tmp_path): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="employment_income_before_lsr_total", - entity=EntityType.PERSON, - value=80_000.0, - period=2024, - measure="employment_income_before_lsr", - aggregation="sum", - ), - ] - ) - ) - slices = [ - PolicyEngineUSHarnessSlice( - name="counts", - tags=("national", "counts"), - query=TargetQuery(period=2024, names=("ca_households",)), - ), - PolicyEngineUSHarnessSlice( - name="employment_income_before_lsr", - tags=("national", "programs"), - query=TargetQuery(period=2024, names=("employment_income_before_lsr_total",)), - ), - ] - - run = evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - slices, - baseline_dataset=_baseline_dataset(tmp_path), - dataset_year=2024, - candidate_label="candidate", - baseline_label="baseline", - metadata={"git_commit": "abc123"}, - ) - - assert run.candidate_label == "candidate" - assert run.baseline_label == "baseline" - assert len(run.slice_results) == 2 - assert run.mean_abs_relative_error_delta is not None - assert run.mean_abs_relative_error_delta < 0.0 - assert run.benchmark_suite.mean_abs_relative_error_delta is not None - assert run.benchmark_suite.mean_abs_relative_error_delta < 0.0 - assert run.candidate_composite_parity_loss is not None - assert run.baseline_composite_parity_loss is not None - assert run.composite_parity_loss_delta is not None - assert run.composite_parity_loss_delta < 0.0 - assert run.slice_win_rate == 1.0 - assert run.benchmark_suite.slice_win_rate == 1.0 - assert run.target_win_rate == 1.0 - assert run.supported_target_rate == 1.0 - assert 
run.tag_summaries["national"]["supported_target_rate"] == 1.0 - assert run.tag_summaries["national"]["candidate_composite_parity_loss"] is not None - assert run.parity_scorecard["overall"]["candidate_beats_baseline"] is True - assert run.parity_scorecard["national"]["candidate_beats_baseline"] is True - assert run.attribute_cell_summaries - assert run.metadata["git_commit"] == "abc123" - - -def test_policyengine_us_harness_run_round_trips_json(tmp_path): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - ) - run = evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - [ - PolicyEngineUSHarnessSlice( - name="core", - query=TargetQuery(period=2024, names=("ca_households", "snap_total")), - description="Core parity slice", - tags=("national", "programs"), - ) - ], - baseline_dataset=_baseline_dataset(tmp_path), - dataset_year=2024, - ) - - output_path = run.save(tmp_path / "reports" / "harness.json") - loaded = PolicyEngineUSHarnessRun.load(output_path) - payload = json.loads(output_path.read_text()) - - assert loaded.candidate_label == run.candidate_label - assert loaded.baseline_label == run.baseline_label - assert loaded.period == run.period - assert loaded.slice_results[0].slice.name == "core" - assert loaded.slice_results[0].slice.description == "Core parity slice" - assert loaded.slice_results[0].slice.tags == ("national", "programs") - assert loaded.slice_results[0].comparison.candidate.evaluations[0].target.filters - assert ( - loaded.slice_results[0] - .comparison.candidate.evaluations[0] - .target.filters[0] - .feature - == "state_fips" - ) - assert loaded.slice_win_rate == run.slice_win_rate - assert 
loaded.candidate_composite_parity_loss == run.candidate_composite_parity_loss - assert payload["slices"][0]["summary"]["candidate_supported_target_count"] == 2 - assert payload["slices"][0]["summary"]["baseline_supported_target_count"] == 2 - - -def test_policyengine_us_harness_preserves_duplicate_target_names_across_slices(): - target = TargetSpec( - name="population", - entity=EntityType.HOUSEHOLD, - value=100.0, - period=2024, - aggregation="count", - metadata={ - "geo_level": "state", - "domain_variable": "age", - }, - ) - run = PolicyEngineUSHarnessRun( - candidate_label="candidate", - baseline_label="baseline", - period=2024, - slice_results=[ - PolicyEngineUSHarnessSliceResult( - slice=PolicyEngineUSHarnessSlice( - name="slice_a", - query=TargetQuery(period=2024, names=("population",)), - ), - comparison=PolicyEngineUSTargetComparisonReport( - candidate=PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=100.0, - ) - ], - ), - baseline=PolicyEngineUSTargetEvaluationReport( - label="baseline", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=80.0, - ) - ], - ), - ), - ), - PolicyEngineUSHarnessSliceResult( - slice=PolicyEngineUSHarnessSlice( - name="slice_b", - query=TargetQuery(period=2024, names=("population",)), - ), - comparison=PolicyEngineUSTargetComparisonReport( - candidate=PolicyEngineUSTargetEvaluationReport( - label="candidate", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=50.0, - ) - ], - ), - baseline=PolicyEngineUSTargetEvaluationReport( - label="baseline", - period=2024, - evaluations=[ - PolicyEngineUSTargetEvaluation( - target=target, - actual_value=60.0, - ) - ], - ), - ), - ), - ], - ) - - cell_key = "geo=state|entity=household|aggregation=count|feature=household_count|domain=age" - - assert run.candidate_micro_mean_abs_relative_error == 
pytest.approx(0.25) - assert run.baseline_micro_mean_abs_relative_error == pytest.approx(0.30) - assert run.attribute_cell_summaries[cell_key]["candidate_target_count"] == 2 - assert run.attribute_cell_summaries[cell_key]["baseline_target_count"] == 2 - - -def test_evaluate_policyengine_us_harness_raises_on_strict_materialization_failure( - tmp_path, -): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - ) - baseline_tables = PolicyEngineUSEntityTableBundle( - households=_candidate_tables().households.drop(columns=["snap"]), - persons=_candidate_tables().persons, - tax_units=_candidate_tables().tax_units, - spm_units=_candidate_tables().spm_units, - families=_candidate_tables().families, - marital_units=_candidate_tables().marital_units, - ) - arrays = build_policyengine_us_time_period_arrays( - baseline_tables, - period=2024, - household_variable_map={"state_fips": "state_fips"}, - person_variable_map={"age": "age", "employment_income": "employment_income"}, - tax_unit_variable_map={"filing_status": "filing_status"}, - ) - baseline_path = tmp_path / "baseline_missing_snap.h5" - write_policyengine_us_time_period_dataset(arrays, baseline_path) - - class FakeEntity: - def __init__(self, key: str): - self.key = key - - class FakeVariable: - def __init__(self, entity: FakeEntity, formulas: dict[str, object] | None = None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self) -> bool: - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable(FakeEntity("household"), formulas={"2024": object()}), - } - - class FailingSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - assert Path(dataset).exists() - assert dataset_year == 2024 
- _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert variable == "snap" - assert period == 2024 - assert map_to is None - raise RuntimeError("snap materialization unavailable") - - with pytest.raises(PolicyEngineUSMaterializationError, match="candidate"): - evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - [ - PolicyEngineUSHarnessSlice( - name="snap", - query=TargetQuery(period=2024, names=("snap_total",)), - ) - ], - baseline_dataset=baseline_path, - dataset_year=2024, - simulation_cls=FailingSimulation, - candidate_label="candidate", - baseline_label="baseline", - strict_materialization=True, - ) - - -def test_filter_nonempty_policyengine_us_harness_slices_drops_empty_queries(): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - ) - - filtered = filter_nonempty_policyengine_us_harness_slices( - provider, - ( - PolicyEngineUSHarnessSlice( - name="counts", - query=TargetQuery(period=2024, names=("household_count",)), - ), - PolicyEngineUSHarnessSlice( - name="snap", - query=TargetQuery(period=2024, names=("snap_total",)), - ), - ), - ) - - assert [slice_spec.name for slice_spec in filtered] == ["snap"] - - -def test_evaluate_policyengine_us_harness_reuses_union_evaluation(tmp_path, monkeypatch): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="employment_income_before_lsr_total", - entity=EntityType.PERSON, - value=80_000.0, - period=2024, - measure="employment_income_before_lsr", - aggregation="sum", - ), - ] - ) - ) - evaluate_calls: list[tuple[str, ...]] = [] - real_evaluate = comparison_module.evaluate_policyengine_us_target_set - - def 
record_evaluate(*args, **kwargs): - targets = args[1] - if isinstance(targets, TargetSet): - target_names = tuple(target.name for target in targets.targets) - else: - target_names = tuple(target.name for target in targets) - evaluate_calls.append(target_names) - return real_evaluate(*args, **kwargs) - - monkeypatch.setattr(comparison_module, "evaluate_policyengine_us_target_set", record_evaluate) - monkeypatch.setattr(harness_module, "evaluate_policyengine_us_target_set", record_evaluate) - - run = evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - [ - PolicyEngineUSHarnessSlice( - name="counts", - query=TargetQuery(period=2024, names=("ca_households",)), - ), - PolicyEngineUSHarnessSlice( - name="employment_income_before_lsr", - query=TargetQuery(period=2024, names=("employment_income_before_lsr_total",)), - ), - ], - baseline_dataset=_baseline_dataset(tmp_path), - dataset_year=2024, - ) - - assert run.slice_win_rate == 1.0 - assert evaluate_calls == [ - ("ca_households", "employment_income_before_lsr_total"), - ("ca_households", "employment_income_before_lsr_total"), - ] - - -def test_evaluate_policyengine_us_harness_passes_candidate_direct_override_variables( - tmp_path, - monkeypatch, -): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="snap_total", - entity=EntityType.HOUSEHOLD, - value=250.0, - period=2024, - measure="snap", - aggregation="sum", - ), - ] - ) - ) - captured: list[tuple[str, ...]] = [] - real_evaluate = comparison_module.evaluate_policyengine_us_target_sets - - def record_evaluate(*args, **kwargs): - captured.append(tuple(kwargs.get("direct_override_variables", ()))) - return real_evaluate(*args, **kwargs) - - monkeypatch.setattr( - comparison_module, - "evaluate_policyengine_us_target_sets", - record_evaluate, - ) - monkeypatch.setattr( - harness_module, - "evaluate_policyengine_us_target_sets", - record_evaluate, - ) - - evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - [ - 
PolicyEngineUSHarnessSlice( - name="snap", - query=TargetQuery(period=2024, names=("snap_total",)), - ), - ], - baseline_dataset=_baseline_dataset(tmp_path), - dataset_year=2024, - candidate_direct_override_variables=("snap", "ssi"), - ) - - assert captured == [("snap", "ssi")] - - -def test_evaluate_policyengine_us_harness_excludes_zero_common_slices_from_suite( - tmp_path, -): - provider = StaticTargetProvider( - TargetSet( - [ - TargetSpec( - name="ca_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=(TargetFilter("state_fips", FilterOperator.EQ, 6),), - ), - TargetSpec( - name="district_households", - entity=EntityType.HOUSEHOLD, - value=2.0, - period=2024, - aggregation="count", - filters=( - TargetFilter( - "congressional_district_geoid", - FilterOperator.EQ, - 601, - ), - ), - ), - ] - ) - ) - - run = evaluate_policyengine_us_harness( - _candidate_tables(), - provider, - [ - PolicyEngineUSHarnessSlice( - name="state", - query=TargetQuery(period=2024, names=("ca_households",)), - tags=("local", "state"), - ), - PolicyEngineUSHarnessSlice( - name="district", - query=TargetQuery(period=2024, names=("district_households",)), - tags=("local", "district"), - ), - ], - baseline_dataset=_baseline_dataset(tmp_path), - dataset_year=2024, - strict_materialization=False, - ) - - assert len(run.slice_results) == 2 - assert run.slice_results[1].comparison.benchmark_comparison is None - assert [result.slice.name for result in run.benchmark_suite.slice_results] == ["state"] - assert run.metadata["excluded_slice_names"] == ["district"] - - -def test_default_policyengine_us_db_harness_slices_tracks_provider_filters(): - slices = default_policyengine_us_db_harness_slices( - period=2024, - variables=("snap", "household_count"), - domain_variables=("snap",), - geo_levels=("state",), - reform_id=3, - ) - - assert [slice_spec.name for slice_spec in slices] == [ - "all_targets", - "snap", - "household_count", - ] - assert 
slices[0].query.provider_filters == { - "reform_id": 3, - "variables": ["snap", "household_count"], - "domain_variables": ["snap"], - "geo_levels": ["state"], - } - assert slices[1].query.provider_filters["variables"] == ["snap"] - assert slices[2].query.provider_filters["variables"] == ["household_count"] - - -def test_default_policyengine_us_db_all_target_slices_span_all_active_targets(): - slices = default_policyengine_us_db_all_target_slices(period=2024, reform_id=3) - - assert [slice_spec.name for slice_spec in slices] == ["all_targets"] - assert slices[0].tags == ("benchmark", "all_targets") - assert slices[0].query.provider_filters == {"reform_id": 3} - - -def test_default_policyengine_us_db_parity_slices_track_tags_and_filters(): - slices = default_policyengine_us_db_parity_slices( - period=2024, - variables=("snap", "household_count"), - domain_variables=("snap",), - geo_levels=("state", "district"), - reform_id=3, - ) - - assert [slice_spec.name for slice_spec in slices] == [ - "state_programs_core", - "district_snap_households", - ] - assert slices[0].tags == ("parity", "local", "state", "programs") - assert slices[0].query.provider_filters == { - "reform_id": 3, - "variables": ["household_count"], - "domain_variable_values": ["snap"], - "geo_levels": ["state"], - } - assert slices[-1].tags == ("parity", "local", "district", "programs", "snap") - assert slices[-1].query.provider_filters["geo_levels"] == ["district"] - assert slices[-1].query.provider_filters["domain_variable_values"] == ["snap"] diff --git a/tests/policyengine/test_materialize_batched.py b/tests/policyengine/test_materialize_batched.py deleted file mode 100644 index 13c5c968..00000000 --- a/tests/policyengine/test_materialize_batched.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Batched-materialize equivalence tests. - -Covers the batched path of :func:`materialize_policyengine_us_variables` -without spinning up a real PolicyEngine Microsimulation. 
A fake -``simulation_cls`` mimics the per-record-scalar semantics that -calibration targets actually use (each output is a function of the -calling chunk's own data, independent of other chunks). The test then -proves that running the function with ``batch_size=None`` and with a -sub-full batch size produces identical results. -""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any - -import numpy as np -import pandas as pd -import pytest -from microplex.core import EntityType - -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - materialize_policyengine_us_variables, -) - - -@dataclass -class FakeVariable: - """Stand-in for a PolicyEngine Variable metadata entry.""" - - name: str - entity: str # "household" | "person" | etc. - - -class FakeEntity: - def __init__(self, key: str) -> None: - self.key = key - - -class FakeTaxBenefitSystem: - """Enough of the TaxBenefitSystem interface to satisfy the materializer. - - The real resolver checks a variables registry + entity registry. The - fake returns hardcoded entries for the test's target variables. - """ - - def __init__(self, variables: dict[str, FakeVariable]) -> None: - self.variables = variables - self.entities = [FakeEntity(k) for k in ("person", "household", "tax_unit")] - - def get_variable(self, name: str) -> FakeVariable: - if name not in self.variables: - raise KeyError(name) - return self.variables[name] - - -class FakeSimulation: - """Fake Microsimulation that computes per-record values deterministically. - - Each variable's value is a pure function of a household-level input - column the fake reads from the provided dataset path. Writing a - real h5 would require the full PolicyEngine dataset machinery; for - the test we instead accept an in-memory ``dataset`` dict. 
- """ - - def __init__(self, dataset: str | None = None, **kwargs: Any) -> None: - # The real code writes an h5 and points the sim at its path; - # for this fake we pull the chunk arrays off ``_fake_chunk_data`` - # (set via the monkeypatch below). - chunk = getattr(FakeSimulation, "_fake_chunk_data", None) - if chunk is None: - raise RuntimeError( - "FakeSimulation needs _fake_chunk_data set by the test." - ) - self._hh = chunk["households"] - self.tax_benefit_system = FakeTaxBenefitSystem( - { - "doubled_base": FakeVariable(name="doubled_base", entity="household"), - "squared_base": FakeVariable(name="squared_base", entity="household"), - } - ) - - def calculate(self, variable: str, period: Any = None, map_to: Any = None): - # Pure per-record scalar; returns len(households) values. - base = self._hh["base_value"].to_numpy(dtype=float) - if variable == "doubled_base": - return base * 2.0 - if variable == "squared_base": - return base**2 - raise KeyError(variable) - - -@pytest.fixture -def fake_sim(monkeypatch): - """Register FakeSimulation as the simulation_cls and patch the - materializer's internal helpers so they accept our in-memory chunk.""" - # Patch the module-level resolver the materializer uses to look up - # the tax-benefit system. We monkey the whole pipeline rather than - # write a real h5. 
- from microplex_us.policyengine import us as us_module - - monkeypatch.setattr( - us_module, - "_resolve_policyengine_us_tax_benefit_system", - lambda simulation_cls=None: FakeTaxBenefitSystem( - { - "doubled_base": FakeVariable("doubled_base", "household"), - "squared_base": FakeVariable("squared_base", "household"), - } - ), - ) - monkeypatch.setattr( - us_module, - "build_policyengine_us_export_variable_maps", - lambda tables, **_: { - "household": {"base_value": "base_value"}, - "person": {}, - "tax_unit": {}, - "spm_unit": {}, - "family": {}, - }, - ) - monkeypatch.setattr( - us_module, - "resolve_policyengine_excluded_export_variables", - lambda *args, **kwargs: set(), - ) - - def _build_arrays(tables, **kwargs): - # The real function produces a period-keyed dict of arrays; we - # just stash the chunk on the fake class and ignore the output. - FakeSimulation._fake_chunk_data = { - "households": tables.households, - } - return {} - - monkeypatch.setattr( - us_module, - "build_policyengine_us_time_period_arrays", - _build_arrays, - ) - monkeypatch.setattr( - us_module, - "write_policyengine_us_time_period_dataset", - lambda *args, **kwargs: None, - ) - - # Patch the adapter factory to return our fake - from microplex_us.policyengine.us import ( - PolicyEngineUSMicrosimulationAdapter, - ) - - def _fake_from_dataset(*args, **kwargs): - return PolicyEngineUSMicrosimulationAdapter(simulation=FakeSimulation()) - - monkeypatch.setattr( - PolicyEngineUSMicrosimulationAdapter, - "from_dataset", - classmethod(lambda cls, *a, **k: _fake_from_dataset(*a, **k)), - ) - - # Patch variable_entity so the attach helper routes all variables - # to the household table. 
- monkeypatch.setattr( - PolicyEngineUSMicrosimulationAdapter, - "variable_entity", - lambda self, variable: EntityType.HOUSEHOLD, - ) - - -def _make_bundle(n: int = 50, seed: int = 0) -> PolicyEngineUSEntityTableBundle: - rng = np.random.default_rng(seed) - household_ids = np.arange(n) + 1 - households = pd.DataFrame( - { - "household_id": household_ids, - "base_value": rng.uniform(1, 10, size=n), - } - ) - persons = pd.DataFrame( - { - "household_id": household_ids, - "person_id": household_ids * 10, - } - ) - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - - -class TestBatchedMaterializeEquivalence: - """Batched output must equal single-pass output element-wise.""" - - def test_single_pass_vs_batched_equivalent(self, fake_sim) -> None: - tables = _make_bundle(n=50) - - full_tables, full_bindings = materialize_policyengine_us_variables( - tables, - variables=["doubled_base", "squared_base"], - period=2024, - batch_size=None, - ) - batched_tables, batched_bindings = materialize_policyengine_us_variables( - tables, - variables=["doubled_base", "squared_base"], - period=2024, - batch_size=10, # 5 chunks - ) - - pd.testing.assert_frame_equal( - full_tables.households.sort_values("household_id").reset_index(drop=True), - batched_tables.households.sort_values("household_id").reset_index(drop=True), - ) - assert set(full_bindings) == set(batched_bindings) - - def test_batch_size_larger_than_data_is_noop(self, fake_sim) -> None: - tables = _make_bundle(n=10) - full, _ = materialize_policyengine_us_variables( - tables, - variables=["doubled_base"], - period=2024, - batch_size=None, - ) - batched, _ = materialize_policyengine_us_variables( - tables, - variables=["doubled_base"], - period=2024, - batch_size=10_000, # > n=10 - ) - pd.testing.assert_frame_equal(full.households, batched.households) - - def test_uneven_batch_split(self, fake_sim) -> None: - """50 
records with batch_size=17 → chunks of 17, 17, 16.""" - tables = _make_bundle(n=50) - batched, _ = materialize_policyengine_us_variables( - tables, - variables=["doubled_base"], - period=2024, - batch_size=17, - ) - assert len(batched.households) == 50 - # Values correct (doubled_base = 2 * base_value) - np.testing.assert_allclose( - batched.households["doubled_base"].to_numpy(), - 2.0 * batched.households["base_value"].to_numpy(), - rtol=0, - atol=0, - ) diff --git a/tests/policyengine/test_target_profiles.py b/tests/policyengine/test_target_profiles.py deleted file mode 100644 index 68382cfc..00000000 --- a/tests/policyengine/test_target_profiles.py +++ /dev/null @@ -1,473 +0,0 @@ -from __future__ import annotations - -from microplex_us.policyengine.target_profiles import ( - policyengine_us_target_profile_exclusion_reasons, - policyengine_us_target_profile_names, - resolve_policyengine_us_target_profile, -) - - -def test_policyengine_us_target_profile_names_include_no_state_aca_variant() -> None: - assert "pe_native_broad" in policyengine_us_target_profile_names() - assert "pe_native_broad_no_state_aca" in policyengine_us_target_profile_names() - assert "pe_native_broad_source_backed" in policyengine_us_target_profile_names() - - -def test_broad_profile_includes_soi_employment_income_cells() -> None: - broad = resolve_policyengine_us_target_profile("pe_native_broad") - broad_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - } - - assert ( - "employment_income", - "national", - "employment_income", - None, - ) in broad_cells - assert ( - "tax_unit_count", - "national", - "employment_income", - None, - ) in broad_cells - assert ( - "employment_income", - "state", - "employment_income", - None, - ) in broad_cells - assert ( - "tax_unit_count", - "state", - "employment_income", - None, - ) in broad_cells - - -def test_broad_profile_includes_bea_full_population_amount_cells() -> None: - broad = 
resolve_policyengine_us_target_profile("pe_native_broad") - broad_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - } - - assert ( - "dividend_income", - "national", - None, - None, - ) in broad_cells - assert ( - "employment_income_before_lsr", - "national", - None, - None, - ) in broad_cells - assert ( - "rental_income", - "national", - None, - None, - ) in broad_cells - assert ( - "self_employment_income", - "national", - None, - None, - ) in broad_cells - assert ( - "employment_income_before_lsr", - "state", - None, - None, - ) in broad_cells - assert ( - "self_employment_income", - "state", - None, - None, - ) in broad_cells - - -def test_broad_profile_covers_current_policyengine_target_db_cells() -> None: - broad = resolve_policyengine_us_target_profile("pe_native_broad") - broad_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - } - - added_policyengine_cells = { - ("aca_ptc", "national", "aca_ptc", None), - ("adjusted_gross_income", "national", "adjusted_gross_income", None), - ( - "adjusted_gross_income", - "national", - "adjusted_gross_income,filing_status,income_tax_before_credits", - None, - ), - ( - "adjusted_gross_income", - "national", - "adjusted_gross_income,income_tax_before_credits", - None, - ), - ("childcare_expenses", "national", None, None), - ("deductible_mortgage_interest", "national", None, None), - ("household_count", "national", "spm_unit_energy_subsidy_reported", None), - ( - "medical_expense_deduction", - "national", - "medical_expense_deduction,tax_unit_itemizes", - None, - ), - ( - "non_refundable_ctc", - "national", - "adjusted_gross_income,non_refundable_ctc", - None, - ), - ("non_refundable_ctc", "national", "non_refundable_ctc", None), - ( - "real_estate_taxes", - "national", - "real_estate_taxes,tax_unit_itemizes", - None, - ), - ( - "refundable_ctc", - "national", - "adjusted_gross_income,refundable_ctc", - None, - 
), - ("roth_401k_contributions", "national", None, None), - ("salt", "national", "salt,tax_unit_itemizes", None), - ("self_employed_pension_contribution_ald", "national", None, None), - ("spm_unit_count", "national", "tanf", None), - ("tanf", "national", "tanf", None), - ("tax_unit_count", "national", "adjusted_gross_income", None), - ( - "tax_unit_count", - "national", - "adjusted_gross_income,filing_status,income_tax_before_credits", - None, - ), - ( - "tax_unit_count", - "national", - "adjusted_gross_income,income_tax_before_credits", - None, - ), - ( - "tax_unit_count", - "national", - "adjusted_gross_income,non_refundable_ctc", - None, - ), - ( - "tax_unit_count", - "national", - "adjusted_gross_income,refundable_ctc", - None, - ), - ( - "tax_unit_count", - "national", - "medical_expense_deduction,tax_unit_itemizes", - None, - ), - ("tax_unit_count", "national", "non_refundable_ctc", None), - ( - "tax_unit_count", - "national", - "real_estate_taxes,tax_unit_itemizes", - None, - ), - ("tax_unit_count", "national", "salt,tax_unit_itemizes", None), - ("tax_unit_count", "national", "total_self_employment_income", None), - ( - "total_self_employment_income", - "national", - "total_self_employment_income", - None, - ), - ("traditional_401k_contributions", "national", None, None), - ("aca_ptc", "state", "aca_ptc", None), - ("adjusted_gross_income", "state", "adjusted_gross_income", None), - ( - "medical_expense_deduction", - "state", - "medical_expense_deduction,tax_unit_itemizes", - None, - ), - ("non_refundable_ctc", "state", "non_refundable_ctc", None), - ("person_count", "state", "aca_ptc,is_aca_ptc_eligible", None), - ("person_count", "state", "is_pregnant", None), - ( - "real_estate_taxes", - "state", - "real_estate_taxes,tax_unit_itemizes", - None, - ), - ("salt", "state", "salt,tax_unit_itemizes", None), - ("spm_unit_count", "state", "tanf", None), - ("tanf", "state", "tanf", None), - ( - "tax_unit_count", - "state", - 
"medical_expense_deduction,tax_unit_itemizes", - None, - ), - ("tax_unit_count", "state", "non_refundable_ctc", None), - ( - "tax_unit_count", - "state", - "real_estate_taxes,tax_unit_itemizes", - None, - ), - ("tax_unit_count", "state", "salt,tax_unit_itemizes", None), - ( - "tax_unit_count", - "state", - "selected_marketplace_plan_benchmark_ratio,used_aca_ptc", - None, - ), - ("tax_unit_count", "state", "total_self_employment_income", None), - ("tax_unit_count", "state", "used_aca_ptc", None), - ( - "total_self_employment_income", - "state", - "total_self_employment_income", - None, - ), - } - - assert added_policyengine_cells <= broad_cells - - -def test_broad_profile_has_no_duplicate_cells() -> None: - broad = resolve_policyengine_us_target_profile("pe_native_broad") - broad_cells = [ - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - ] - - assert len(broad_cells) == len(set(broad_cells)) - - -def test_no_state_aca_profile_excludes_only_state_aca_cells() -> None: - broad = resolve_policyengine_us_target_profile("pe_native_broad") - no_state_aca = resolve_policyengine_us_target_profile( - "pe_native_broad_no_state_aca" - ) - - broad_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - } - no_state_aca_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in no_state_aca - } - - assert ( - "aca_ptc", - "state", - None, - None, - ) in broad_cells - assert ( - "tax_unit_count", - "state", - "aca_ptc", - None, - ) in broad_cells - assert ( - "aca_ptc", - "state", - "aca_ptc", - None, - ) in broad_cells - assert ( - "person_count", - "state", - "aca_ptc,is_aca_ptc_eligible", - None, - ) in broad_cells - assert ( - "tax_unit_count", - "state", - "used_aca_ptc", - None, - ) in broad_cells - assert ( - "tax_unit_count", - "national", - "aca_ptc", - None, - ) in no_state_aca_cells - assert ( - "aca_ptc", - "state", - None, - 
None, - ) not in no_state_aca_cells - assert ( - "tax_unit_count", - "state", - "aca_ptc", - None, - ) not in no_state_aca_cells - assert ( - "aca_ptc", - "state", - "aca_ptc", - None, - ) not in no_state_aca_cells - assert ( - "person_count", - "state", - "aca_ptc", - None, - ) not in no_state_aca_cells - assert ( - "person_count", - "state", - "aca_ptc,is_aca_ptc_eligible", - None, - ) not in no_state_aca_cells - assert ( - "tax_unit_count", - "state", - "selected_marketplace_plan_benchmark_ratio,used_aca_ptc", - None, - ) not in no_state_aca_cells - assert ( - "tax_unit_count", - "state", - "used_aca_ptc", - None, - ) not in no_state_aca_cells - - -def test_source_backed_profile_excludes_only_documented_non_source_cells() -> None: - broad = resolve_policyengine_us_target_profile("pe_native_broad") - source_backed = resolve_policyengine_us_target_profile( - "pe_native_broad_source_backed" - ) - exclusion_reasons = policyengine_us_target_profile_exclusion_reasons( - "pe_native_broad_source_backed" - ) - - broad_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in broad - } - source_backed_cells = { - (cell.variable, cell.geo_level, cell.domain_variable, cell.geographic_id) - for cell in source_backed - } - - assert len(broad_cells) == 220 - assert len(exclusion_reasons) == 37 - assert all(reason for reason in exclusion_reasons.values()) - assert set(exclusion_reasons) <= broad_cells - assert len(source_backed_cells) == 183 - assert source_backed_cells == broad_cells - set(exclusion_reasons) - assert ( - "childcare_expenses", - "national", - None, - None, - ) not in source_backed_cells - assert ( - "person_count", - "state", - "is_pregnant", - None, - ) not in source_backed_cells - assert ( - "person_count", - "state", - "adjusted_gross_income", - None, - ) not in source_backed_cells - assert ( - "salt", - "national", - "salt", - None, - ) not in source_backed_cells - assert ( - "tax_unit_count", - "national", - 
"salt,tax_unit_itemizes", - None, - ) not in source_backed_cells - assert ( - "salt", - "state", - "salt,tax_unit_itemizes", - None, - ) not in source_backed_cells - assert ( - "employment_income_before_lsr", - "national", - None, - None, - ) in source_backed_cells - assert ( - "employment_income_before_lsr", - "state", - None, - None, - ) in source_backed_cells - assert ( - "alimony_expense", - "national", - None, - None, - ) in source_backed_cells - assert ( - "alimony_income", - "national", - None, - None, - ) in source_backed_cells - assert ( - "medicare_part_b_premiums", - "national", - None, - None, - ) in source_backed_cells - assert ( - "net_worth", - "national", - None, - None, - ) in source_backed_cells - assert ( - "salt", - "national", - "salt,tax_unit_itemizes", - None, - ) in source_backed_cells - assert ( - "person_count", - "state", - "ssi,is_ssi_disabled", - None, - ) in source_backed_cells - assert ( - "ssi", - "state", - None, - None, - ) in source_backed_cells - assert ( - "ssi", - "national", - "ssi,is_ssi_aged", - None, - ) in source_backed_cells diff --git a/tests/policyengine/test_us.py b/tests/policyengine/test_us.py deleted file mode 100644 index 2f7965f5..00000000 --- a/tests/policyengine/test_us.py +++ /dev/null @@ -1,3698 +0,0 @@ -"""Tests for PolicyEngine US integration helpers.""" - -from __future__ import annotations - -import sqlite3 -import subprocess -import sys -from pathlib import Path - -import h5py -import numpy as np -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetFilter, - TargetProvider, - TargetQuery, - TargetSimulationModifier, - TargetSpec, -) - -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline -from microplex_us.policyengine.us import ( - POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES, - SAFE_POLICYENGINE_US_EXPORT_VARIABLES, - PolicyEngineUSConstraint, - PolicyEngineUSDBTarget, - 
PolicyEngineUSDBTargetProvider, - PolicyEngineUSEntityTableBundle, - PolicyEngineUSMicrosimulationAdapter, - PolicyEngineUSQuantityTarget, - PolicyEngineUSSimulationTargetCompiler, - PolicyEngineUSStratum, - PolicyEngineUSTargetValidationError, - PolicyEngineUSVariableBinding, - PolicyEngineUSVariableMaterializationResult, - build_policyengine_us_export_column_names, - build_policyengine_us_export_variable_maps, - build_policyengine_us_time_period_arrays, - compile_policyengine_us_household_linear_constraints, - compute_marketplace_plan_benchmark_ratio, - compute_policyengine_us_definition_hash, - detect_policyengine_pseudo_inputs, - materialize_policyengine_us_variables, - materialize_policyengine_us_variables_safely, - policyengine_us_variables_to_materialize, - project_frame_to_time_period_arrays, - resolve_policyengine_excluded_export_variables, - write_policyengine_us_time_period_dataset, -) - - -def _create_policyengine_targets_db(path: Path) -> None: - national_constraints: tuple[PolicyEngineUSConstraint, ...] 
= () - california_senior_constraints = ( - PolicyEngineUSConstraint("state_fips", "==", "06"), - PolicyEngineUSConstraint("age", ">=", "65"), - ) - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - CASE - WHEN t.stratum_id = 2 THEN 'state' - ELSE 'national' - END AS geo_level, - CASE - WHEN t.stratum_id = 2 THEN '06' - ELSE 'US' - END AS geographic_id, - CASE - WHEN t.stratum_id = 2 THEN 'snap' - ELSE NULL - END AS domain_variable - FROM targets AS t; - """ - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - [ - ( - 1, - compute_policyengine_us_definition_hash(national_constraints), - None, - ), - ( - 2, - compute_policyengine_us_definition_hash( - california_senior_constraints, - parent_stratum_id=1, - ), - 1, - ), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (2, "state_fips", "==", "06"), - (2, "age", ">=", "65"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - (10, "snap", 2024, 1, 0, 114_100_000_000.0, 1, 5.0, "CBO", "National SNAP"), - ( - 11, - "snap", - 2024, - 2, - 0, - 9_500_000_000.0, - 1, - 10.0, - "CBO", - "California senior SNAP", - ), - ], - ) - conn.commit() - conn.close() - - -def _create_profile_filter_targets_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'national' AS geo_level, - 'US' AS geographic_id, - NULL AS domain_variable - FROM targets AS t; - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - (1, compute_policyengine_us_definition_hash(()), None), - ) - conn.executemany( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 20, - "employment_income_before_lsr", - 2024, - 1, - 0, - 8_000_000_000_000.0, - 1, - 5.0, - "BEA", - "Source-backed employment income", - ), - ( - 21, - "childcare_expenses", - 2024, - 1, - 0, - 80_000_000_000.0, - 1, - 5.0, - "synthetic", - "Broad profile only", - ), - ], - ) - conn.commit() - conn.close() - - -class TestPolicyEngineUSDBTargetProvider: - def test_load_targets_includes_constraints(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - targets = provider.load_targets(period=2024, variables=["snap"]) - - assert len(targets) == 2 - unconstrained, constrained = targets - assert unconstrained.target_id == 10 - assert unconstrained.constraints == () - assert constrained.target_id == 11 - assert { - (c.variable, c.operation, c.value) for c in constrained.constraints - } == { - ("age", ">=", "65"), - ("state_fips", "==", "06"), - } - assert constrained.parent_stratum_id == 1 - assert constrained.definition_hash is not None - - def test_load_strata_returns_hierarchy_metadata(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - strata = provider.load_strata([2]) - - assert strata == { - 1: PolicyEngineUSStratum( - stratum_id=1, - definition_hash=compute_policyengine_us_definition_hash(()), - parent_stratum_id=None, - constraints=(), - ), - 2: PolicyEngineUSStratum( - stratum_id=2, - definition_hash=compute_policyengine_us_definition_hash( - ( - PolicyEngineUSConstraint("state_fips", "==", "06"), - PolicyEngineUSConstraint("age", ">=", "65"), - ), - parent_stratum_id=1, - ), - parent_stratum_id=1, - constraints=( - PolicyEngineUSConstraint("age", ">=", "65"), - PolicyEngineUSConstraint("state_fips", "==", "06"), - ), - ), - } - - def test_load_target_set_returns_canonical_targets(self, tmp_path): - db_path = tmp_path / "policy_data.db" - 
_create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={"variables": ["snap"]}, - ) - ) - - assert isinstance(provider, TargetProvider) - assert len(target_set.targets) == 2 - assert all(isinstance(target, TargetSpec) for target in target_set.targets) - assert target_set.targets[0].measure == "snap" - assert target_set.targets[1].metadata["parent_stratum_id"] == 1 - assert target_set.targets[1].metadata["constraint_count"] == 2 - assert target_set.targets[1].metadata["stratum_definition_hash"] is not None - - def test_load_target_set_applies_calibration_target_profile(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_profile_filter_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_profile": "pe_native_broad", - "calibration_target_profile": "pe_native_broad_source_backed", - }, - ) - ) - - assert [target.measure for target in target_set.targets] == [ - "employment_income_before_lsr" - ] - - def test_load_target_set_rejects_profile_with_explicit_cells(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_profile_filter_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - with pytest.raises(ValueError, match="cannot be combined"): - provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_profile": "pe_native_broad", - "target_cells": [{"variable": "snap"}], - }, - ) - ) - - def test_load_targets_supports_exact_and_null_domain_filters(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - - national_targets = provider.load_targets( - period=2024, - variables=["snap"], - geo_levels=["national"], - domain_variable_is_null=True, - ) - 
state_targets = provider.load_targets( - period=2024, - variables=["snap"], - geo_levels=["state"], - domain_variable_values=["snap"], - ) - - assert [target.target_id for target in national_targets] == [10] - assert [target.target_id for target in state_targets] == [11] - - def test_load_targets_supports_exact_target_cells(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - national_targets = provider.load_targets( - period=2024, - target_cells=[ - { - "variable": "snap", - "geo_level": "national", - "domain_variable": None, - } - ], - ) - state_targets = provider.load_targets( - period=2024, - target_cells=[ - { - "variable": "snap", - "geo_level": "state", - "domain_variable": "snap", - } - ], - ) - - assert [target.target_id for target in national_targets] == [10] - assert [target.target_id for target in state_targets] == [11] - - def test_load_targets_supports_multi_domain_target_cells(self, tmp_path): - db_path = tmp_path / "policy_data.db" - conn = sqlite3.connect(db_path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'state' AS geo_level, - '06' AS geographic_id, - 'eitc,eitc_child_count' AS domain_variable - FROM targets AS t; - """ - ) - constraints = ( - PolicyEngineUSConstraint("state_fips", 
"==", "06"), - PolicyEngineUSConstraint("tax_unit_is_filer", "==", "1"), - PolicyEngineUSConstraint("eitc", ">", "0"), - PolicyEngineUSConstraint("eitc_child_count", "==", "2"), - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - ( - 1, - compute_policyengine_us_definition_hash(constraints), - None, - ), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - [(1, c.variable, c.operation, c.value) for c in constraints], - ) - conn.execute( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 20, - "tax_unit_count", - 2024, - 1, - 0, - 123.0, - 1, - 5.0, - "IRS SOI", - "California EITC recipients with two children", - ), - ) - conn.commit() - conn.close() - - provider = PolicyEngineUSDBTargetProvider(db_path) - - targets = provider.load_targets( - period=2024, - target_cells=[ - { - "variable": "tax_unit_count", - "geo_level": "state", - "domain_variable": "eitc_child_count", - } - ], - ) - by_domain = provider.load_targets( - period=2024, - variables=["tax_unit_count"], - domain_variables=["eitc_child_count"], - ) - - assert [target.target_id for target in targets] == [20] - assert [target.target_id for target in by_domain] == [20] - - def test_load_targets_allows_geographic_hierarchy_without_literal_inheritance( - self, - tmp_path, - ): - db_path = tmp_path / "policy_data.db" - conn = sqlite3.connect(db_path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER 
PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'district' AS geo_level, - '0601' AS geographic_id, - NULL AS domain_variable - FROM targets AS t; - """ - ) - parent_constraints = (PolicyEngineUSConstraint("state_fips", "==", "06"),) - child_constraints = ( - PolicyEngineUSConstraint("congressional_district_geoid", "==", "0601"), - ) - conn.executemany( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, ?) - """, - [ - ( - 1, - compute_policyengine_us_definition_hash(parent_constraints), - None, - ), - ( - 2, - compute_policyengine_us_definition_hash( - child_constraints, - parent_stratum_id=1, - ), - 1, - ), - ], - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - (1, "state_fips", "==", "06"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - (2, "congressional_district_geoid", "==", "0601"), - ) - conn.execute( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - (1, "household_count", 2024, 2, 0, 100.0, 1, None, "test", "district"), - ) - conn.commit() - conn.close() - - provider = PolicyEngineUSDBTargetProvider(db_path) - targets = provider.load_targets(period=2024) - - assert [target.target_id for target in targets] == [1] - - def test_load_target_set_keeps_best_available_period_targets(self, tmp_path): - db_path = tmp_path / "policy_data.db" - conn = sqlite3.connect(db_path) - conn.executescript( - """ - CREATE TABLE strata ( - stratum_id INTEGER PRIMARY KEY, - definition_hash TEXT, - parent_stratum_id INTEGER - ); - - CREATE TABLE stratum_constraints ( - stratum_id INTEGER NOT NULL, - constraint_variable TEXT NOT NULL, - operation TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - target_id INTEGER PRIMARY KEY, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - stratum_id INTEGER NOT NULL, - reform_id INTEGER NOT NULL DEFAULT 0, - value REAL, - active BOOLEAN NOT NULL DEFAULT 1, - tolerance REAL, - source TEXT, - notes TEXT - ); - - CREATE VIEW target_overview AS - SELECT - t.target_id, - t.stratum_id, - t.variable, - t.value, - t.period, - t.active, - 'national' AS geo_level, - 'US' AS geographic_id, - NULL AS domain_variable - FROM targets AS t; - """ - ) - conn.execute( - """ - INSERT INTO strata (stratum_id, definition_hash, parent_stratum_id) - VALUES (?, ?, NULL) - """, - (1, compute_policyengine_us_definition_hash(())), - ) - conn.execute( - """ - INSERT INTO targets ( - target_id, - variable, - period, - stratum_id, - reform_id, - value, - active, - tolerance, - source, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - (1, "snap", 2023, 1, 0, 100.0, 1, None, "test", "best-period"), - ) - conn.commit() - conn.close() - - provider = PolicyEngineUSDBTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={"variables": ["snap"]}, - ) - ) - - assert len(target_set.targets) == 1 - assert target_set.targets[0].period == 2023 - - def test_load_targets_rejects_invalid_parent_child_constraints(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - constraint_variable, - operation, - value - ) VALUES (?, ?, ?, ?) - """, - (1, "state_fips", "==", "06"), - ) - conn.execute( - "DELETE FROM stratum_constraints WHERE stratum_id = 2 AND constraint_variable = 'state_fips'" - ) - conn.execute( - """ - UPDATE strata - SET definition_hash = ? - WHERE stratum_id = 1 - """, - ( - compute_policyengine_us_definition_hash( - (PolicyEngineUSConstraint("state_fips", "==", "06"),), - ), - ), - ) - conn.execute( - """ - UPDATE strata - SET definition_hash = ? 
- WHERE stratum_id = 2 - """, - ( - compute_policyengine_us_definition_hash( - (PolicyEngineUSConstraint("age", ">=", "65"),), - parent_stratum_id=1, - ), - ), - ) - conn.commit() - conn.close() - - provider = PolicyEngineUSDBTargetProvider(db_path) - - with pytest.raises( - PolicyEngineUSTargetValidationError, - match="missing inherited parent constraints", - ): - provider.load_targets(period=2024, variables=["snap"]) - - def test_load_targets_rejects_invalid_definition_hash(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - "UPDATE strata SET definition_hash = 'broken' WHERE stratum_id = 2" - ) - conn.commit() - conn.close() - - provider = PolicyEngineUSDBTargetProvider(db_path) - - with pytest.raises( - PolicyEngineUSTargetValidationError, - match="definition_hash", - ): - provider.load_targets(period=2024, variables=["snap"]) - - def test_to_quantity_targets_selects_only_unconstrained_targets(self, tmp_path): - db_path = tmp_path / "policy_data.db" - _create_policyengine_targets_db(db_path) - - provider = PolicyEngineUSDBTargetProvider(db_path) - specs = provider.to_quantity_targets({"snap": "snap"}, period=2024) - - assert specs == ( - PolicyEngineUSQuantityTarget( - name="snap", - variable="snap", - column="snap", - period=2024, - ), - ) - - -class TestPolicyEngineUSMicrosimulationAdapter: - def test_compute_targets_supports_sum_and_count_positive(self): - class FakeSimulation: - def calculate(self, variable, period=None, map_to=None): - assert period == 2024 - if variable == "snap": - return np.array([10.0, 0.0, 5.0]) - if variable == "in_poverty": - return np.array([1.0, 0.0, 1.0, 1.0]) - raise KeyError(variable) - - adapter = PolicyEngineUSMicrosimulationAdapter(simulation=FakeSimulation()) - targets = adapter.compute_targets( - ( - PolicyEngineUSQuantityTarget( - name="snap_total", - variable="snap", - column="snap", - period=2024, - aggregation="sum", - 
), - PolicyEngineUSQuantityTarget( - name="poverty_count", - variable="in_poverty", - column="in_poverty", - period=2024, - aggregation="count_positive", - ), - ) - ) - - assert targets == { - "snap_total": 15.0, - "poverty_count": 3.0, - } - - def test_from_dataset_retries_without_dataset_year_if_unsupported(self, tmp_path): - dataset_path = tmp_path / "microplex.h5" - dataset_path.write_text("placeholder") - - class FakeSimulation: - def __init__(self, dataset): - self.dataset = dataset - - def calculate(self, variable, period=None, map_to=None): - _ = variable, period, map_to - return np.array([0.0]) - - adapter = PolicyEngineUSMicrosimulationAdapter.from_dataset( - dataset_path, - dataset_year=2024, - simulation_cls=FakeSimulation, - ) - - assert isinstance(adapter.simulation, FakeSimulation) - assert adapter.simulation.dataset == str(dataset_path) - - -class TestPolicyEngineUSConstraintCompilation: - def test_compiles_db_targets_to_household_linear_constraints(self): - households = pd.DataFrame( - { - "household_id": [1, 2], - "state_fips": ["06", "36"], - "weight": [1.0, 1.0], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "age": [70, 40, 30], - } - ) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 101], - "household_id": [1, 2], - "snap": [100.0, 0.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - spm_units=spm_units, - ) - targets = ( - PolicyEngineUSDBTarget( - target_id=20, - variable="snap", - period=2024, - stratum_id=1, - reform_id=0, - value=200.0, - active=True, - constraints=(PolicyEngineUSConstraint("state_fips", "==", "06"),), - ), - PolicyEngineUSDBTarget( - target_id=21, - variable="household_count", - period=2024, - stratum_id=2, - reform_id=0, - value=2.0, - active=True, - constraints=( - PolicyEngineUSConstraint("snap", ">", "0"), - PolicyEngineUSConstraint("state_fips", "==", "06"), - ), - ), - PolicyEngineUSDBTarget( - target_id=22, 
- variable="person_count", - period=2024, - stratum_id=3, - reform_id=0, - value=2.0, - active=True, - constraints=( - PolicyEngineUSConstraint("age", ">=", "65"), - PolicyEngineUSConstraint("state_fips", "==", "06"), - ), - ), - ) - variable_bindings = { - "state_fips": PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="state_fips", - ), - "age": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="age", - ), - "snap": PolicyEngineUSVariableBinding( - entity=EntityType.SPM_UNIT, - column="snap", - ), - } - - constraints = compile_policyengine_us_household_linear_constraints( - targets=targets, - tables=tables, - variable_bindings=variable_bindings, - ) - - assert len(constraints) == 3 - np.testing.assert_allclose(constraints[0].coefficients, np.array([100.0, 0.0])) - np.testing.assert_allclose(constraints[1].coefficients, np.array([1.0, 0.0])) - np.testing.assert_allclose(constraints[2].coefficients, np.array([1.0, 0.0])) - assert [constraint.target for constraint in constraints] == [200.0, 2.0, 2.0] - - def test_amount_targets_apply_same_entity_constraints_before_household_aggregation( - self, - ): - households = pd.DataFrame( - { - "household_id": [1, 2], - "state_fips": ["06", "36"], - "weight": [1.0, 1.0], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "age": [70, 40, 30], - "medicaid": [1_000.0, 500.0, 300.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - ) - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - PolicyEngineUSDBTarget( - target_id=23, - variable="medicaid", - period=2024, - stratum_id=4, - reform_id=0, - value=1_500.0, - active=True, - constraints=(PolicyEngineUSConstraint("age", ">=", "65"),), - ), - ), - tables=tables, - variable_bindings={ - "age": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="age", - ), - "medicaid": PolicyEngineUSVariableBinding( - 
entity=EntityType.PERSON, - column="medicaid", - ), - }, - ) - - np.testing.assert_allclose( - constraints[0].coefficients, np.array([1_000.0, 0.0]) - ) - - def test_amount_targets_exclude_negative_rows_under_positive_same_entity_filter( - self, - ): - households = pd.DataFrame( - { - "household_id": [1, 2], - "weight": [1.0, 1.0], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "self_employment_income": [100.0, -80.0, -40.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - ) - - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - TargetSpec( - name="positive_self_employment_income", - entity=EntityType.HOUSEHOLD, - value=100.0, - period=2024, - measure="self_employment_income", - aggregation=TargetAggregation.SUM, - filters=( - TargetFilter( - feature="self_employment_income", - operator=">", - value=0, - ), - ), - ), - ), - tables=tables, - variable_bindings={ - "self_employment_income": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="self_employment_income", - ), - }, - ) - - np.testing.assert_allclose(constraints[0].coefficients, np.array([100.0, 0.0])) - - def test_compiled_constraints_run_through_calibrator(self): - from microplex.calibration import Calibrator - - households = pd.DataFrame( - { - "household_id": [1, 2], - "state_fips": ["06", "36"], - "weight": [1.0, 1.0], - } - ) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 101], - "household_id": [1, 2], - "snap": [100.0, 0.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - spm_units=spm_units, - ) - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - PolicyEngineUSDBTarget( - target_id=30, - variable="snap", - period=2024, - stratum_id=1, - reform_id=0, - value=200.0, - active=True, - constraints=(PolicyEngineUSConstraint("state_fips", "==", "06"),), - ), - PolicyEngineUSDBTarget( - 
target_id=31, - variable="household_count", - period=2024, - stratum_id=2, - reform_id=0, - value=2.0, - active=True, - constraints=(PolicyEngineUSConstraint("snap", ">", "0"),), - ), - ), - tables=tables, - variable_bindings={ - "state_fips": PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="state_fips", - ), - "snap": PolicyEngineUSVariableBinding( - entity=EntityType.SPM_UNIT, - column="snap", - ), - }, - ) - - calibrator = Calibrator(method="entropy") - calibrated = calibrator.fit_transform( - households, - {}, - linear_constraints=constraints, - ) - report = calibrator.validate(calibrated) - - assert report["max_error"] < 1e-6 - np.testing.assert_allclose( - calibrated["weight"].values, - np.array([2.0, 1.0]), - rtol=1e-5, - ) - - def test_canonical_target_specs_compile_to_household_constraints(self): - households = pd.DataFrame( - { - "household_id": [1, 2], - "state_fips": ["06", "36"], - "weight": [1.0, 1.0], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "age": [70, 40, 30], - } - ) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 101], - "household_id": [1, 2], - "snap": [100.0, 0.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - spm_units=spm_units, - ) - - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - TargetSpec( - name="snap_california", - entity=EntityType.SPM_UNIT, - value=100.0, - period=2024, - measure="snap", - aggregation=TargetAggregation.SUM, - filters=( - TargetFilter(feature="state_fips", operator="==", value="06"), - ), - ), - TargetSpec( - name="senior_households", - entity=EntityType.HOUSEHOLD, - value=1.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter(feature="age", operator=">=", value=65),), - ), - ), - tables=tables, - variable_bindings={ - "state_fips": PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="state_fips", - 
), - "age": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="age", - ), - "snap": PolicyEngineUSVariableBinding( - entity=EntityType.SPM_UNIT, - column="snap", - ), - }, - ) - - assert [constraint.name for constraint in constraints] == [ - "snap_california", - "senior_households", - ] - np.testing.assert_allclose(constraints[0].coefficients, np.array([100.0, 0.0])) - np.testing.assert_allclose(constraints[1].coefficients, np.array([1.0, 0.0])) - - def test_amount_targets_align_tax_unit_constraints_before_household_aggregation( - self, - ): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 101, 200], - "dividend_income": [100.0, 200.0, 300.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 101, 200], - "household_id": [1, 1, 2], - "tax_unit_is_filer": [1, 0, 1], - } - ), - ) - - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - TargetSpec( - name="filer_dividend_income", - entity=EntityType.PERSON, - value=400.0, - period=2024, - measure="dividend_income", - aggregation=TargetAggregation.SUM, - filters=( - TargetFilter( - feature="tax_unit_is_filer", - operator="==", - value=1, - ), - ), - ), - ), - tables=tables, - variable_bindings={ - "dividend_income": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="dividend_income", - ), - "tax_unit_is_filer": PolicyEngineUSVariableBinding( - entity=EntityType.TAX_UNIT, - column="tax_unit_is_filer", - ), - }, - ) - - np.testing.assert_allclose( - constraints[0].coefficients, - np.array([100.0, 300.0]), - ) - - def test_count_targets_align_person_constraints_to_tax_unit_rows(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1, 2], - "weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - 
"person_id": [10, 11, 12], - "household_id": [1, 1, 2], - "tax_unit_id": [100, 101, 200], - "dividend_income": [100.0, 200.0, 300.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 101, 200], - "household_id": [1, 1, 2], - "tax_unit_is_filer": [1, 0, 1], - } - ), - ) - - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - TargetSpec( - name="filer_tax_units_with_dividends", - entity=EntityType.TAX_UNIT, - value=2.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter( - feature="dividend_income", - operator=">", - value=0.0, - ), - TargetFilter( - feature="tax_unit_is_filer", - operator="==", - value=1, - ), - ), - ), - ), - tables=tables, - variable_bindings={ - "dividend_income": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="dividend_income", - ), - "tax_unit_is_filer": PolicyEngineUSVariableBinding( - entity=EntityType.TAX_UNIT, - column="tax_unit_is_filer", - ), - }, - ) - - np.testing.assert_allclose( - constraints[0].coefficients, - np.array([1.0, 1.0]), - ) - - def test_materializes_formula_variables_before_compiling_constraints( - self, tmp_path - ): - households = pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - "state_fips": [6, 36], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11], - "household_id": [1, 2], - "tax_unit_id": [100, 200], - "spm_unit_id": [1000, 2000], - "family_id": [5000, 6000], - "marital_unit_id": [7000, 8000], - "age": [34, 52], - "employment_income": [20_000.0, 15_000.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [1, 2], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000, 2000], - "household_id": [1, 2], - } - ), - families=pd.DataFrame( - { - "family_id": [5000, 6000], - "household_id": [1, 2], - } - ), - marital_units=pd.DataFrame( - { - 
"marital_unit_id": [7000, 8000], - "household_id": [1, 2], - } - ), - ) - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - self.dataset = dataset - self.dataset_year = dataset_year - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert Path(self.dataset).exists() - assert self.dataset_year == 2024 - assert period == 2024 - assert map_to is None - if variable == "snap": - return np.array([120.0, 0.0]) - raise KeyError(variable) - - materialized_tables, materialized_bindings = ( - materialize_policyengine_us_variables( - tables, - variables=("snap",), - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - temp_dir=tmp_path, - ) - ) - - assert materialized_bindings["snap"] == PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="snap", - ) - np.testing.assert_allclose( - materialized_tables.households["snap"].to_numpy(dtype=float), - np.array([120.0, 0.0]), - ) - - constraints = compile_policyengine_us_household_linear_constraints( - targets=( - PolicyEngineUSDBTarget( - target_id=32, - variable="snap", - period=2024, - stratum_id=1, - reform_id=0, - value=240.0, - active=True, - constraints=(), - ), - PolicyEngineUSDBTarget( - target_id=33, - variable="household_count", - period=2024, - stratum_id=2, - reform_id=0, - value=2.0, - active=True, - constraints=(PolicyEngineUSConstraint("snap", ">", "0"),), - ), - ), - tables=materialized_tables, - 
variable_bindings=materialized_bindings, - ) - - np.testing.assert_allclose(constraints[0].coefficients, np.array([120.0, 0.0])) - np.testing.assert_allclose(constraints[1].coefficients, np.array([1.0, 0.0])) - - def test_variables_to_materialize_can_force_formula_outputs(self): - targets = [ - TargetSpec( - name="ssi", - entity=EntityType.PERSON, - value=100.0, - period=2024, - measure="ssi", - ) - ] - bindings = { - "ssi": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="ssi", - ), - "employment_income": PolicyEngineUSVariableBinding( - entity=EntityType.PERSON, - column="employment_income", - ), - } - - assert policyengine_us_variables_to_materialize(targets, bindings) == set() - assert policyengine_us_variables_to_materialize( - targets, - bindings, - force_materialize_variables={"ssi"}, - ) == {"ssi"} - - def test_materialization_supports_nested_system_attribute(self, tmp_path): - households = pd.DataFrame( - { - "household_id": [1], - "household_weight": [1.0], - "state_fips": [6], - } - ) - persons = pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "tax_unit_id": [100], - "spm_unit_id": [1000], - "family_id": [5000], - "marital_unit_id": [7000], - "employment_income": [20_000.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=pd.DataFrame({"tax_unit_id": [100], "household_id": [1]}), - spm_units=pd.DataFrame({"spm_unit_id": [1000], "household_id": [1]}), - families=pd.DataFrame({"family_id": [5000], "household_id": [1]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [7000], "household_id": [1]} - ), - ) - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "employment_income": FakeVariable(FakeEntity("person")), - 
"state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - } - - class FakeSystemModule: - system = FakeTaxBenefitSystem() - - class FakeSimulation: - system = FakeSystemModule() - - def __init__(self, dataset, dataset_year=None, **kwargs): - self.dataset = dataset - self.dataset_year = dataset_year - _ = kwargs - - def calculate(self, variable, period=None, map_to=None): - assert Path(self.dataset).exists() - assert self.dataset_year == 2024 - assert period == 2024 - assert map_to is None - if variable == "snap": - return np.array([75.0]) - raise KeyError(variable) - - materialized_tables, materialized_bindings = ( - materialize_policyengine_us_variables( - tables, - variables=("snap",), - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - temp_dir=tmp_path, - ) - ) - - assert materialized_bindings["snap"] == PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="snap", - ) - np.testing.assert_allclose( - materialized_tables.households["snap"].to_numpy(dtype=float), - np.array([75.0]), - ) - - def test_materialization_skips_derived_pipeline_support_columns(self, tmp_path): - households = pd.DataFrame( - { - "household_id": [1], - "household_weight": [1.0], - "state_fips": [6], - } - ) - persons = pd.DataFrame( - { - "person_id": [10, 11], - "household_id": [1, 1], - "tax_unit_id": [100, 100], - "spm_unit_id": [1000, 1000], - "family_id": [5000, 5000], - "marital_unit_id": [7000, 7000], - "age": [34, 12], - "age_group": ["18-34", "0-17"], - "employment_income_before_lsr": [20_000.0, 0.0], - } - ) - tables = PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=pd.DataFrame({"tax_unit_id": [100], "household_id": [1]}), - spm_units=pd.DataFrame({"spm_unit_id": [1000], "household_id": [1]}), - families=pd.DataFrame({"family_id": [5000], "household_id": [1]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [7000], 
"household_id": [1]} - ), - ) - - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = entity - self.formulas = formulas or {} - - def is_input_variable(self): - return not self.formulas - - class FakeTaxBenefitSystem: - variables = { - "age": FakeVariable(FakeEntity("person")), - "age_group": FakeVariable(FakeEntity("person")), - "employment_income_before_lsr": FakeVariable(FakeEntity("person")), - "state_fips": FakeVariable(FakeEntity("household")), - "snap": FakeVariable( - FakeEntity("household"), - formulas={"2024": object()}, - ), - } - - class FakeSimulation: - tax_benefit_system = FakeTaxBenefitSystem() - - def __init__(self, dataset, dataset_year=None, **kwargs): - _ = dataset_year, kwargs - with h5py.File(dataset, "r") as handle: - assert "age" in handle - assert "employment_income_before_lsr" in handle - assert "state_fips" in handle - assert "age_group" not in handle - assert len(handle["age"]["2024"]) == 2 - assert len(handle["state_fips"]["2024"]) == 1 - - def calculate(self, variable, period=None, map_to=None): - _ = period, map_to - if variable == "snap": - return np.array([10.0]) - raise KeyError(variable) - - materialized_tables, _ = materialize_policyengine_us_variables( - tables, - variables=("snap",), - period=2024, - dataset_year=2024, - simulation_cls=FakeSimulation, - temp_dir=tmp_path, - ) - - np.testing.assert_allclose( - materialized_tables.households["snap"].to_numpy(dtype=float), - np.array([10.0]), - ) - - def test_safe_materialization_one_by_one_uses_prior_materialized_outputs( - self, - monkeypatch, - ): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1], - "household_weight": [1.0], - "state_fips": [6], - } - ), - persons=pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "tax_unit_id": [100], - "spm_unit_id": [1000], - "family_id": [5000], - "marital_unit_id": [7000], - } - ), - 
tax_units=pd.DataFrame({"tax_unit_id": [100], "household_id": [1]}), - spm_units=pd.DataFrame({"spm_unit_id": [1000], "household_id": [1]}), - families=pd.DataFrame({"family_id": [5000], "household_id": [1]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [7000], "household_id": [1]} - ), - ) - - def fake_materialize( - incoming_tables, - *, - variables, - period, - dataset_year=None, - simulation_cls=None, - microsimulation_kwargs=None, - temp_dir=None, - direct_override_variables=(), - ): - _ = ( - period, - dataset_year, - simulation_cls, - microsimulation_kwargs, - temp_dir, - direct_override_variables, - ) - if tuple(variables) == ("a", "b"): - raise RuntimeError("batch failed") - if tuple(variables) == ("a",): - updated = PolicyEngineUSEntityTableBundle( - households=incoming_tables.households.assign(a=[1.0]), - persons=incoming_tables.persons.copy(), - tax_units=incoming_tables.tax_units.copy(), - spm_units=incoming_tables.spm_units.copy(), - families=incoming_tables.families.copy(), - marital_units=incoming_tables.marital_units.copy(), - ) - return updated, { - "a": PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="a", - ) - } - if tuple(variables) == ("b",): - assert "a" in incoming_tables.households.columns - updated = PolicyEngineUSEntityTableBundle( - households=incoming_tables.households.assign( - b=incoming_tables.households["a"] + 1.0 - ), - persons=incoming_tables.persons.copy(), - tax_units=incoming_tables.tax_units.copy(), - spm_units=incoming_tables.spm_units.copy(), - families=incoming_tables.families.copy(), - marital_units=incoming_tables.marital_units.copy(), - ) - return updated, { - "b": PolicyEngineUSVariableBinding( - entity=EntityType.HOUSEHOLD, - column="b", - ) - } - raise AssertionError(f"unexpected variables: {variables}") - - monkeypatch.setattr( - "microplex_us.policyengine.us.materialize_policyengine_us_variables", - fake_materialize, - ) - - result = materialize_policyengine_us_variables_safely( - 
tables, - variables=("a", "b"), - period=2024, - ) - - assert result.materialized_variables == ("a", "b") - assert result.failed_variables == {} - np.testing.assert_allclose( - result.tables.households["b"].to_numpy(dtype=float), - np.array([2.0]), - ) - - -class TestPolicyEngineUSSimulationTargetCompiler: - def test_missing_modifier_handler_skips_target(self): - households = pd.DataFrame({"household_id": [1, 2], "weight": [1.0, 1.0]}) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 200], - "household_id": [1, 2], - } - ) - target = TargetSpec( - name="snap_takeup", - entity=EntityType.SPM_UNIT, - value=10.0, - period=2024, - measure="snap", - sim_modifiers=(TargetSimulationModifier("unknown_modifier"),), - ) - - result = PolicyEngineUSSimulationTargetCompiler( - period=2024 - ).compile_simulation_target_constraints( - targets=(target,), - entity_frames={ - EntityType.HOUSEHOLD: households, - EntityType.SPM_UNIT: spm_units, - }, - entity_weight_indexes={EntityType.HOUSEHOLD: np.array([0, 1])}, - ) - - assert result.constraints == () - assert result.skipped_targets == ( - ( - "snap_takeup", - "missing_policyengine_us_sim_modifier_handler:unknown_modifier", - ), - ) - - def test_default_rerandomized_takeup_handler_compiles_sparse_constraint( - self, - monkeypatch, - ): - households = pd.DataFrame({"household_id": [1, 2], "weight": [1.0, 1.0]}) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 200], - "household_id": [1, 2], - } - ) - - monkeypatch.setattr( - "microplex_us.policyengine.takeup._load_microplex_takeup_rate", - lambda variable_name, year: ( - 1.0 if variable_name == "snap" and year == 2024 else 0.0 - ), - ) - target = TargetSpec( - name="snap_takeup_units", - entity=EntityType.SPM_UNIT, - value=2.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter("takes_up_snap_if_eligible", "==", True),), - sim_modifiers=( - TargetSimulationModifier( - "rerandomize_takeup", - parameters={"features": 
["takes_up_snap_if_eligible"]}, - ), - ), - ) - - result = PolicyEngineUSSimulationTargetCompiler( - period=2024 - ).compile_simulation_target_constraints( - targets=(target,), - entity_frames={ - EntityType.HOUSEHOLD: households, - EntityType.SPM_UNIT: spm_units, - }, - entity_weight_indexes={EntityType.HOUSEHOLD: np.array([0, 1])}, - ) - - assert result.skipped_targets == () - constraint = result.constraints[0] - assert constraint.name == "snap_takeup_units" - assert constraint.weight_indexes.tolist() == [0, 1] - assert constraint.coefficients.tolist() == [1.0, 1.0] - assert constraint.target == 2.0 - - def test_default_rerandomized_takeup_handler_skips_unsupported_feature(self): - households = pd.DataFrame({"household_id": [1, 2], "weight": [1.0, 1.0]}) - persons = pd.DataFrame( - { - "person_id": [10, 20], - "household_id": [1, 2], - } - ) - target = TargetSpec( - name="medicare_takeup_people", - entity=EntityType.PERSON, - value=1.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter("takes_up_medicare_if_eligible", "==", True),), - sim_modifiers=( - TargetSimulationModifier( - "rerandomize_takeup", - parameters={"features": ["takes_up_medicare_if_eligible"]}, - ), - ), - ) - - result = PolicyEngineUSSimulationTargetCompiler( - period=2024 - ).compile_simulation_target_constraints( - targets=(target,), - entity_frames={ - EntityType.HOUSEHOLD: households, - EntityType.PERSON: persons, - }, - entity_weight_indexes={EntityType.HOUSEHOLD: np.array([0, 1])}, - ) - - assert result.constraints == () - assert result.skipped_targets == ( - ( - "medicare_takeup_people", - "policyengine_us_rerandomize_takeup_unsupported_features:" - "takes_up_medicare_if_eligible", - ), - ) - - def test_materializes_calculated_output_before_sparse_constraint( - self, - monkeypatch, - ): - households = pd.DataFrame({"household_id": [1, 2], "weight": [1.0, 1.0]}) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 200], - "household_id": [1, 2], - "snap": 
[999.0, 999.0], - } - ) - - def fake_materialize( - tables, - *, - variables, - period, - **kwargs, - ): - del kwargs - assert variables == ("snap",) - assert period == 2024 - return PolicyEngineUSVariableMaterializationResult( - tables=PolicyEngineUSEntityTableBundle( - households=tables.households.copy(), - spm_units=tables.spm_units.assign(snap=[3.0, 0.0]), - ), - bindings={ - "snap": PolicyEngineUSVariableBinding( - entity=EntityType.SPM_UNIT, - column="snap", - ) - }, - materialized_variables=("snap",), - ) - - monkeypatch.setattr( - "microplex_us.policyengine.us.materialize_policyengine_us_variables_safely", - fake_materialize, - ) - - target = TargetSpec( - name="snap_total", - entity=EntityType.SPM_UNIT, - value=10.0, - period=2024, - measure="snap", - sim_modifiers=(TargetSimulationModifier("policyengine_us_materialize"),), - ) - result = PolicyEngineUSSimulationTargetCompiler( - period=2024 - ).compile_simulation_target_constraints( - targets=(target,), - entity_frames={ - EntityType.HOUSEHOLD: households, - EntityType.SPM_UNIT: spm_units, - }, - entity_weight_indexes={EntityType.HOUSEHOLD: np.array([0, 1])}, - ) - - assert result.skipped_targets == () - constraint = result.constraints[0] - assert constraint.name == "snap_total" - assert constraint.weight_indexes.tolist() == [0] - assert constraint.coefficients.tolist() == [3.0] - assert constraint.target == 10.0 - assert ( - constraint.metadata["compiled_by"] - == "policyengine_us_simulation_target_compiler" - ) - - def test_modifier_handler_runs_before_materialization(self, monkeypatch): - households = pd.DataFrame({"household_id": [1, 2], "weight": [1.0, 1.0]}) - spm_units = pd.DataFrame( - { - "spm_unit_id": [100, 200], - "household_id": [1, 2], - } - ) - calls: list[tuple[str, ...]] = [] - - def rerandomize_takeup(tables, *, targets, parameters): - calls.append(tuple(target.name for target in targets)) - assert parameters == ({"features": ["takes_up_snap_if_eligible"]},) - return 
PolicyEngineUSEntityTableBundle( - households=tables.households.copy(), - spm_units=tables.spm_units.assign( - takes_up_snap_if_eligible=[True, False] - ), - ) - - def fake_materialize( - tables, - *, - variables, - period, - **kwargs, - ): - del kwargs - assert variables == ("snap",) - assert period == 2024 - assert tables.spm_units["takes_up_snap_if_eligible"].tolist() == [ - True, - False, - ] - return PolicyEngineUSVariableMaterializationResult( - tables=PolicyEngineUSEntityTableBundle( - households=tables.households.copy(), - spm_units=tables.spm_units.assign(snap=[5.0, 7.0]), - ), - bindings={ - "snap": PolicyEngineUSVariableBinding( - entity=EntityType.SPM_UNIT, - column="snap", - ) - }, - materialized_variables=("snap",), - ) - - monkeypatch.setattr( - "microplex_us.policyengine.us.materialize_policyengine_us_variables_safely", - fake_materialize, - ) - target = TargetSpec( - name="snap_takeup_total", - entity=EntityType.SPM_UNIT, - value=10.0, - period=2024, - measure="snap", - filters=(TargetFilter("takes_up_snap_if_eligible", "==", True),), - sim_modifiers=( - TargetSimulationModifier( - "rerandomize_takeup", - parameters={"features": ["takes_up_snap_if_eligible"]}, - ), - TargetSimulationModifier( - "policyengine_us_materialize", - parameters={"features": ["snap"]}, - ), - ), - ) - - result = PolicyEngineUSSimulationTargetCompiler( - period=2024, - modifier_handlers={"rerandomize_takeup": rerandomize_takeup}, - ).compile_simulation_target_constraints( - targets=(target,), - entity_frames={ - EntityType.HOUSEHOLD: households, - EntityType.SPM_UNIT: spm_units, - }, - entity_weight_indexes={EntityType.HOUSEHOLD: np.array([0, 1])}, - ) - - assert calls == [("snap_takeup_total",)] - assert result.skipped_targets == () - constraint = result.constraints[0] - assert constraint.weight_indexes.tolist() == [0] - assert constraint.coefficients.tolist() == [5.0] - - -class TestPolicyEngineUSProjection: - def 
test_builds_structural_time_period_arrays_from_entity_tables(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.5, 2.5], - "state_code": ["CA", "NY"], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "tax_unit_id": [100, 100, 200], - "spm_unit_id": [1000, 1000, 2000], - "family_id": [5000, 5000, 6000], - "marital_unit_id": [7000, 7000, 8000], - "age": [34, 12, 45], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200], - "household_id": [10, 20], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000, 2000], - "household_id": [10, 20], - "snap": [1200.0, 300.0], - } - ), - families=pd.DataFrame( - { - "family_id": [5000, 6000], - "household_id": [10, 20], - } - ), - marital_units=pd.DataFrame( - { - "marital_unit_id": [7000, 8000], - "household_id": [10, 20], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"state_code": "state_code"}, - person_variable_map={"age": "age"}, - spm_unit_variable_map={"snap": "snap"}, - ) - - expected_keys = { - "household_id", - "person_id", - "tax_unit_id", - "spm_unit_id", - "family_id", - "marital_unit_id", - "person_household_id", - "person_tax_unit_id", - "person_spm_unit_id", - "person_family_id", - "person_marital_unit_id", - "household_weight", - "state_code", - "age", - "snap", - } - assert expected_keys.issubset(arrays) - assert set(arrays["household_id"]) == {"2024"} - np.testing.assert_array_equal( - arrays["household_id"]["2024"], np.array([10, 20]) - ) - np.testing.assert_array_equal( - arrays["person_household_id"]["2024"], np.array([10, 10, 20]) - ) - np.testing.assert_array_equal( - arrays["person_tax_unit_id"]["2024"], np.array([100, 100, 200]) - ) - np.testing.assert_array_equal( - arrays["person_spm_unit_id"]["2024"], np.array([1000, 1000, 2000]) - ) - np.testing.assert_array_equal( - 
arrays["person_family_id"]["2024"], np.array([5000, 5000, 6000]) - ) - np.testing.assert_array_equal( - arrays["person_marital_unit_id"]["2024"], np.array([7000, 7000, 8000]) - ) - np.testing.assert_allclose( - arrays["household_weight"]["2024"], np.array([1.5, 2.5], dtype=np.float32) - ) - assert "person_weight" not in arrays - assert "tax_unit_weight" not in arrays - assert "spm_unit_weight" not in arrays - assert "family_weight" not in arrays - assert "marital_unit_weight" not in arrays - np.testing.assert_array_equal(arrays["age"]["2024"], np.array([34, 12, 45])) - np.testing.assert_allclose(arrays["snap"]["2024"], np.array([1200.0, 300.0])) - - def test_export_column_names_match_written_time_period_arrays(self): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "age": FakeVariable("person"), - "snap": FakeVariable("spm_unit"), - "state_code": FakeVariable("household"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.5], - "state_code": ["CA"], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 10], - "tax_unit_id": [100, 100], - "spm_unit_id": [1000, 1000], - "family_id": [5000, 5000], - "marital_unit_id": [7000, 7000], - "age": [34, 12], - } - ), - tax_units=pd.DataFrame({"tax_unit_id": [100], "household_id": [10]}), - spm_units=pd.DataFrame( - {"spm_unit_id": [1000], "household_id": [10], "snap": [1200.0]} - ), - families=pd.DataFrame({"family_id": [5000], "household_id": [10]}), - marital_units=pd.DataFrame( - {"marital_unit_id": [7000], "household_id": [10]} - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map=export_maps["household"], - 
person_variable_map=export_maps["person"], - spm_unit_variable_map=export_maps["spm_unit"], - ) - columns = build_policyengine_us_export_column_names( - tables, - tax_benefit_system=FakeSystem(), - ) - excluded = resolve_policyengine_excluded_export_variables( - FakeSystem(), - sorted(arrays), - ) - - assert columns == set(arrays) - excluded - - def test_derives_household_head_export_from_relationship_to_head(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 10, 20], - "relationship_to_head": [0, 2, 0], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - person_variable_map={"is_household_head": "is_household_head"}, - ) - - np.testing.assert_array_equal( - arrays["is_household_head"]["2024"], - np.array([True, False, True]), - ) - - def test_export_variable_maps_include_derived_household_head(self): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "is_household_head": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "relationship_to_head": [0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert export_maps["person"]["is_household_head"] == "is_household_head" - - def test_derives_missing_group_tables_from_person_memberships(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "weight": [1.5, 2.5], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3], - 
"household_id": [10, 10, 20], - "tax_unit_id": [100, 100, 200], - "spm_unit_id": [1000, 1000, 2000], - "family_id": [5000, 5000, 6000], - "marital_unit_id": [7000, 7000, 8000], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - ) - - np.testing.assert_array_equal( - arrays["tax_unit_id"]["2024"], np.array([100, 200]) - ) - np.testing.assert_array_equal( - arrays["spm_unit_id"]["2024"], np.array([1000, 2000]) - ) - np.testing.assert_array_equal( - arrays["family_id"]["2024"], np.array([5000, 6000]) - ) - np.testing.assert_array_equal( - arrays["marital_unit_id"]["2024"], np.array([7000, 8000]) - ) - assert "family_weight" not in arrays - assert "marital_unit_weight" not in arrays - - def test_detects_computed_exports_from_formula_and_aggregate_variables(self): - class FakeVariable: - def __init__(self, adds=None, subtracts=None, formulas=None): - self.adds = adds or [] - self.subtracts = subtracts or [] - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "employment_income": FakeVariable(), - "filing_status": FakeVariable(formulas={"2024": object()}), - "self_employed_pension_contribution_ald_person": FakeVariable( - formulas={"2024": object()} - ), - "self_employed_pension_contribution_ald": FakeVariable( - adds=["self_employed_pension_contribution_ald_person"] - ), - } - - pseudo_inputs = detect_policyengine_pseudo_inputs( - FakeSystem(), - [ - "employment_income", - "filing_status", - "self_employed_pension_contribution_ald", - ], - ) - - assert pseudo_inputs == { - "filing_status", - "self_employed_pension_contribution_ald", - } - - def test_build_policyengine_us_export_variable_maps_includes_tax_inputs(self): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "state_fips": FakeVariable("household"), - "alimony_income": FakeVariable("person"), - 
"child_support_expense": FakeVariable("person"), - "child_support_received": FakeVariable("person"), - "disability_benefits": FakeVariable("person"), - "employment_income_before_lsr": FakeVariable("person"), - "health_insurance_premiums_without_medicare_part_b": FakeVariable( - "person" - ), - "is_female": FakeVariable("person"), - "medicare_part_b_premiums": FakeVariable("person"), - "other_medical_expenses": FakeVariable("person"), - "over_the_counter_health_expenses": FakeVariable("person"), - "rent": FakeVariable("person"), - "real_estate_taxes": FakeVariable("person"), - "medicaid": FakeVariable("person"), - "medicaid_enrolled": FakeVariable("person"), - "ssi": FakeVariable("person"), - "ssi_reported": FakeVariable("person"), - "takes_up_ssi_if_eligible": FakeVariable("person"), - "self_employment_income_before_lsr": FakeVariable("person"), - "taxable_interest_income": FakeVariable("person"), - "qualified_dividend_income": FakeVariable("person"), - "non_qualified_dividend_income": FakeVariable("person"), - "short_term_capital_gains": FakeVariable("person"), - "long_term_capital_gains_before_response": FakeVariable("person"), - "rental_income": FakeVariable("person"), - "unemployment_compensation": FakeVariable("person"), - "snap": FakeVariable("spm_unit"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - "state_fips": [6], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "alimony_income": [500.0], - "child_support_expense": [350.0], - "child_support_received": [200.0], - "disability_benefits": [300.0], - "employment_income_before_lsr": [50_000.0], - "health_insurance_premiums_without_medicare_part_b": [900.0], - "is_female": [True], - "medicare_part_b_premiums": [400.0], - "other_medical_expenses": [250.0], - "over_the_counter_health_expenses": [75.0], - "rent": [1_200.0], - "real_estate_taxes": [300.0], - "medicaid": [1_200.0], - 
"medicaid_enrolled": [True], - "ssi": [400.0], - "ssi_reported": [400.0], - "takes_up_ssi_if_eligible": [True], - "self_employment_income_before_lsr": [2_000.0], - "taxable_interest_income": [100.0], - "qualified_dividend_income": [40.0], - "non_qualified_dividend_income": [60.0], - "short_term_capital_gains": [25.0], - "long_term_capital_gains_before_response": [75.0], - "rental_income": [500.0], - "unemployment_compensation": [0.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100], - "household_id": [10], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000], - "household_id": [10], - "snap": [1_800.0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert export_maps["household"] == {"state_fips": "state_fips"} - assert export_maps["tax_unit"] == {} - assert "snap" not in export_maps["spm_unit"] - expected_person_exports = { - "alimony_income": "alimony_income", - "child_support_expense": "child_support_expense", - "child_support_received": "child_support_received", - "disability_benefits": "disability_benefits", - "employment_income_before_lsr": "employment_income_before_lsr", - "health_insurance_premiums_without_medicare_part_b": "health_insurance_premiums_without_medicare_part_b", - "is_female": "is_female", - "other_medical_expenses": "other_medical_expenses", - "over_the_counter_health_expenses": "over_the_counter_health_expenses", - "real_estate_taxes": "real_estate_taxes", - "self_employment_income_before_lsr": "self_employment_income_before_lsr", - "takes_up_ssi_if_eligible": "takes_up_ssi_if_eligible", - "taxable_interest_income": "taxable_interest_income", - "qualified_dividend_income": "qualified_dividend_income", - "non_qualified_dividend_income": "non_qualified_dividend_income", - "short_term_capital_gains": "short_term_capital_gains", - "long_term_capital_gains_before_response": "long_term_capital_gains_before_response", - "rental_income": 
"rental_income", - "unemployment_compensation": "unemployment_compensation", - } - assert expected_person_exports.items() <= export_maps["person"].items() - assert "medicare_part_b_premiums" not in export_maps["person"].values() - assert "rent" not in export_maps["person"].values() - assert "medicaid" not in export_maps["person"].values() - - def test_build_policyengine_us_export_variable_maps_includes_contract_inputs(self): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - person_contract_inputs = ( - "alimony_expense", - "amt_foreign_tax_credit", - "bank_account_assets", - "bond_assets", - "casualty_loss", - "charitable_cash_donations", - "charitable_non_cash_donations", - "early_withdrawal_penalty", - "educator_expense", - "excess_withheld_payroll_tax", - "general_business_credit", - "investment_income_elected_form_4952", - "is_household_head", - "long_term_capital_gains_on_collectibles", - "miscellaneous_income", - "non_sch_d_capital_gains", - "other_credits", - "own_children_in_household", - "prior_year_minimum_tax_credit", - "qualified_tuition_expenses", - "salt_refund_income", - "stock_assets", - "taxable_ira_distributions", - "tip_income", - "unreimbursed_business_employee_expenses", - ) - household_contract_inputs = ( - "auto_loan_balance", - "auto_loan_interest", - "block_geoid", - "congressional_district_geoid", - "county_fips", - "tenure_type", - "tract_geoid", - ) - tax_unit_contract_inputs = ( - "domestic_production_ald", - "recapture_of_investment_credit", - "unrecaptured_section_1250_gain", - "unreported_payroll_tax", - ) - spm_unit_contract_inputs = ( - "receives_housing_assistance", - "spm_unit_pre_subsidy_childcare_expenses", - "spm_unit_tenure_type", - ) - legacy_spm_unit_contract_inputs = ( - "spm_unit_energy_subsidy", - "takes_up_housing_assistance_if_eligible", - ) - legacy_person_contract_inputs = ("count_under_18", "count_under_6") - - 
class FakeSystem: - variables = { - **{name: FakeVariable("person") for name in person_contract_inputs}, - **{ - name: FakeVariable("household") - for name in household_contract_inputs - }, - **{name: FakeVariable("tax_unit") for name in tax_unit_contract_inputs}, - **{ - name: FakeVariable("spm_unit") - for name in ( - *spm_unit_contract_inputs, - *legacy_spm_unit_contract_inputs, - ) - }, - "self_employed_health_insurance_ald": FakeVariable("tax_unit"), - "self_employed_pension_contribution_ald": FakeVariable("tax_unit"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - **{name: [5.0] for name in household_contract_inputs}, - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - **{name: [1.0] for name in person_contract_inputs}, - **{name: [0] for name in legacy_person_contract_inputs}, - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100], - "household_id": [10], - **{name: [2.0] for name in tax_unit_contract_inputs}, - "self_employed_health_insurance_ald": [3.0], - "self_employed_pension_contribution_ald": [4.0], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000], - "household_id": [10], - "receives_housing_assistance": [True], - "spm_unit_pre_subsidy_childcare_expenses": [1500.0], - "spm_unit_tenure_type": ["RENTER"], - **{name: [1.0] for name in legacy_spm_unit_contract_inputs}, - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert { - name: name - for name in (*person_contract_inputs, *legacy_person_contract_inputs) - }.items() <= export_maps["person"].items() - assert export_maps["household"] == { - name: name for name in household_contract_inputs - } - assert export_maps["tax_unit"] == { - name: name for name in tax_unit_contract_inputs - } - assert { - name: name - for name in ( - *spm_unit_contract_inputs, - *legacy_spm_unit_contract_inputs, 
- ) - }.items() <= export_maps["spm_unit"].items() - - def test_build_policyengine_us_export_variable_maps_blocks_computed_direct_overrides( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = FakeEntity(entity) - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "state_fips": FakeVariable("household"), - "employment_income_before_lsr": FakeVariable("person"), - "is_female": FakeVariable("person"), - "medicaid": FakeVariable("person", formulas={"2024": object()}), - "medicaid_enrolled": FakeVariable( - "person", - formulas={"2024": object()}, - ), - "ssi": FakeVariable("person", formulas={"2024": object()}), - "filing_status": FakeVariable("tax_unit", formulas={"2024": object()}), - "snap": FakeVariable("spm_unit", formulas={"2024": object()}), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - "state_fips": [6], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "employment_income_before_lsr": [50_000.0], - "is_female": [True], - "medicaid": [1_200.0], - "medicaid_enrolled": [True], - "ssi": [400.0], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100], - "household_id": [10], - "filing_status": ["SINGLE"], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000], - "household_id": [10], - "snap": [1_800.0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - direct_override_variables=( - "filing_status", - "snap", - "ssi", - "medicaid", - "medicaid_enrolled", - ), - ) - - assert { - "employment_income_before_lsr": "employment_income_before_lsr", - "is_female": "is_female", - }.items() <= export_maps["person"].items() - assert "medicaid" not in export_maps["person"].values() - assert "medicaid_enrolled" not in 
export_maps["person"].values() - assert "ssi" not in export_maps["person"].values() - assert export_maps["tax_unit"] == {} - assert "snap" not in export_maps["spm_unit"].values() - - def test_build_policyengine_us_export_variable_maps_drops_reported_social_security_retirement_alias( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "social_security_retirement_reported": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "social_security_retirement": [12_000.0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert ( - "social_security_retirement_reported" not in export_maps["person"].values() - ) - - def test_build_policyengine_us_export_variable_maps_drops_computed_alias_inputs( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "medicare_part_b_premiums_reported": FakeVariable("person"), - "roth_ira_contributions_desired": FakeVariable("person"), - "self_employed_pension_contributions_desired": FakeVariable("person"), - "traditional_ira_contributions_desired": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "medicare_part_b_premiums": [1_800.0], - "roth_ira_contributions": [2_000.0], - "self_employed_pension_contributions": [4_000.0], - "traditional_ira_contributions": [3_000.0], - } - ), - ) - - 
export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert "medicare_part_b_premiums_reported" not in export_maps["person"].values() - assert "roth_ira_contributions_desired" not in export_maps["person"].values() - assert ( - "self_employed_pension_contributions_desired" - not in export_maps["person"].values() - ) - assert ( - "traditional_ira_contributions_desired" - not in export_maps["person"].values() - ) - - def test_default_policyengine_us_export_surface_avoids_formula_aggregates(self): - from policyengine_us import CountryTaxBenefitSystem - - tbs = CountryTaxBenefitSystem() - - overlaps = sorted( - name - for name in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - if name in tbs.variables - and name not in POLICYENGINE_US_ALLOWED_COMPUTED_EXPORT_VARIABLES - and ( - getattr(tbs.variables[name], "formulas", None) - or getattr(tbs.variables[name], "adds", None) - or getattr(tbs.variables[name], "subtracts", None) - ) - ) - - assert overlaps == [] - assert "estate_income" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "child_support_expense" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "farm_operations_income" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "farm_rent_income" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "health_savings_account_ald" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "filing_status" not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "rent" not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "social_security_retirement" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert ( - "social_security_retirement_reported" - not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert ( - "medicare_part_b_premiums_reported" - not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert "traditional_ira_contributions" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert ( - "traditional_ira_contributions_desired" - in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert 
"roth_ira_contributions" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "roth_ira_contributions_desired" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "traditional_401k_contributions" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "roth_401k_contributions" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert ( - "self_employed_pension_contributions" - in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert ( - "self_employed_pension_contributions_desired" - in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert ( - "spm_unit_capped_work_childcare_expenses" - in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert "non_sch_d_capital_gains" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "receives_wic" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "ssn_card_type" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "tenure_type" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "spm_unit_tenure_type" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "is_separated" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "is_surviving_spouse" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "is_blind" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "takes_up_ssi_if_eligible" in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "ssi" not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert "ssi_reported" not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - assert ( - "self_employed_health_insurance_ald" - not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - assert ( - "self_employed_pension_contribution_ald" - not in SAFE_POLICYENGINE_US_EXPORT_VARIABLES - ) - - def test_resolve_policyengine_excluded_export_variables_preserves_explicit_overrides( - self, - ): - class FakeVariable: - def __init__(self, adds=None, subtracts=None, formulas=None): - self.adds = adds or [] - self.subtracts = subtracts or [] - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "employment_income": FakeVariable(), - "self_employed_pension_contribution_ald_person": FakeVariable( - 
formulas={"2024": object()} - ), - "self_employed_pension_contribution_ald": FakeVariable( - adds=["self_employed_pension_contribution_ald_person"] - ), - } - - excluded = resolve_policyengine_excluded_export_variables( - FakeSystem(), - ["employment_income", "self_employed_pension_contribution_ald"], - direct_override_variables=("self_employed_pension_contribution_ald",), - ) - - assert excluded == {"self_employed_pension_contribution_ald"} - - def test_build_policyengine_us_export_variable_maps_supports_exact_pre_sim_names( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "cps_race": FakeVariable("person"), - "non_sch_d_capital_gains": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - "race": [4], - "non_sch_d_capital_gains": [250.0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - direct_override_variables=("non_sch_d_capital_gains",), - ) - - assert { - "race": "cps_race", - "non_sch_d_capital_gains": "non_sch_d_capital_gains", - }.items() <= export_maps["person"].items() - - def test_build_policyengine_us_export_variable_maps_prefers_exact_pre_sim_names( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "cps_race": FakeVariable("person"), - "is_hispanic": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - 
"race": [4], - "cps_race": [3], - "is_hispanic": [False], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert { - "cps_race": "cps_race", - "is_hispanic": "is_hispanic", - }.items() <= export_maps["person"].items() - - def test_build_policyengine_us_export_variable_maps_aliases_rent_to_pre_subsidy_rent( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = FakeEntity(entity) - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "pre_subsidy_rent": FakeVariable("person"), - "rent": FakeVariable("person", formulas={"2024": object()}), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 20], - "rent": [14_400.0, 0.0], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - person_variable_map=export_maps["person"], - ) - - assert export_maps["person"]["rent"] == "pre_subsidy_rent" - assert "pre_subsidy_rent" not in export_maps["person"] - assert "rent" not in export_maps["person"].values() - assert arrays["pre_subsidy_rent"]["2024"].tolist() == [14_400.0, 0.0] - - def test_build_policyengine_us_export_variable_maps_includes_absent_export_defaults( - self, - ): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "auto_loan_balance": FakeVariable("household"), - "first_home_mortgage_balance": FakeVariable("tax_unit"), - "immigration_status_str": FakeVariable("person"), - "net_worth": FakeVariable("household"), 
- "spm_unit_pre_subsidy_childcare_expenses": FakeVariable("spm_unit"), - "spm_unit_tenure_type": FakeVariable("spm_unit"), - "ssn_card_type": FakeVariable("person"), - "takes_up_aca_if_eligible": FakeVariable("tax_unit"), - "takes_up_early_head_start_if_eligible": FakeVariable("person"), - "takes_up_eitc": FakeVariable("tax_unit"), - "takes_up_snap_if_eligible": FakeVariable("spm_unit"), - "tenure_type": FakeVariable("household"), - "weeks_unemployed": FakeVariable("person"), - "would_claim_wic": FakeVariable("person"), - "would_file_taxes_voluntarily": FakeVariable("tax_unit"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100], - "household_id": [10], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [1000], - "household_id": [10], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - - assert { - "immigration_status_str": "immigration_status_str", - "ssn_card_type": "ssn_card_type", - "takes_up_early_head_start_if_eligible": "takes_up_early_head_start_if_eligible", - "weeks_unemployed": "weeks_unemployed", - "would_claim_wic": "would_claim_wic", - }.items() <= export_maps["person"].items() - assert export_maps["household"] == { - "auto_loan_balance": "auto_loan_balance", - "net_worth": "net_worth", - "tenure_type": "tenure_type", - } - assert export_maps["tax_unit"] == { - "first_home_mortgage_balance": "first_home_mortgage_balance", - "takes_up_aca_if_eligible": "takes_up_aca_if_eligible", - "takes_up_eitc": "takes_up_eitc", - "would_file_taxes_voluntarily": "would_file_taxes_voluntarily", - } - assert { - "spm_unit_pre_subsidy_childcare_expenses": "spm_unit_pre_subsidy_childcare_expenses", - "spm_unit_tenure_type": "spm_unit_tenure_type", - 
"takes_up_snap_if_eligible": "takes_up_snap_if_eligible", - }.items() <= export_maps["spm_unit"].items() - - def test_time_period_arrays_derive_ecps_persisted_computed_inputs(self): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity, formulas=None): - self.entity = FakeEntity(entity) - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "fsla_overtime_premium": FakeVariable("person"), - "has_itin": FakeVariable("person", formulas={"2024": object()}), - "has_tin": FakeVariable("person", formulas={"2024": object()}), - "hours_worked_last_week": FakeVariable("person"), - "in_nyc": FakeVariable("household", formulas={"2024": object()}), - "meets_ssi_disability_criteria": FakeVariable("person"), - "weekly_hours_worked_before_lsr": FakeVariable("person"), - } - - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 11], - "household_weight": [1.0, 2.0], - "state_fips": [36, 6], - "county_fips": [61, 1], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 11], - "ssn_card_type": ["NONE", "CITIZEN"], - "age": [40, 70], - "difficulty_hearing": [True, False], - "ssi": [0.0, 0.0], - "employment_income": [0.0, 55_000.0], - "hours_worked": [0.0, 50.0], - "weeks_worked": [0.0, 52.0], - "is_paid_hourly": [False, True], - "has_never_worked": [False, False], - "is_military": [False, False], - "is_executive_administrative_professional": [False, False], - "is_farmer_fisher": [False, False], - "is_computer_scientist": [False, False], - } - ), - ) - - export_maps = build_policyengine_us_export_variable_maps( - tables, - tax_benefit_system=FakeSystem(), - ) - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map=export_maps["household"], - person_variable_map=export_maps["person"], - ) - - columns = build_policyengine_us_export_column_names( - tables, - tax_benefit_system=FakeSystem(), 
- ) - - assert arrays["in_nyc"]["2024"].tolist() == [True, False] - assert arrays["has_tin"]["2024"].tolist() == [False, True] - assert arrays["has_itin"]["2024"].tolist() == [False, True] - assert arrays["hours_worked_last_week"]["2024"].tolist() == [0.0, 50.0] - assert arrays["weekly_hours_worked_before_lsr"]["2024"].tolist() == [ - 0.0, - 50.0, - ] - assert arrays["meets_ssi_disability_criteria"]["2024"].tolist() == [ - True, - False, - ] - np.testing.assert_allclose( - arrays["fsla_overtime_premium"]["2024"], - np.array([0.0, 5_000.0], dtype=np.float32), - ) - assert { - "fsla_overtime_premium", - "has_itin", - "has_tin", - "hours_worked_last_week", - "in_nyc", - "meets_ssi_disability_criteria", - "weekly_hours_worked_before_lsr", - }.issubset(columns) - - def test_pipeline_export_uses_desired_retirement_leaves_for_final_inputs(self): - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig(policyengine_dataset_year=2024) - ) - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 11], - "age": [40, 52], - "self_employed_pension_contributions": [0.0, 99_000.0], - "traditional_401k_contributions": [10_000.0, 99_000.0], - "roth_401k_contributions": [0.0, 99_000.0], - "traditional_ira_contributions": [0.0, 99_000.0], - "roth_ira_contributions": [0.0, 99_000.0], - "self_employed_pension_contributions_desired": [0.0, 1_000.0], - "traditional_401k_contributions_desired": [7_718.0, 40_000.0], - "roth_401k_contributions_desired": [1_362.0, 5_000.0], - "traditional_ira_contributions_desired": [360.64, 10_000.0], - "roth_ira_contributions_desired": [559.36, 3_000.0], - } - ) - - result = pipeline._augment_policyengine_person_inputs(persons) - - np.testing.assert_allclose( - result[ - [ - "self_employed_pension_contributions", - "traditional_401k_contributions", - "roth_401k_contributions", - "traditional_ira_contributions", - "roth_ira_contributions", - ] - ].to_numpy(), - np.array( - [ - [0.0, 7_718.0, 1_362.0, 360.64, 559.36], - [1_000.0, 
30_500.0, 0.0, 8_000.0, 0.0], - ] - ), - ) - - def test_projects_frame_and_writes_time_period_dataset(self, tmp_path): - frame = pd.DataFrame( - { - "person_id": [1, 2], - "age": [34, 12], - "state_code": ["CA", "NY"], - "self_employed_pension_contribution_ald": [10.0, 20.0], - } - ) - - arrays = project_frame_to_time_period_arrays( - frame, - period=2024, - column_map={ - "person_id": "person_id", - "age": "age", - "state_code": "state_code", - "self_employed_pension_contribution_ald": ( - "self_employed_pension_contribution_ald" - ), - }, - ) - output_path = tmp_path / "microplex_pe.h5" - write_policyengine_us_time_period_dataset( - arrays, - output_path, - excluded_variables={"self_employed_pension_contribution_ald"}, - ) - - with h5py.File(output_path, "r") as handle: - assert set(handle.keys()) == {"age", "person_id", "state_code"} - assert np.array_equal(handle["age"]["2024"][:], np.array([34, 12])) - assert handle["state_code"]["2024"].dtype.kind == "S" - - def test_writer_rejects_computed_policyengine_variables(self, tmp_path): - class FakeVariable: - def __init__(self, formulas=None): - self.formulas = formulas or {} - - class FakeSystem: - variables = { - "age": FakeVariable(), - "filing_status": FakeVariable(formulas={"2024": object()}), - } - - arrays = { - "age": {"2024": np.asarray([34, 12])}, - "filing_status": {"2024": np.asarray([1, 2])}, - } - - with pytest.raises(ValueError, match="filing_status"): - write_policyengine_us_time_period_dataset( - arrays, - tmp_path / "microplex_pe.h5", - tax_benefit_system=FakeSystem(), - ) - - def test_build_time_period_arrays_defaults_missing_ssn_card_type_to_citizen(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20], - "household_weight": [1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [10, 20], - "ssn_card_type": ["NONE", None], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - 
period=2024, - person_variable_map={"ssn_card_type": "ssn_card_type"}, - ) - - assert arrays["ssn_card_type"]["2024"].tolist() == [b"NONE", b"CITIZEN"] - - def test_compute_marketplace_plan_benchmark_ratio_clips_marketplace_takers(self): - ratio = compute_marketplace_plan_benchmark_ratio( - reported_premium=np.array([300.0, 50.0, 500.0, 500.0]), - aca_ptc=np.array([700.0, 0.0, 0.0, 500.0]), - slcsp=np.array([1_000.0, 1_000.0, 0.0, 1_000.0]), - takes_up_aca=np.array([True, True, True, False]), - ) - - np.testing.assert_allclose(ratio, np.array([1.0, 0.5, 1.0, 1.0])) - - def test_build_time_period_arrays_derives_marketplace_plan_ratio(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20, 30], - "household_weight": [1.0, 1.0, 1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 20, 30], - "tax_unit_id": [100, 200, 300], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100, 200, 300], - "household_id": [10, 20, 30], - "health_insurance_premiums_without_medicare_part_b": [ - 300.0, - 50.0, - 200.0, - ], - "aca_ptc": [700.0, 0.0, 100.0], - "slcsp": [1_000.0, 1_000.0, 0.0], - "takes_up_aca_if_eligible": [True, True, True], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - tax_unit_variable_map={ - "selected_marketplace_plan_benchmark_ratio": ( - "selected_marketplace_plan_benchmark_ratio" - ) - }, - ) - - np.testing.assert_allclose( - arrays["selected_marketplace_plan_benchmark_ratio"]["2024"], - np.array([1.0, 0.5, 1.0]), - ) - - def test_build_time_period_arrays_defaults_absent_export_inputs(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10], - "household_weight": [1.0], - } - ), - persons=pd.DataFrame( - { - "person_id": [1], - "household_id": [10], - } - ), - tax_units=pd.DataFrame( - { - "tax_unit_id": [100], - "household_id": [10], - } - ), - spm_units=pd.DataFrame( - 
{ - "spm_unit_id": [1000], - "household_id": [10], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - person_variable_map={ - "immigration_status_str": "immigration_status_str", - "ssn_card_type": "ssn_card_type", - "takes_up_early_head_start_if_eligible": "takes_up_early_head_start_if_eligible", - "weeks_unemployed": "weeks_unemployed", - "would_claim_wic": "would_claim_wic", - }, - household_variable_map={ - "auto_loan_balance": "auto_loan_balance", - "net_worth": "net_worth", - "tenure_type": "tenure_type", - }, - tax_unit_variable_map={ - "first_home_mortgage_balance": "first_home_mortgage_balance", - "selected_marketplace_plan_benchmark_ratio": ( - "selected_marketplace_plan_benchmark_ratio" - ), - "takes_up_aca_if_eligible": "takes_up_aca_if_eligible", - "takes_up_eitc": "takes_up_eitc", - "would_file_taxes_voluntarily": "would_file_taxes_voluntarily", - }, - spm_unit_variable_map={ - "spm_unit_pre_subsidy_childcare_expenses": "spm_unit_pre_subsidy_childcare_expenses", - "spm_unit_tenure_type": "spm_unit_tenure_type", - "takes_up_snap_if_eligible": "takes_up_snap_if_eligible", - }, - ) - - assert arrays["ssn_card_type"]["2024"].tolist() == [b"CITIZEN"] - assert arrays["immigration_status_str"]["2024"].tolist() == [b"CITIZEN"] - assert arrays["auto_loan_balance"]["2024"].tolist() == [0.0] - assert arrays["net_worth"]["2024"].tolist() == [0] - assert arrays["tenure_type"]["2024"].tolist() == [b"NONE"] - assert arrays["takes_up_early_head_start_if_eligible"]["2024"].tolist() == [ - True - ] - assert arrays["weeks_unemployed"]["2024"].tolist() == [0] - assert arrays["would_claim_wic"]["2024"].tolist() == [True] - assert arrays["first_home_mortgage_balance"]["2024"].tolist() == [0.0] - assert arrays["selected_marketplace_plan_benchmark_ratio"]["2024"].tolist() == [ - 1.0 - ] - assert arrays["takes_up_aca_if_eligible"]["2024"].tolist() == [True] - assert arrays["takes_up_eitc"]["2024"].tolist() == [True] - assert 
arrays["would_file_taxes_voluntarily"]["2024"].tolist() == [False] - assert arrays["spm_unit_pre_subsidy_childcare_expenses"]["2024"].tolist() == [0] - assert arrays["spm_unit_tenure_type"]["2024"].tolist() == [b"RENTER"] - assert arrays["takes_up_snap_if_eligible"]["2024"].tolist() == [True] - - def test_build_time_period_arrays_normalizes_numeric_tenure_codes(self): - tables = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [10, 20, 30], - "household_weight": [1.0, 1.0, 1.0], - "tenure_type": [0, 1, 2], - } - ), - persons=pd.DataFrame( - { - "person_id": [1, 2, 3], - "household_id": [10, 20, 30], - "spm_unit_id": [100, 200, 300], - } - ), - spm_units=pd.DataFrame( - { - "spm_unit_id": [100, 200, 300], - "household_id": [10, 20, 30], - "spm_unit_tenure_type": [0, 1, 2], - } - ), - ) - - arrays = build_policyengine_us_time_period_arrays( - tables, - period=2024, - household_variable_map={"tenure_type": "tenure_type"}, - spm_unit_variable_map={"spm_unit_tenure_type": "spm_unit_tenure_type"}, - ) - - assert arrays["tenure_type"]["2024"].tolist() == [ - b"NONE", - b"OWNED_WITH_MORTGAGE", - b"RENTED", - ] - assert arrays["spm_unit_tenure_type"]["2024"].tolist() == [ - b"RENTER", - b"OWNER_WITH_MORTGAGE", - b"RENTER", - ] - - -class TestUSPipelinePolicyEngineTargets: - def test_build_policyengine_continuous_targets_uses_adapter(self): - seed_data = pd.DataFrame({"snap": [10.0, 20.0], "income": [1.0, 2.0]}) - pipeline = USMicroplexPipeline(USMicroplexBuildConfig()) - - class FakeAdapter: - def compute_targets(self, specs): - assert len(specs) == 1 - return {"snap_total": 42.0} - - targets = pipeline.build_policyengine_continuous_targets( - seed_data=seed_data, - adapter=FakeAdapter(), - quantity_targets=( - PolicyEngineUSQuantityTarget( - name="snap_total", - variable="snap", - column="snap", - period=2024, - ), - ), - ) - - assert targets == {"snap": 42.0} - - def 
test_build_targets_requires_dataset_when_policyengine_targets_configured(self): - households = pd.DataFrame( - { - "household_id": [1], - "state_fips": [6], - "hh_weight": [100.0], - "tenure": [1], - } - ) - persons = pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "age": [34], - "sex": [1], - "education": [3], - "employment_status": [1], - "income": [55_000.0], - "snap": [1_200.0], - } - ) - config = USMicroplexBuildConfig( - policyengine_quantity_targets=( - PolicyEngineUSQuantityTarget( - name="snap_total", - variable="snap", - column="snap", - period=2024, - ), - ), - ) - pipeline = USMicroplexPipeline(config) - seed = pipeline.prepare_seed_data(persons, households) - - with pytest.raises(ValueError, match="policyengine_dataset"): - pipeline.build_targets(seed) - - -class TestPolicyEngineImportBoundaries: - def test_policyengine_submodule_import_does_not_require_polars(self): - src_dir = Path(__file__).resolve().parents[2] / "src" - code = f""" -import importlib -import sys - -sys.path.insert(0, {src_dir.as_posix()!r}) -sys.modules['polars'] = None -module = importlib.import_module('microplex_us.policyengine.us') -print(module.__name__) -""" - - result = subprocess.run( - [sys.executable, "-c", code], - capture_output=True, - text=True, - check=False, - ) - - assert result.returncode == 0, result.stderr - assert result.stdout.strip() == "microplex_us.policyengine.us" diff --git a/tests/policyengine/test_us_pipeline_checkpoint.py b/tests/policyengine/test_us_pipeline_checkpoint.py deleted file mode 100644 index 45579954..00000000 --- a/tests/policyengine/test_us_pipeline_checkpoint.py +++ /dev/null @@ -1,163 +0,0 @@ -"""US pipeline checkpoint save/load tests. - -The pipeline takes ~11 hours to synthesize + impute + build PE tables -before calibration even starts. Then PE microsim materializes target -variables (~30 min) before calibration fits. 
If any later stage fails -(OOM, bad config, disk full, sparsity collapse), we want to iterate -without re-paying earlier work. - -``save_us_pipeline_checkpoint`` and ``load_us_pipeline_checkpoint`` -round-trip a ``PolicyEngineUSEntityTableBundle`` at a named pipeline -stage so a downstream rerun can resume from that point. - -These tests drive: - -1. Basic round-trip equivalence at each stage. -2. Partial bundles (some entity tables ``None``) round-trip correctly. -3. Metadata file is written alongside the parquet files and contains - enough info to validate the bundle (row counts, column names, stage). -4. Load from a missing path raises a clear error. -5. Save with invalid stage raises. -6. Loading with ``expected_stage`` mismatch raises. -7. Saving twice to the same path replaces the earlier snapshot. -""" - -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - load_us_pipeline_checkpoint, - save_us_pipeline_checkpoint, -) - - -def _make_bundle(n: int = 50, seed: int = 0) -> PolicyEngineUSEntityTableBundle: - rng = np.random.default_rng(seed) - household_ids = np.arange(n) + 1 - households = pd.DataFrame( - { - "household_id": household_ids, - "household_weight": rng.uniform(0.5, 2.0, size=n), - "state_fips": rng.integers(1, 57, size=n), - } - ) - persons = pd.DataFrame( - { - "person_id": household_ids * 10, - "household_id": household_ids, - "age": rng.integers(0, 85, size=n), - "employment_income": rng.uniform(0, 200_000, size=n), - } - ) - tax_units = pd.DataFrame( - { - "tax_unit_id": household_ids * 100, - "household_id": household_ids, - "filing_status": rng.choice(["SINGLE", "JOINT"], size=n), - } - ) - return PolicyEngineUSEntityTableBundle( - households=households, - persons=persons, - tax_units=tax_units, - spm_units=None, - families=None, - marital_units=None, - ) - - -class TestUSPipelineCheckpoint: - 
@pytest.mark.parametrize("stage", ["post_imputation", "post_microsim"]) - def test_full_roundtrip_equivalent(self, tmp_path: Path, stage: str) -> None: - bundle = _make_bundle(n=100) - save_us_pipeline_checkpoint(bundle, tmp_path / "checkpoint", stage=stage) - loaded, metadata = load_us_pipeline_checkpoint(tmp_path / "checkpoint") - - pd.testing.assert_frame_equal(loaded.households, bundle.households) - pd.testing.assert_frame_equal(loaded.persons, bundle.persons) - pd.testing.assert_frame_equal(loaded.tax_units, bundle.tax_units) - assert loaded.spm_units is None - assert loaded.families is None - assert loaded.marital_units is None - assert metadata["stage"] == stage - - def test_partial_bundle_roundtrip(self, tmp_path: Path) -> None: - """A households-only bundle (no other entity tables) round-trips.""" - households = pd.DataFrame( - {"household_id": [1, 2, 3], "household_weight": [1.0, 2.0, 3.0]} - ) - bundle = PolicyEngineUSEntityTableBundle( - households=households, - persons=None, - tax_units=None, - spm_units=None, - families=None, - marital_units=None, - ) - save_us_pipeline_checkpoint( - bundle, tmp_path / "checkpoint", stage="post_imputation" - ) - loaded, _ = load_us_pipeline_checkpoint(tmp_path / "checkpoint") - - pd.testing.assert_frame_equal(loaded.households, bundle.households) - assert loaded.persons is None - assert loaded.tax_units is None - - def test_metadata_written_with_row_counts(self, tmp_path: Path) -> None: - bundle = _make_bundle(n=75) - save_us_pipeline_checkpoint( - bundle, tmp_path / "checkpoint", stage="post_microsim" - ) - - metadata_path = tmp_path / "checkpoint" / "metadata.json" - assert metadata_path.exists() - - import json - - metadata = json.loads(metadata_path.read_text()) - assert metadata["stage"] == "post_microsim" - assert metadata["households"]["rows"] == 75 - assert "household_id" in metadata["households"]["columns"] - assert metadata["persons"]["rows"] == 75 - assert metadata["tax_units"]["rows"] == 75 - assert 
metadata["spm_units"] is None - - def test_load_missing_path_raises(self, tmp_path: Path) -> None: - with pytest.raises(FileNotFoundError, match="US pipeline checkpoint"): - load_us_pipeline_checkpoint(tmp_path / "does_not_exist") - - def test_save_with_invalid_stage_raises(self, tmp_path: Path) -> None: - bundle = _make_bundle(n=5) - with pytest.raises(ValueError, match="stage must be one of"): - save_us_pipeline_checkpoint(bundle, tmp_path / "checkpoint", stage="bogus") # type: ignore[arg-type] - - def test_load_with_stage_mismatch_raises(self, tmp_path: Path) -> None: - bundle = _make_bundle(n=5) - save_us_pipeline_checkpoint( - bundle, tmp_path / "checkpoint", stage="post_imputation" - ) - with pytest.raises(ValueError, match="expected 'post_microsim'"): - load_us_pipeline_checkpoint( - tmp_path / "checkpoint", expected_stage="post_microsim" - ) - - def test_save_overwrites_existing(self, tmp_path: Path) -> None: - first = _make_bundle(n=10, seed=0) - second = _make_bundle(n=20, seed=1) - - save_us_pipeline_checkpoint( - first, tmp_path / "checkpoint", stage="post_imputation" - ) - save_us_pipeline_checkpoint( - second, tmp_path / "checkpoint", stage="post_imputation" - ) - - loaded, _ = load_us_pipeline_checkpoint(tmp_path / "checkpoint") - assert len(loaded.households) == 20 - pd.testing.assert_frame_equal(loaded.households, second.households) diff --git a/tests/specs/test_us_2024_spec.py b/tests/specs/test_us_2024_spec.py deleted file mode 100644 index c121e784..00000000 --- a/tests/specs/test_us_2024_spec.py +++ /dev/null @@ -1,239 +0,0 @@ -from __future__ import annotations - -import json -from collections import Counter -from importlib.resources import files -from pathlib import Path - -from microplex.spec import ( - DEMOGRAPHICS_TOKEN, - ImputationOrder, - SpineMethod, - VariableOperationKind, - load_spec, -) - -from microplex_us.pipelines.us import ( - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES, - PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES, - 
PUF_SUPPORT_CLONE_SPECIAL_VARIABLES, -) -from microplex_us.variables import PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS - -SPEC_PATH = Path(str(files("microplex_us.specs").joinpath("us-2024.yaml"))) -CONTRACT_PATH = Path( - str(files("microplex_us.pipelines").joinpath("ecps_export_contract.json")) -) - - -def _spec(): - return load_spec(SPEC_PATH) - - -def _required_contract_variables() -> set[str]: - payload = json.loads(CONTRACT_PATH.read_text(encoding="utf-8")) - return set(payload["required"]) - - -def _declared_imputation_variables(spec) -> set[str]: - return {name for step in spec.imputation for name in step.vars} - - -def test_us_2024_spec_loads_and_names_release_surface() -> None: - spec = _spec() - - assert spec.meta.country == "us" - assert spec.meta.model_year == 2024 - assert spec.meta.policyengine_model == "policyengine-us" - assert spec.sources["cps_asec"].dataset == "cps_asec_2025_calendar_2024" - assert spec.sources["puf"].dataset == "puf_2024" - assert set(spec.sources) == {"cps_asec", "puf", "acs", "sipp", "scf"} - - assert spec.targets is not None - assert spec.targets.arch.country == "us" - assert spec.targets.arch.model_year == 2024 - assert spec.targets.arch.target_profile == "pe_native_broad" - assert ( - spec.targets.arch.resolved_calibration_target_profile - == "pe_native_broad_source_backed" - ) - assert spec.calibrate is not None - assert spec.calibrate.loss == "pe_native_bucketed_huber_v1" - assert spec.calibrate.method.value == "apg" - - -def test_us_2024_spec_declares_ecps_clone_spine() -> None: - spec = _spec() - - assert spec.spine.base == "cps_asec" - assert spec.spine.method is SpineMethod.CLONE - assert spec.spine.clone.seed == 20260529 - assert spec.spine.passthrough_half.name == "cps_keep" - assert spec.spine.passthrough_half.keep == "all" - assert spec.spine.synthetic_half.name == "synthetic_puf" - assert spec.spine.synthetic_half.strip_to == [DEMOGRAPHICS_TOKEN] - - -def test_us_2024_spec_declares_demographic_only_puf_synthesis() 
-> None: - spec = _spec() - all_puf_vars = list( - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES + PUF_SUPPORT_CLONE_SPECIAL_VARIABLES - ) - - synthetic, cps_fill, cps_override = spec.imputation - - assert synthetic.onto == "synthetic_puf" - assert synthetic.from_ == "puf" - assert synthetic.vars == all_puf_vars - assert synthetic.condition_on == [DEMOGRAPHICS_TOKEN] - assert synthetic.order is ImputationOrder.SPINE_FIRST - assert synthetic.synthesize is True - - assert cps_fill.onto == "cps_keep" - assert cps_fill.from_ == "puf" - assert cps_fill.vars == all_puf_vars - assert cps_fill.condition_on == [DEMOGRAPHICS_TOKEN] - assert cps_fill.synthesize is False - - assert cps_override.onto == "cps_keep" - assert cps_override.from_ == "puf" - assert cps_override.vars == list(PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES) - assert cps_override.condition_on == [DEMOGRAPHICS_TOKEN] - assert cps_override.synthesize is True - - assert set(PUF_SUPPORT_CLONE_OVERRIDDEN_VARIABLES).issubset( - PUF_SUPPORT_CLONE_IMPUTED_VARIABLES - ) - assert "employment_income" in synthetic.vars - assert "employment_income" not in cps_override.vars - assert "employment_income" not in synthetic.condition_on - assert tuple(PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS) == ( - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ) - - -def test_us_2024_spec_declares_provenance_for_every_required_export() -> None: - spec = _spec() - required = _required_contract_variables() - declared = _declared_imputation_variables(spec) - - assert set(spec.variables) == required | declared - - for name in sorted(required | declared): - variable = spec.variables[name] - assert variable.temporary is True, name - assert variable.entity, name - assert variable.role, name - assert variable.entity in { - "person", - "household", - "tax_unit", - "spm_unit", - "family", - }, name - for system in ("ecps", "mp_legacy", "mp_spec"): - provenance = 
getattr(variable, system) - assert provenance is not None, f"{name}.{system}" - assert provenance.method, f"{name}.{system}.method" - assert provenance.code, f"{name}.{system}.code" - assert all(ref.path for ref in provenance.code), f"{name}.{system}.code" - for ref in provenance.code: - symbol_tokens = (ref.symbol or "").replace("/", " ").split() - assert "POLICYENGINE_US_EXPORT_VARIABLES" not in symbol_tokens, ( - f"{name}.{system}.code" - ) - assert variable.mp_spec.operation is not None, f"{name}.mp_spec.operation" - - -def test_us_2024_spec_operation_kinds_make_python_provenance_temporary() -> None: - spec = _spec() - - operation_counts = Counter( - variable.mp_spec.operation.kind for variable in spec.variables.values() - ) - - assert operation_counts == { - VariableOperationKind.OPEN_DECISION: 116, - VariableOperationKind.IMPUTE: 100, - VariableOperationKind.PASSTHROUGH: 31, - VariableOperationKind.RERANDOMIZE_TAKEUP: 13, - VariableOperationKind.STRUCTURAL_EXPORT: 12, - VariableOperationKind.ENCODE_GEOID: 5, - VariableOperationKind.DERIVE: 1, - } - - puf_imputed = { - name - for name, variable in spec.variables.items() - if variable.role and variable.role.startswith("puf_imputed") - } - assert puf_imputed - assert { - spec.variables[name].mp_spec.operation.source - for name in puf_imputed - } == {"puf"} - - unresolved = [ - name - for name, variable in spec.variables.items() - if variable.mp_spec.operation.kind is VariableOperationKind.OPEN_DECISION - ] - assert len(unresolved) == 116 - assert "net_worth" in unresolved - - -def test_us_2024_spec_covers_manifest_gap_families() -> None: - spec = _spec() - variables = spec.variables - - scf_components = {name for name in variables if name.startswith("scf_")} - reported_health = {name for name in variables if name.startswith("reported_")} - takeup_inputs = { - name - for name in variables - if name.startswith("takes_up_") or name.startswith("would_") - } - - assert len(scf_components) == 19 - assert 
len(reported_health) == 21 - assert len(takeup_inputs) == 13 - assert { - "state_fips", - "county_fips", - "block_geoid", - "tract_geoid", - "congressional_district_geoid", - "in_nyc", - } <= set(variables) - - net_worth = variables["net_worth"] - assert net_worth.role == "net_worth_open_decision" - assert net_worth.ecps is not None - assert "direct SCF" in net_worth.ecps.method - assert net_worth.mp_spec is not None - assert "OPEN" in net_worth.mp_spec.method - - for name in ( - "social_security", - "self_employment_income", - "rental_income", - "taxable_pension_income", - "alimony_income", - ): - variable = variables[name] - assert variable.mp_spec is not None - assert "keep-CPS collision" in (variable.mp_spec.notes or "") - - weeks_unemployed = variables["weeks_unemployed"] - assert weeks_unemployed.mp_spec is not None - assert "clip[0,52]" in (weeks_unemployed.mp_spec.notes or "") - - -def test_us_2024_spec_keeps_forbes_out_of_replication_baseline() -> None: - assert "forbes" not in SPEC_PATH.read_text(encoding="utf-8").lower() diff --git a/tests/targets/test_aca_ptc.py b/tests/targets/test_aca_ptc.py deleted file mode 100644 index 1927013c..00000000 --- a/tests/targets/test_aca_ptc.py +++ /dev/null @@ -1,222 +0,0 @@ -from __future__ import annotations - -import csv -import json -from pathlib import Path -from typing import Any - -import pytest - -from microplex_us.targets import ( - ACA_AVERAGE_MONTHLY_APTC_CONCEPT, - ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT, - ACAPTCMultiplierInput, - aca_ptc_multiplier_inputs_from_arch_consumer_facts, - build_aca_ptc_multiplier_rows, - load_arch_consumer_fact_jsonl_rows, - write_policyengine_aca_ptc_multiplier_csv, -) -from microplex_us.targets.aca_ptc import main - - -def test_build_aca_ptc_multiplier_rows_matches_policyengine_formula() -> None: - rows = build_aca_ptc_multiplier_rows( - [ - ACAPTCMultiplierInput( - state="California", - enroll_base=1_701_375, - enroll_target=1_795_695, - aptc_base=459, - 
aptc_target=526, - ) - ] - ) - - row = rows[0] - assert row.vol_mult == pytest.approx(1_795_695 / 1_701_375) - assert row.val_mult == pytest.approx(526 / 459) - assert row.amount_mult == pytest.approx((1_795_695 / 1_701_375) * (526 / 459)) - assert row.target_factors() == { - "tax_unit_count": pytest.approx(1_795_695 / 1_701_375), - "aca_ptc": pytest.approx((1_795_695 / 1_701_375) * (526 / 459)), - } - - -def test_arch_consumer_fact_inputs_use_oep_with_effectuated_fallback() -> None: - facts = [ - _enrollment_fact("California", "ca", 2022, 1_701_375), - _enrollment_fact("California", "ca", 2024, 1_795_695), - _oep_aptc_fact("California", "ca", 2022, 459), - _effectuated_aptc_fact("California", "ca", 2022, 469.44), - _oep_aptc_fact("California", "ca", 2024, 526), - _enrollment_fact("Nevada", "nv", 2022, 90_397), - _enrollment_fact("Nevada", "nv", 2024, 92_949), - _effectuated_aptc_fact("Nevada", "nv", 2022, 429.75), - _oep_aptc_fact("Nevada", "nv", 2024, 438), - ] - - inputs = aca_ptc_multiplier_inputs_from_arch_consumer_facts(facts) - - by_state = {item.state: item for item in inputs} - assert by_state["California"].aptc_base == 459 - assert by_state["California"].aptc_base_source_kind == "oep" - assert by_state["Nevada"].aptc_base == 429.75 - assert by_state["Nevada"].aptc_base_source_kind == "effectuated" - - rows = build_aca_ptc_multiplier_rows(inputs) - nevada = {row.state: row for row in rows}["Nevada"] - assert nevada.vol_mult == pytest.approx(92_949 / 90_397) - assert nevada.val_mult == pytest.approx(438 / 429.75) - assert nevada.val_mult != pytest.approx(438 / 435) - - -def test_arch_consumer_fact_inputs_can_require_oep_base_aptc() -> None: - facts = [ - _enrollment_fact("Nevada", "nv", 2022, 90_397), - _enrollment_fact("Nevada", "nv", 2024, 92_949), - _effectuated_aptc_fact("Nevada", "nv", 2022, 429.75), - _oep_aptc_fact("Nevada", "nv", 2024, 438), - ] - - with pytest.raises(ValueError, match="Nevada 2022 average APTC"): - 
aca_ptc_multiplier_inputs_from_arch_consumer_facts( - facts, - base_aptc_policy="oep", - ) - - -def test_write_policyengine_aca_ptc_multiplier_csv(tmp_path: Path) -> None: - rows = build_aca_ptc_multiplier_rows( - [ - ACAPTCMultiplierInput( - state="Nevada", - enroll_base=90_397, - enroll_target=92_949, - aptc_base=429.75, - aptc_target=438, - ) - ] - ) - path = tmp_path / "aca_ptc_multipliers_2022_2024.csv" - - write_policyengine_aca_ptc_multiplier_csv(rows, path) - - with path.open() as file: - records = list(csv.DictReader(file)) - assert records[0]["state"] == "Nevada" - assert records[0]["enroll_2022"] == "90397" - assert records[0]["aptc_2024"] == "438" - assert float(records[0]["enroll_2022"]) == 90_397 - assert float(records[0]["aptc_2022"]) == 429.75 - assert float(records[0]["vol_mult"]) == pytest.approx(92_949 / 90_397) - assert float(records[0]["val_mult"]) == pytest.approx(438 / 429.75) - - -def test_main_builds_policyengine_csv_from_consumer_fact_jsonl( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - consumer_facts = tmp_path / "consumer_facts.jsonl" - _write_jsonl( - consumer_facts, - [ - _enrollment_fact("Nevada", "nv", 2022, 90_397), - _enrollment_fact("Nevada", "nv", 2024, 92_949), - _effectuated_aptc_fact("Nevada", "nv", 2022, 429.75), - _oep_aptc_fact("Nevada", "nv", 2024, 438), - ], - ) - out = tmp_path / "aca_ptc_multipliers_2022_2024.csv" - - assert main([str(consumer_facts), "--out", str(out)]) == 0 - - captured = capsys.readouterr() - assert f"Wrote 1 ACA PTC multiplier rows to {out}" in captured.out - rows = list(csv.DictReader(out.open())) - assert rows[0]["state"] == "Nevada" - assert rows[0]["aptc_2022"] == "429.75" - - -def test_load_arch_consumer_fact_jsonl_rows_rejects_non_consumer_rows( - tmp_path: Path, -) -> None: - path = tmp_path / "facts.jsonl" - path.write_text(json.dumps({"schema_version": "arch.fact.v1"}) + "\n") - - with pytest.raises(ValueError, match="Unsupported Arch consumer fact schema"): - 
load_arch_consumer_fact_jsonl_rows([path]) - - -def _enrollment_fact( - state: str, - state_abbr: str, - year: int, - value: float, -) -> dict[str, Any]: - return _fact( - state=state, - period=year, - value=value, - concept=ACA_MARKETPLACE_EFFECTUATED_ENROLLMENT_CONCEPT, - source_record_id=( - f"kff.marketplace_effectuated_enrollment.{year}.state." - f"{state_abbr}.total_effectuated_marketplace_enrollment" - ), - ) - - -def _oep_aptc_fact( - state: str, - state_abbr: str, - year: int, - value: float, -) -> dict[str, Any]: - return _fact( - state=state, - period=year, - value=value, - concept=ACA_AVERAGE_MONTHLY_APTC_CONCEPT, - source_record_id=( - f"cms_aca.oep{year}.state_marketplace.{state_abbr}.average_monthly_aptc" - ), - ) - - -def _effectuated_aptc_fact( - state: str, - state_abbr: str, - year: int, - value: float, -) -> dict[str, Any]: - return _fact( - state=state, - period=year, - value=value, - concept=ACA_AVERAGE_MONTHLY_APTC_CONCEPT, - source_record_id=( - f"cms_aca.effectuated_enrollment.{year}.state_marketplace." 
- f"{state_abbr}.average_monthly_aptc" - ), - ) - - -def _fact( - *, - state: str, - period: int, - value: float, - concept: str, - source_record_id: str, -) -> dict[str, Any]: - return { - "schema_version": "arch.consumer_fact.v1", - "period": {"type": "calendar_year", "value": period}, - "geography": {"level": "state", "name": state}, - "observed_measure": {"source_concept": concept}, - "lineage": {"source_record_id": source_record_id}, - "value": value, - } - - -def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: - path.write_text("\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n") diff --git a/tests/targets/test_arch.py b/tests/targets/test_arch.py deleted file mode 100644 index 85a80d13..00000000 --- a/tests/targets/test_arch.py +++ /dev/null @@ -1,4145 +0,0 @@ -from __future__ import annotations - -import json -import sqlite3 -from pathlib import Path - -import pytest -from microplex.core import EntityType -from microplex.targets import TargetAggregation, TargetQuery - -from microplex_us.geography import US_STATE_ABBR_BY_FIPS -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline -from microplex_us.policyengine.target_profiles import PolicyEngineUSTargetCell -from microplex_us.targets import ( - ArchConsumerFactJSONLTargetProvider, - ArchSQLiteTargetProvider, - summarize_arch_target_gap_queue, - summarize_arch_target_profile_coverage, -) -from microplex_us.targets.arch import ( - ArchTargetRecord, - arch_target_record_to_canonical_spec, - main_gaps, - main_refresh, -) - - -def _create_arch_targets_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - id INTEGER PRIMARY KEY, - name TEXT, - jurisdiction TEXT, - parent_id INTEGER, - definition_hash TEXT - ); - - CREATE TABLE stratum_constraints ( - id INTEGER PRIMARY KEY, - stratum_id INTEGER NOT NULL, - variable TEXT NOT NULL, - operator TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE 
targets ( - id INTEGER PRIMARY KEY, - stratum_id INTEGER NOT NULL, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - value REAL NOT NULL, - target_type TEXT NOT NULL, - geographic_level TEXT, - source TEXT NOT NULL, - source_table TEXT, - source_url TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - [ - (1, "US", "US", "root"), - (2, "US All Filers", "US", "filers"), - (3, "CA Filers AGI $50k-$75k", "US", "ca_50k_75k"), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (2, "is_tax_filer", "==", "1"), - (3, "is_tax_filer", "==", "1"), - (3, "state_fips", "==", "06"), - (3, "agi_bracket", "==", "50k_to_75k"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 1, - 2, - "tax_exempt_interest_returns", - 2023, - 10.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 2, - 2, - "tax_exempt_interest_amount", - 2023, - 100.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 3, - 2, - "adjusted_gross_income", - 2022, - 1_000.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 4, - 2, - "adjusted_gross_income", - 2023, - 1_100.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 5, - 1, - "labor_force_count", - 2023, - 100.0, - "COUNT", - None, - "BLS", - "BLS", - None, - None, - ), - (6, 1, "labor_force", 2024, 110.0, "COUNT", None, "CBO", "CBO", None, None), - ( - 7, - 3, - "tax_unit_count", - 2023, - 20.0, - "COUNT", - "STATE", - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - -def _insert_multi_domain_soi_targets(path: Path) -> None: - conn = sqlite3.connect(path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "US Filers AGI $50k-$75k", "US", "national_50k_75k"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "agi_bracket", "==", "50k_to_75k"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 4, - "tax_exempt_interest_returns", - 2023, - 5.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 9, - 4, - "adjusted_gross_income", - 2023, - 500.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - -def _insert_w2_tip_income_target(path: Path) -> None: - conn = sqlite3.connect(path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (5, "US taxpayers with Form W-2 social security tips", "US", "w2_tips"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - (5, "tip_income", ">", "0"), - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - 10, - 5, - "tip_income", - 2020, - 80.0, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "W-2", - None, - None, - ), - ( - 11, - 2, - "adjusted_gross_income", - 2020, - 800.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - -def _insert_irs_soi_itemized_deduction_targets(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 12, - 2, - "medical_amount", - 2023, - 100.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ( - 13, - 2, - "real_estate_taxes_amount", - 2023, - 200.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ( - 14, - 2, - "salt_amount", - 2023, - 300.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ( - 15, - 2, - "medical_claims", - 2023, - 10.0, - "COUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ( - 16, - 2, - "real_estate_taxes_claims", - 2023, - 20.0, - "COUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ( - 17, - 2, - "salt_claims", - 2023, - 30.0, - "COUNT", - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - -def _insert_complete_state_rollup_targets(path: Path) -> None: - conn = sqlite3.connect(path) - state_fips_values = sorted( - state_fips for state_fips in US_STATE_ABBR_BY_FIPS if state_fips != "72" - ) - ctc_strata = [ - (1_000 + index, f"{state_fips} CTC filers", "US", f"ctc_{state_fips}") - for index, state_fips in enumerate(state_fips_values) - ] - aca_strata = [ - (2_000 + index, f"{state_fips} ACA marketplace", "US", f"aca_{state_fips}") - for index, state_fips in enumerate(state_fips_values) - ] - conn.executemany( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - [*ctc_strata, *aca_strata], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - [ - *((stratum_id, "is_tax_filer", "==", "1") for stratum_id, *_ in ctc_strata), - *( - (stratum_id, "state_fips", "==", state_fips) - for stratum_id, _, _, definition_hash in ctc_strata - for state_fips in (definition_hash.removeprefix("ctc_"),) - ), - *( - (stratum_id, "state_fips", "==", state_fips) - for stratum_id, _, _, definition_hash in aca_strata - for state_fips in (definition_hash.removeprefix("aca_"),) - ), - ], - ) - ctc_targets = [ - ( - 10_000 + index * 2, - stratum_id, - "ctc_amount", - 2024, - 1_000.0 + index, - "AMOUNT", - None, - "IRS_SOI", - "State Data FY", - None, - None, - ) - for index, (stratum_id, *_rest) in enumerate(ctc_strata) - ] - ctc_count_targets = [ - ( - 10_001 + index * 2, - stratum_id, - "ctc_claims", - 2024, - 100.0 + index, - "COUNT", - None, - "IRS_SOI", - "State Data FY", - None, - None, - ) - for index, (stratum_id, *_rest) in enumerate(ctc_strata) - ] - deduction_targets = [ - ( - 30_000 + index * 4 + offset, - stratum_id, - variable, - 2024, - value + index, - target_type, - None, - "IRS_SOI", - "SOI Individual Returns - Itemized Deductions", - None, - None, - ) - for index, (stratum_id, *_rest) in enumerate(ctc_strata) - for offset, (variable, value, target_type) in enumerate( - ( - ("qbi_amount", 2_000.0, "AMOUNT"), - ("qbi_claims", 200.0, "COUNT"), - ("medical_amount", 300.0, "AMOUNT"), - ("medical_claims", 30.0, "COUNT"), - ) - ) - ] - aca_targets = [ - ( - 20_000 + index, - stratum_id, - "aca_aptc_amount", - 2024, - 10_000.0 + index, - "AMOUNT", - "STATE", - "CMS_ACA", - "2024 OEP State-Level Public Use File", - None, - None, - ) - for index, (stratum_id, *_rest) in enumerate(aca_strata) - ] - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [*ctc_targets, *ctc_count_targets, *deduction_targets, *aca_targets], - ) - conn.commit() - conn.close() - - -def test_arch_provider_ages_soi_and_maps_return_counts_to_positive_amounts(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["tax_exempt_interest_income"], - "entity_overrides": { - "tax_exempt_interest_income": EntityType.PERSON, - }, - }, - ) - ) - - assert len(target_set.targets) == 2 - count_target = next( - target - for target in target_set.targets - if target.aggregation is TargetAggregation.COUNT - ) - amount_target = next( - target - for target in target_set.targets - if target.aggregation is TargetAggregation.SUM - ) - assert count_target.entity is EntityType.TAX_UNIT - assert count_target.name == "arch_target_1" - assert count_target.description == ( - "Tax-exempt interest returns for US All Filers (IRS SOI, 2024)" - ) - assert count_target.metadata["display_label"] == count_target.description - assert count_target.metadata["target_semantic"] == "count" - assert count_target.metadata["model_variable_role"] == "preserved_input" - assert count_target.metadata["variable"] == "tax_unit_count" - assert count_target.measure is None - assert count_target.value == pytest.approx(11.0) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in count_target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("tax_exempt_interest_income", ">", 0), - } - - assert amount_target.entity is EntityType.PERSON - assert amount_target.description == ( - "Tax-exempt interest amount for US All Filers (IRS SOI, 2024)" - ) - assert amount_target.metadata["display_label"] == amount_target.description - assert amount_target.metadata["target_semantic"] == "amount" - assert 
amount_target.metadata["model_variable_role"] == "preserved_input" - assert amount_target.measure == "tax_exempt_interest_income" - assert amount_target.value == pytest.approx(110.0) - assert amount_target.metadata["arch_source_period"] == 2023 - assert amount_target.metadata["arch_aging_amount_method"] == ( - "soi_total_agi_last_growth_extrapolation" - ) - - -def test_arch_provider_maps_agi_bracket_constraints_to_agi_ranges(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["tax_unit_count"], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == {7} - bracket_target = next( - target for target in target_set.targets if target.metadata["target_id"] == 7 - ) - assert bracket_target.name == "arch_target_7" - assert bracket_target.description == ( - "Tax unit count for CA Filers AGI $50k-$75k (IRS SOI, 2024)" - ) - assert bracket_target.metadata["display_label"] == bracket_target.description - assert bracket_target.value == pytest.approx(22.0) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in bracket_target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("state_fips", "==", "06"), - ("adjusted_gross_income", ">=", 50_000), - ("adjusted_gross_income", "<", 75_000), - } - - -def test_arch_provider_includes_parent_stratum_constraints(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, parent_id, definition_hash) - VALUES (?, ?, ?, ?, ?) 
- """, - (4, "US Filers AGI $75k-$100k", "US", 2, "national_75k_100k"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - (4, "agi_bracket", "==", "75k_to_100k"), - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "adjusted_gross_income", - 2023, - 750.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["adjusted_gross_income"], - }, - ) - ) - - child_target = next( - target for target in target_set.targets if target.metadata["target_id"] == 8 - ) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in child_target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("adjusted_gross_income", ">=", 75_000), - ("adjusted_gross_income", "<", 100_000), - } - - -def test_arch_provider_honors_policyengine_target_cells(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "tax_exempt_interest_income", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [1] - target = target_set.targets[0] - assert target.aggregation is TargetAggregation.COUNT - assert target.measure is None - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - 
provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_exempt_interest_income", - "geo_level": "national", - "domain_variable": "tax_exempt_interest_income", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [2] - - -def test_arch_provider_target_cell_domain_match_is_exact(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - _insert_multi_domain_soi_targets(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "tax_exempt_interest_income", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [1] - - -def test_arch_provider_matches_aliased_amount_self_domain_target_cells(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - ( - 8, - 2, - "income_tax_liability", - 2023, - 80.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "income_tax", - "geo_level": "national", - "domain_variable": "income_tax", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "income_tax_positive", - "geo_level": "national", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - assert target_set.targets[0].measure == "income_tax" - assert target_set.targets[0].metadata["arch_variable"] == "income_tax_liability" - - -def test_arch_provider_matches_current_profile_aliases(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 2, - "alimony_received_amount", - 2023, - 20.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 9, - 2, - "alimony_paid_amount", - 2023, - 25.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 10, - 2, - "schedule_c_income_amount", - 2023, - 30.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 11, - 1, - "medicaid_total_enrollment", - 2024, - 40.0, - "COUNT", - None, - "CMS_MEDICAID", - "CMS", - None, - None, - ), - ( - 12, - 2, - "wages_salaries_amount", - 2023, - 50.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 13, - 2, - "wages_salaries_returns", - 2023, - 60.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 14, - 2, - "schedule_c_income_returns", - 2023, - 70.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_cells": [ - { - "variable": "alimony_income", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "alimony_expense", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "self_employment_income", - "geo_level": "national", - "domain_variable": "self_employment_income", - }, - { - "variable": "total_self_employment_income", - "geo_level": "national", - "domain_variable": "total_self_employment_income", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "total_self_employment_income", - }, - { - "variable": "person_count", - "geo_level": "national", - "domain_variable": "medicaid", - }, - { - "variable": "employment_income", - "geo_level": "national", - "domain_variable": "employment_income", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "employment_income", - }, - ], - }, - ) - ) - - assert 
{target.metadata["target_id"] for target in target_set.targets} == { - 8, - 9, - 10, - 11, - 12, - 13, - 14, - } - variables_by_id = { - target.metadata["target_id"]: target.metadata["variable"] - for target in target_set.targets - } - assert variables_by_id == { - 8: "alimony_income", - 9: "alimony_expense", - 10: "self_employment_income", - 11: "person_count", - 12: "employment_income", - 13: "tax_unit_count", - 14: "tax_unit_count", - } - - -def test_arch_target_rejects_broad_proprietors_income_as_self_employment(): - record = ArchTargetRecord( - target_id=1, - stratum_id=1, - variable="schedule_c_income_amount", - period=2024, - value=2_023_080_000_000, - target_type="AMOUNT", - geographic_level=None, - geography_id=None, - source="BEA", - source_table="NIPA annual personal income components", - source_url=None, - notes=None, - stratum_name="US", - jurisdiction="US", - constraints=(), - concept=( - "bea_nipa.proprietors_income_with_inventory_valuation_and_capital_" - "consumption_adjustments" - ), - source_concept=( - "bea_nipa.a041rc_proprietors_income_with_inventory_valuation_and_" - "capital_consumption_adjustments" - ), - ) - - with pytest.raises( - ValueError, - match="cannot be exposed as plain self_employment_income", - ): - arch_target_record_to_canonical_spec(record) - - -def test_arch_provider_maps_eitc_child_count_constraints(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "US EITC 3+ Children", "US", "eitc_3plus_children"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "eitc_qualifying_children", ">=", "3"), - ], - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "eitc_amount", - 2023, - 15.0, - "AMOUNT", - None, - "IRS_SOI", - "EITC", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "eitc", - "geo_level": "national", - "domain_variable": "eitc_child_count", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.measure == "eitc" - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("eitc_child_count", ">=", "3"), - } - - -def test_arch_provider_matches_eitc_count_and_multi_domain_cells(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - [ - (4, "US EITC 3+ Children", "US", "eitc_3plus_children"), - (5, "US AGI 1_to_10k EITC 1 Child", "US", "eitc_1_child_agi"), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "eitc_qualifying_children", ">=", "3"), - (5, "is_tax_filer", "==", "1"), - (5, "agi_bracket", "==", "1_to_10k"), - (5, "eitc_qualifying_children", "==", "1"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - 8, - 4, - "eitc_claims", - 2023, - 10.0, - "COUNT", - None, - "IRS_SOI", - "EITC", - None, - None, - ), - ( - 9, - 5, - "eitc_amount", - 2023, - 20.0, - "AMOUNT", - None, - "IRS_SOI", - "EITC", - None, - None, - ), - ( - 10, - 5, - "eitc_claims", - 2023, - 30.0, - "COUNT", - None, - "IRS_SOI", - "EITC", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "eitc_child_count", - }, - { - "variable": "eitc", - "geo_level": "national", - "domain_variable": ( - "adjusted_gross_income,eitc,eitc_child_count" - ), - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": ( - "adjusted_gross_income,eitc,eitc_child_count" - ), - }, - ], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == { - 8, - 9, - 10, - } - - -def test_arch_provider_maps_census_stc_state_income_tax(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) 
- """, - (4, "CA state government", "US", "ca_state_government"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - (4, "state_fips", "==", "06"), - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "state_individual_income_tax_collections", - 2024, - 123.0, - "AMOUNT", - "STATE", - "CENSUS_STC", - "STC T40", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_cells": [ - { - "variable": "state_income_tax", - "geo_level": "state", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.measure == "state_income_tax" - assert target.entity is EntityType.TAX_UNIT - assert target.aggregation is TargetAggregation.SUM - assert target.metadata["source"] == "CENSUS_STC" - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } == {("state_fips", "==", "06")} - - -def test_arch_provider_maps_soi_itemized_deduction_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 2, - "limited_state_local_taxes_amount", - 2024, - 122.0, - "AMOUNT", - None, - "IRS_SOI", - "Historic Table 2", - None, - None, - ), - ( - 9, - 2, - "interest_paid_deduction_amount", - 2024, - 169.0, - "AMOUNT", - None, - "IRS_SOI", - "Historic Table 2", - None, - "Composed from Schedule A interest lines.", - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "salt_deduction", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "interest_deduction", - "geo_level": "national", - "domain_variable": None, - }, - ], - }, - ) - ) - - targets_by_measure = {target.measure: target for target in target_set.targets} - assert set(targets_by_measure) == {"interest_deduction", "salt_deduction"} - - salt_target = targets_by_measure["salt_deduction"] - assert salt_target.metadata["target_id"] == 8 - assert salt_target.entity is EntityType.TAX_UNIT - assert salt_target.aggregation is TargetAggregation.SUM - - interest_target = targets_by_measure["interest_deduction"] - assert interest_target.metadata["target_id"] == 9 - assert interest_target.entity is EntityType.TAX_UNIT - assert interest_target.metadata["notes"] == ( - "Composed from Schedule A interest lines." - ) - - -def test_arch_provider_infers_geo_level_from_constraints(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "CA Filers", "US", "ca_filers"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "state_fips", "==", "06"), - ], - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "adjusted_gross_income", - 2023, - 500.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "state", - "geographic_id": "6", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - assert target_set.targets[0].metadata["geo_level"] == "state" - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["adjusted_gross_income"], - "geo_levels": ["national"], - }, - ) - ) - - assert 8 not in {target.metadata["target_id"] for target in target_set.targets} - - -def test_arch_provider_maps_program_indicator_constraints_to_support_filters(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "SNAP households", "US", "snap_households"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - (4, "snap", "==", "1"), - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - 8, - 4, - "snap_household_count", - 2024, - 10.0, - "COUNT", - None, - "USDA_SNAP", - "USDA", - None, - None, - ), - ( - 9, - 4, - "snap_benefits", - 2024, - 500.0, - "AMOUNT", - None, - "USDA_SNAP", - "USDA", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["USDA_SNAP"], - "target_cells": [ - { - "variable": "snap", - "geo_level": "national", - "domain_variable": "snap", - }, - { - "variable": "household_count", - "geo_level": "national", - "domain_variable": "snap", - }, - ], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == {8, 9} - for target in target_set.targets: - assert [ - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - ] == [("snap", ">", 0)] - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["USDA_SNAP"], - "target_cells": [ - { - "variable": "snap", - "geo_level": "national", - "domain_variable": None, - }, - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [9] - - -def test_arch_provider_normalizes_congressional_district_constraints(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) 
- """, - (4, "CA-01 Filers", "US", "ca_01_filers"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (4, "is_tax_filer", "=", "1"), - (4, "state_fips", "=", "06"), - (4, "congressional_district", "=", "01"), - ], - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "adjusted_gross_income", - 2023, - 500.0, - "AMOUNT", - "CONGRESSIONAL_DISTRICT", - "IRS_SOI", - "SOI", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "district", - "geographic_id": "0601", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.metadata["geo_level"] == "district" - assert ( - "congressional_district_geoid", - "==", - "0601", - ) in { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } - assert ( - "tax_unit_is_filer", - "==", - "1", - ) in { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["adjusted_gross_income"], - "geo_levels": ["state"], - }, - ) - ) - - assert 8 not in {target.metadata["target_id"] for target in target_set.targets} - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], 
- "variables": ["adjusted_gross_income"], - "geo_levels": ["congressional_district"], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["adjusted_gross_income"], - "geo_levels": ["congressional-district"], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - - -def test_arch_provider_no_domain_target_cell_excludes_domain_strata(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - _insert_multi_domain_soi_targets(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "national", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [4] - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "national", - "domain_variable": "", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [4] - - -def test_arch_provider_current_year_partial_soi_falls_back_to_latest_soi(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 2, - "tax_unit_count", - 2024, - 25.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 9, - 2, - "adjusted_gross_income", - 2024, - 1_200.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 10, - 2, - "income_tax_liability", - 2024, - 80.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["tax_exempt_interest_income"], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == {1, 2} - assert {target.metadata["arch_source_period"] for target in target_set.targets} == { - 2023 - } - - -def test_arch_provider_uses_latest_soi_record_per_composition(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "CA Filers", "US", "ca_filers"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "state_fips", "==", "06"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 1, - "labor_force_count", - 2022, - 100.0, - "COUNT", - None, - "BLS", - "BLS", - None, - None, - ), - ( - 9, - 2, - "tax_unit_count", - 2024, - 25.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 10, - 2, - "adjusted_gross_income", - 2024, - 1_200.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 11, - 2, - "income_tax_liability", - 2024, - 80.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 12, - 4, - "wages_salaries_amount", - 2022, - 90.0, - "AMOUNT", - "STATE", - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "employment_income", - "geo_level": "state", - "domain_variable": "employment_income", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [12] - assert target_set.targets[0].period == 2024 - assert target_set.targets[0].metadata["arch_source_period"] == 2022 - - -def test_arch_provider_maps_income_tax_before_credits_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 2, - "income_tax_before_credits_returns", - 2023, - 50.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 9, - 2, - "income_tax_before_credits_amount", - 2023, - 500.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "income_tax_before_credits", - }, - { - "variable": "income_tax_before_credits", - "geo_level": "national", - "domain_variable": "income_tax_before_credits", - }, - ], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == {8, 9} - count_target = next( - target - for target in target_set.targets - if target.aggregation is TargetAggregation.COUNT - ) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in count_target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("income_tax_before_credits", ">", 0), - } - - -def test_arch_provider_maps_real_estate_tax_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 2, - "real_estate_taxes_claims", - 2023, - 12.0, - "COUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ( - 9, - 2, - "real_estate_taxes_amount", - 2023, - 120.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "real_estate_taxes", - }, - { - "variable": "real_estate_taxes", - "geo_level": "national", - "domain_variable": "real_estate_taxes", - }, - ], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == {8, 9} - assert {target.metadata["variable"] for target in target_set.targets} == { - "real_estate_taxes", - "tax_unit_count", - } - - -def test_arch_provider_maps_aca_aptc_amount_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "CA ACA Marketplace", "US", "ca_aca"), - ) - conn.execute( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - (4, "state_fips", "==", "06"), - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 4, - "aca_aptc_amount", - 2024, - 100.0, - "AMOUNT", - "STATE", - "CMS_ACA", - "CMS OEP", - None, - None, - ), - ( - 9, - 4, - "aca_marketplace_enrollment", - 2024, - 200.0, - "COUNT", - "STATE", - "CMS_ACA", - "CMS OEP", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["CMS_ACA"], - "target_cells": [ - { - "variable": "aca_ptc", - "geo_level": "state", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - assert target_set.targets[0].measure == "aca_ptc" - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["CMS_ACA"], - "target_cells": [ - { - "variable": "person_count", - "geo_level": "state", - "domain_variable": "aca_ptc,is_aca_ptc_eligible", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [9] - - -def test_arch_provider_maps_soi_aca_ptc_return_counts(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - ( - 8, - 2, - "aca_ptc_returns", - 2023, - 7_841_370.0, - "COUNT", - None, - "IRS_SOI", - "Historic Table 2", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "used_aca_ptc", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.aggregation is TargetAggregation.COUNT - assert target.entity is EntityType.TAX_UNIT - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("aca_ptc", ">", 0), - } - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": ( - "selected_marketplace_plan_benchmark_ratio,used_aca_ptc" - ), - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - - -def test_arch_provider_maps_soi_tax_filer_individual_counts(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - (4, "CA AGI 1_to_10k", "US", "ca_agi_1_to_10k"), - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) 
- """, - [ - (4, "is_tax_filer", "==", "1"), - (4, "state_fips", "==", "06"), - (4, "agi_bracket", "==", "1_to_10k"), - ], - ) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - 8, - 4, - "tax_filer_individual_count", - 2023, - 1_930_150.0, - "COUNT", - "STATE", - "IRS_SOI", - "Historic Table 2", - None, - "SOI number of individuals does not represent full population.", - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "person_count", - "geo_level": "state", - "domain_variable": "adjusted_gross_income", - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.aggregation is TargetAggregation.COUNT - assert target.entity is EntityType.PERSON - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("state_fips", "==", "06"), - ("adjusted_gross_income", ">=", 1), - ("adjusted_gross_income", "<", 10_000), - } - - -def test_arch_provider_maps_medicaid_benefit_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - ( - 8, - 1, - "medicaid_benefits", - 2024, - 931_692_000_000.0, - "AMOUNT", - "NATIONAL", - "CMS_MEDICAID", - "CMS NHE", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["CMS_MEDICAID"], - "target_cells": [ - { - "variable": "medicaid", - "geo_level": "national", - "domain_variable": None, - } - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8] - target = target_set.targets[0] - assert target.measure == "medicaid" - assert target.entity is EntityType.PERSON - - -def test_arch_consumer_fact_provider_maps_wealth_and_part_b_targets(tmp_path): - jsonl_path = tmp_path / "consumer_facts.jsonl" - rows = [ - { - "schema_version": "arch.consumer_fact.v1", - "aggregate_fact_key": "arch.aggregate_fact.v2:net_worth", - "semantic_fact_key": "arch.semantic_fact.v2:net_worth", - "concept_alignment": { - "canonical_concept": ( - "federal_reserve.z1.households_nonprofits_net_worth" - ), - "source_concept": "federal_reserve.z1.fl152090005", - "relation": "source_label", - "authority": "arch-us", - }, - "geography": { - "id": "0100000US", - "level": "country", - }, - "label": "United States household net worth", - "observed_measure": { - "source_concept": "federal_reserve.z1.fl152090005", - "source_measure_id": "amount_outstanding", - "source_name": "federal_reserve", - "source_table": ( - "Z.1 B.101 Households and nonprofit organizations" - ), - "unit": "usd", - }, - "period": {"type": "calendar_year", "value": 2024}, - "source": { - "source_name": "federal_reserve", - "source_table": ( - "Z.1 B.101 Households and nonprofit organizations" - ), - "url": "https://www.federalreserve.gov/releases/z1/", - }, - "universe_constraints": {"domain": "household_balance_sheet"}, - "value": 169_619_200_000_000, - }, - { - "schema_version": "arch.consumer_fact.v1", - 
"aggregate_fact_key": "arch.aggregate_fact.v2:part_b", - "semantic_fact_key": "arch.semantic_fact.v2:part_b", - "concept_alignment": { - "canonical_concept": "cms_medicare.part_b_premium_income", - "source_concept": "cms_medicare.part_b_premium_income", - }, - "geography": { - "id": "0100000US", - "level": "country", - }, - "label": "United States Medicare Part B premium income", - "observed_measure": { - "source_concept": "cms_medicare.part_b_premium_income", - "source_measure_id": "actual_amount", - "source_name": "cms_medicare", - "source_table": "2025 Medicare Trustees Report Table III.C3", - "unit": "usd", - }, - "period": {"type": "calendar_year", "value": 2024}, - "source": { - "source_name": "cms_medicare", - "source_table": "2025 Medicare Trustees Report Table III.C3", - "url": "https://www.cms.gov/oact/tr/2025", - }, - "universe_constraints": { - "domain": "medicare_financing", - "constraints": [ - { - "operator": "==", - "role": "filter", - "value": "actual", - "variable": "amount_basis", - }, - { - "operator": "==", - "role": "filter", - "value": "part_b", - "variable": "medicare.part", - }, - { - "operator": "==", - "role": "filter", - "value": "premiums_from_enrollees", - "variable": "medicare.financing_component", - }, - ], - }, - "value": 139_837_000_000, - }, - ] - jsonl_path.write_text( - "".join(f"{json.dumps(row, sort_keys=True)}\n" for row in rows) - ) - - provider = ArchConsumerFactJSONLTargetProvider(jsonl_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_cells": [ - { - "variable": "net_worth", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "medicare_part_b_premiums", - "geo_level": "national", - "domain_variable": None, - }, - ], - }, - ) - ) - - targets_by_measure = {target.measure: target for target in target_set.targets} - assert set(targets_by_measure) == {"medicare_part_b_premiums", "net_worth"} - - net_worth = targets_by_measure["net_worth"] - 
assert net_worth.entity is EntityType.HOUSEHOLD - assert net_worth.aggregation is TargetAggregation.SUM - assert net_worth.value == pytest.approx(169_619_200_000_000) - assert net_worth.filters == () - assert net_worth.metadata["source"] == "FEDERAL_RESERVE" - assert net_worth.metadata["arch_source_concept"] == ( - "federal_reserve.z1.fl152090005" - ) - - part_b = targets_by_measure["medicare_part_b_premiums"] - assert part_b.entity is EntityType.PERSON - assert part_b.aggregation is TargetAggregation.SUM - assert part_b.value == pytest.approx(139_837_000_000) - assert part_b.filters == () - assert part_b.metadata["source"] == "CMS_MEDICARE" - assert part_b.metadata["arch_concept"] == "cms_medicare.part_b_premium_income" - - -def test_arch_consumer_fact_provider_maps_population_projection_targets(tmp_path): - jsonl_path = tmp_path / "consumer_facts.jsonl" - rows = [ - { - "schema_version": "arch.consumer_fact.v1", - "aggregate_fact_key": "arch.aggregate_fact.v2:age_0", - "semantic_fact_key": "arch.semantic_fact.v2:age_0", - "concept_alignment": { - "canonical_concept": "census.population_projection", - "source_concept": "census.POP_0", - "relation": "source_label", - "authority": "census", - }, - "geography": { - "id": "0100000US", - "level": "country", - }, - "label": "United States projected population age 0", - "observed_measure": { - "source_concept": "census.POP_0", - "source_measure_id": "population", - "source_name": "census_population_projections", - "source_table": ( - "2023 National Population Projections Main Series, middle series" - ), - "unit": "count", - }, - "period": {"type": "calendar_year", "value": 2025}, - "source": { - "source_name": "census_population_projections", - "source_table": ( - "2023 National Population Projections Main Series, middle series" - ), - "source_file": "np2023_d5_mid.csv", - "url": ( - "https://www2.census.gov/programs-surveys/popproj/datasets/" - "2023/2023-popproj/np2023_d5_mid.csv" - ), - }, - "universe_constraints": { 
- "domain": "population_projection", - "constraints": [ - { - "operator": "<", - "role": "filter", - "unit": "years", - "value": 1, - "variable": "age", - }, - { - "operator": ">=", - "role": "filter", - "unit": "years", - "value": 0, - "variable": "age", - }, - ], - }, - "value": 3_641_659, - }, - ] - jsonl_path.write_text( - "".join(f"{json.dumps(row, sort_keys=True)}\n" for row in rows) - ) - - provider = ArchConsumerFactJSONLTargetProvider(jsonl_path) - target_set = provider.load_target_set(TargetQuery(period=2025)) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.measure is None - assert target.entity is EntityType.PERSON - assert target.aggregation is TargetAggregation.COUNT - assert target.value == pytest.approx(3_641_659) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in target.filters - } == {("age", "<", "1"), ("age", ">=", "0")} - assert target.metadata["source"] == "CENSUS_POPULATION_PROJECTIONS" - assert target.metadata["arch_concept"] == "census.population_projection" - assert target.metadata["arch_source_concept"] == "census.POP_0" - - -def test_arch_provider_maps_ssa_benefit_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 1, - "social_security_benefits", - 2024, - 1_471_195_000_000.0, - "AMOUNT", - "NATIONAL", - "SSA", - "SSA Supplement", - None, - None, - ), - ( - 9, - 1, - "social_security_retirement_benefits", - 2024, - 1_111_728_000_000.0, - "AMOUNT", - "NATIONAL", - "SSA", - "SSA Supplement", - None, - None, - ), - ( - 10, - 1, - "ssi_payments", - 2024, - 63_079_493_000.0, - "AMOUNT", - "NATIONAL", - "SSA", - "SSA Supplement", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["SSA"], - "target_cells": [ - { - "variable": "social_security", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "social_security_retirement", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "ssi", - "geo_level": "national", - "domain_variable": None, - }, - ], - }, - ) - ) - - assert {target.metadata["target_id"] for target in target_set.targets} == { - 8, - 9, - 10, - } - assert {target.measure for target in target_set.targets} == { - "social_security", - "social_security_retirement", - "ssi", - } - assert {target.entity for target in target_set.targets} == {EntityType.PERSON} - - -def test_arch_provider_maps_tanf_cash_assistance_target(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 8, - 1, - "tanf_cash_assistance", - 2024, - 7_788_317_474.55, - "AMOUNT", - "NATIONAL", - "HHS_ACF_TANF", - "ACF TANF Financial Data", - None, - None, - ), - ( - 9, - 1, - "tanf_family_count", - 2024, - 841_208.67, - "COUNT", - "NATIONAL", - "HHS_ACF_TANF", - "ACF TANF Caseload Data", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["HHS_ACF_TANF"], - "target_cells": [ - { - "variable": "tanf", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "spm_unit_count", - "geo_level": "national", - "domain_variable": "tanf", - }, - ], - }, - ) - ) - - assert [target.metadata["target_id"] for target in target_set.targets] == [8, 9] - targets_by_id = { - target.metadata["target_id"]: target for target in target_set.targets - } - assert targets_by_id[8].measure == "tanf" - assert targets_by_id[8].entity is EntityType.SPM_UNIT - assert targets_by_id[9].measure is None - assert targets_by_id[9].metadata["variable"] == "spm_unit_count" - assert targets_by_id[9].entity is EntityType.SPM_UNIT - - -def test_arch_provider_maps_w2_tip_income_without_source_year_labor_force(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - _insert_w2_tip_income_target(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "variables": ["tip_income"], - }, - ) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.entity is EntityType.PERSON - assert target.measure == "tip_income" - assert target.aggregation is TargetAggregation.SUM - assert target.value == pytest.approx(121.0) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in 
target.filters - } == {("tip_income", ">", "0")} - assert target.metadata["arch_source_period"] == 2020 - assert target.metadata["arch_aging_count_method"] == "not_required" - assert target.metadata["arch_aging_amount_method"] == ( - "soi_total_agi_last_growth_extrapolation" - ) - - -def test_arch_provider_maps_ira_contribution_targets(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.executemany( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, ?, ?) - """, - [ - (12, "US taxpayers with traditional IRA contributions", "US", "trad_ira"), - (13, "US taxpayers with Roth IRA contributions", "US", "roth_ira"), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints ( - stratum_id, - variable, - operator, - value - ) VALUES (?, ?, ?, ?) - """, - [ - (12, "traditional_ira_contributions", ">", "0"), - (13, "roth_ira_contributions", ">", "0"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - ( - 12, - 12, - "traditional_ira_contributions", - 2022, - 50.0, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "IRA", - None, - None, - ), - ( - 13, - 13, - "roth_ira_contributions", - 2022, - 75.0, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "IRA", - None, - None, - ), - ], - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "traditional_ira_contributions", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "roth_ira_contributions", - "geo_level": "national", - "domain_variable": None, - }, - ], - }, - ) - ) - - assert {target.measure for target in target_set.targets} == { - "traditional_ira_contributions", - "roth_ira_contributions", - } - assert {target.entity for target in target_set.targets} == {EntityType.PERSON} - assert { - target.metadata["arch_aging_count_method"] for target in target_set.targets - } == {"not_required"} - - -def test_us_pipeline_can_select_arch_target_provider(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - arch_targets_db=str(db_path), - calibration_target_source="arch", - ) - ) - - provider, source = pipeline._resolve_calibration_target_provider() - - assert source == "arch" - assert isinstance(provider, ArchSQLiteTargetProvider) - - -def test_arch_target_profile_coverage_summarizes_custom_cells(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="national", - domain_variable=None, - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - 
geo_level="national", - domain_variable="tax_exempt_interest_income", - ), - PolicyEngineUSTargetCell( - "employment_income", - geo_level="national", - domain_variable="employment_income", - ), - ), - ) - - assert report.target_cell_count == 3 - assert report.covered_cell_count == 2 - assert report.uncovered_cell_count == 1 - assert report.coverage_rate == pytest.approx(2 / 3) - assert report.by_geo_level == { - "national": { - "target_cell_count": 3, - "covered_cell_count": 2, - "uncovered_cell_count": 1, - } - } - assert report.by_variable["adjusted_gross_income"]["covered_cell_count"] == 1 - assert report.by_variable["employment_income"]["uncovered_cell_count"] == 1 - - payload = report.to_dict() - assert payload["profile_name"] == "custom" - assert payload["cells"][0]["target_ids"] == [4] - assert payload["cells"][1]["target_ids"] == [1] - assert payload["cells"][2]["covered"] is False - - -def test_arch_target_profile_coverage_accepts_soi_itemized_domain(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - _insert_irs_soi_itemized_deduction_targets(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable="medical_expense_deduction", - ), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable=("medical_expense_deduction,tax_unit_itemizes"), - ), - PolicyEngineUSTargetCell( - "real_estate_taxes", - geo_level="national", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "salt", - geo_level="national", - domain_variable="salt,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable=("medical_expense_deduction,tax_unit_itemizes"), - ), - PolicyEngineUSTargetCell( - 
"tax_unit_count", - geo_level="national", - domain_variable="real_estate_taxes,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="salt,tax_unit_itemizes", - ), - ), - ) - - assert report.target_cell_count == 7 - assert report.covered_cell_count == 7 - assert { - (cell.cell["variable"], cell.cell["domain_variable"]): cell.target_ids - for cell in report.cells - } == { - ( - "medical_expense_deduction", - "medical_expense_deduction", - ): (12,), - ( - "medical_expense_deduction", - "medical_expense_deduction,tax_unit_itemizes", - ): (12,), - ( - "real_estate_taxes", - "real_estate_taxes,tax_unit_itemizes", - ): (13,), - ("salt", "salt,tax_unit_itemizes"): (14,), - ( - "tax_unit_count", - "medical_expense_deduction,tax_unit_itemizes", - ): (15,), - ( - "tax_unit_count", - "real_estate_taxes,tax_unit_itemizes", - ): (16,), - ("tax_unit_count", "salt,tax_unit_itemizes"): (17,), - } - - -def test_arch_target_profile_coverage_accepts_soi_medical_dental_domain( - tmp_path, -): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - conn = sqlite3.connect(db_path) - conn.execute( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - ( - 800, - 2, - "medical_dental_expense_amount", - 2023, - 100.0, - "AMOUNT", - None, - "IRS_SOI", - "SOI Historic Table 2 state broad totals", - None, - None, - ), - ) - conn.commit() - conn.close() - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable=( - "medical_expense_deduction,tax_unit_itemizes" - ), - ), - ), - ) - - assert report.covered_cell_count == 1 - assert report.cells[0].target_ids == (800,) - - -def test_arch_target_profile_coverage_rolls_complete_state_targets_to_national( - tmp_path, -): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - _insert_complete_state_rollup_targets(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "non_refundable_ctc", - geo_level="national", - domain_variable="non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "non_refundable_ctc", - geo_level="national", - domain_variable="adjusted_gross_income,non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,non_refundable_ctc", - ), - PolicyEngineUSTargetCell( - "qualified_business_income_deduction", - geo_level="national", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="qualified_business_income_deduction", - ), - PolicyEngineUSTargetCell( - "medical_expense_deduction", - geo_level="national", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - 
PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="medical_expense_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "aca_ptc", - geo_level="national", - domain_variable="aca_ptc", - ), - ), - ) - - assert report.target_cell_count == 9 - assert report.covered_cell_count == 9 - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "target_cells": [cell.cell for cell in report.cells], - }, - ) - ) - rollup_targets = { - ( - target.measure or target.metadata["variable"], - target.aggregation, - target.metadata["arch_variable"], - ): target - for target in target_set.targets - if target.metadata["geo_level"] == "national" - and str(target.metadata["target_id"]).startswith("-") - } - assert rollup_targets[ - ("non_refundable_ctc", TargetAggregation.SUM, "ctc_amount") - ].value == pytest.approx(sum(1_000.0 + index for index in range(51))) - assert rollup_targets[ - ("tax_unit_count", TargetAggregation.COUNT, "ctc_claims") - ].value == pytest.approx(sum(100.0 + index for index in range(51))) - assert rollup_targets[ - ( - "qualified_business_income_deduction", - TargetAggregation.SUM, - "qbi_amount", - ) - ].value == pytest.approx(sum(2_000.0 + index for index in range(51))) - assert rollup_targets[ - ("tax_unit_count", TargetAggregation.COUNT, "qbi_claims") - ].value == pytest.approx(sum(200.0 + index for index in range(51))) - assert rollup_targets[ - ("medical_expense_deduction", TargetAggregation.SUM, "medical_amount") - ].value == pytest.approx(sum(300.0 + index for index in range(51))) - assert rollup_targets[ - ("tax_unit_count", TargetAggregation.COUNT, "medical_claims") - ].value == pytest.approx(sum(30.0 + index for index in range(51))) - assert rollup_targets[ - ("aca_ptc", TargetAggregation.SUM, "aca_aptc_amount") - ].value == pytest.approx(sum(10_000.0 + index for index in range(51))) - - -def test_arch_target_profile_coverage_reports_current_pe_profile(tmp_path): - 
db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="pe_native_broad", - ) - - assert report.target_cell_count == 220 - assert report.covered_cell_count == 4 - assert report.uncovered_cell_count == 216 - assert report.by_geo_level["national"]["covered_cell_count"] == 3 - assert report.by_geo_level["state"]["covered_cell_count"] == 1 - - covered_cells = { - ( - cell.cell["variable"], - cell.cell["geo_level"], - cell.cell["domain_variable"], - ): cell.target_ids - for cell in report.cells - if cell.covered - } - assert covered_cells == { - ("adjusted_gross_income", "national", None): (4,), - ( - "tax_exempt_interest_income", - "national", - "tax_exempt_interest_income", - ): (2,), - ("tax_unit_count", "national", "tax_exempt_interest_income"): (1,), - ("tax_unit_count", "state", "adjusted_gross_income"): (7,), - } - - -def test_arch_target_gap_queue_describes_missing_loader_rows(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "employment_income", - geo_level="national", - domain_variable="employment_income", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="employment_income", - ), - ), - ) - - assert report.row_count == 2 - assert report.covered_row_count == 0 - assert report.uncovered_row_count == 2 - assert report.by_loader_status == {"missing_arch_target_record": 2} - assert report.by_gap_category == {"ready_primary_loader": 2} - - rows_by_variable = {row.variable: row for row in report.rows} - amount_row = rows_by_variable["employment_income"] - assert amount_row.priority == 1 - assert amount_row.expected_source == "IRS_SOI" - 
assert amount_row.expected_source_table == "IRS SOI Publication 1304 Table 1.4" - assert amount_row.expected_arch_variable == "wages_salaries_amount" - assert amount_row.expected_target_type == "AMOUNT" - assert amount_row.expected_entity == "person" - assert amount_row.expected_aggregation == "sum" - assert amount_row.gap_category == "ready_primary_loader" - assert amount_row.expected_filters == ( - { - "kind": "domain", - "feature": "employment_income", - "operator": ">", - "value": 0, - }, - ) - assert amount_row.agent_task_kind == "add_arch_source_loader_or_target_record" - - count_row = rows_by_variable["tax_unit_count"] - assert count_row.expected_arch_variable == "wages_salaries_returns" - assert count_row.expected_target_type == "COUNT" - assert count_row.expected_entity == "tax_unit" - assert count_row.expected_aggregation == "count" - assert count_row.gap_category == "ready_primary_loader" - - -def test_arch_target_gap_queue_points_full_population_amounts_to_bea(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "employment_income", - geo_level="national", - domain_variable=None, - ), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="national", - domain_variable=None, - ), - PolicyEngineUSTargetCell( - "dividend_income", - geo_level="national", - domain_variable=None, - ), - PolicyEngineUSTargetCell( - "self_employment_income", - geo_level="state", - domain_variable=None, - ), - ), - ) - - rows_by_cell = {(row.variable, row.geo_level): row for row in report.rows} - assert rows_by_cell[("employment_income", "national")].expected_source == "BEA" - assert rows_by_cell[("employment_income", "national")].expected_arch_variable == ( - "wages_salaries_amount" - ) - assert rows_by_cell[("employment_income", 
"national")].expected_source_table == ( - "BEA NIPA annual total wages and salaries" - ) - assert ( - rows_by_cell[("self_employment_income", "national")].expected_source - == "IRS_SOI" - ) - assert rows_by_cell[ - ("self_employment_income", "national") - ].expected_arch_variable == ("schedule_c_income_amount") - assert rows_by_cell[ - ("self_employment_income", "national") - ].expected_source_table == ("IRS SOI Publication 1304") - assert rows_by_cell[("dividend_income", "national")].expected_source == "BEA" - assert rows_by_cell[("dividend_income", "national")].expected_arch_variable == ( - "personal_dividend_income_amount" - ) - assert ( - rows_by_cell[("self_employment_income", "state")].expected_source == "IRS_SOI" - ) - assert rows_by_cell[("self_employment_income", "state")].expected_source_table == ( - "IRS SOI Publication 1304" - ) - - -def test_arch_target_gap_queue_marks_multi_domain_rows_for_review(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,medical_expense_deduction", - ), - ), - ) - - row = report.rows[0] - assert row.expected_source == "IRS_SOI" - assert row.expected_arch_variable is None - assert row.loader_status == "needs_source_mapping_review" - assert row.gap_category == "source_mapping_review" - assert row.agent_task_kind == "review_source_mapping" - assert "multi-domain cells" in row.notes - - -def test_arch_target_gap_queue_points_eitc_child_rows_to_soi_table_2(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - 
PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="eitc_child_count", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income,eitc,eitc_child_count", - ), - ), - ) - - assert {row.expected_arch_variable for row in report.rows} == {"eitc_claims"} - assert {row.expected_source_table for row in report.rows} == { - "IRS SOI Historic Table 2" - } - assert {row.expected_target_type for row in report.rows} == {"COUNT"} - assert {row.loader_status for row in report.rows} == {"missing_arch_target_record"} - assert {row.gap_category for row in report.rows} == {"ready_primary_loader"} - - -def test_arch_target_gap_queue_points_aca_ptc_counts_to_soi_table_2(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="used_aca_ptc", - ), - ), - ) - - row = report.rows[0] - assert row.expected_source == "IRS_SOI" - assert row.expected_source_table == "IRS SOI Historic Table 2" - assert row.expected_arch_variable == "aca_ptc_returns" - assert row.expected_target_type == "COUNT" - assert row.expected_entity == "tax_unit" - assert row.loader_status == "missing_arch_target_record" - assert row.gap_category == "ready_primary_loader" - - -def test_arch_target_gap_queue_points_income_tax_return_counts_to_soi_table_2( - tmp_path, -): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable=( - "adjusted_gross_income,income_tax_before_credits" - ), - 
), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable=( - "adjusted_gross_income,filing_status," - "income_tax_before_credits" - ), - ), - ), - ) - - assert {row.expected_source for row in report.rows} == {"IRS_SOI"} - assert {row.expected_source_table for row in report.rows} == { - "IRS SOI Historic Table 2" - } - assert {row.expected_arch_variable for row in report.rows} == { - "income_tax_before_credits_returns" - } - assert {row.expected_target_type for row in report.rows} == {"COUNT"} - assert {row.expected_entity for row in report.rows} == {"tax_unit"} - - -def test_arch_target_gap_queue_points_energy_subsidy_households_to_liheap( - tmp_path, -): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "household_count", - geo_level="national", - domain_variable="spm_unit_energy_subsidy_reported", - ), - ), - ) - - row = report.rows[0] - assert row.expected_source == "HHS_ACF_LIHEAP" - assert row.expected_source_table == "HHS ACF LIHEAP National Profile" - assert row.expected_arch_variable == "liheap_household_count" - assert row.expected_target_type == "COUNT" - assert row.expected_entity == "household" - assert row.loader_status == "missing_arch_target_record" - assert row.gap_category == "ready_primary_loader" - - -def test_arch_target_gap_queue_points_retirement_contributions_to_soi( - tmp_path, -): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "traditional_401k_contributions", - geo_level="national", - ), - PolicyEngineUSTargetCell("roth_401k_contributions", geo_level="national"), - 
PolicyEngineUSTargetCell( - "self_employed_pension_contribution_ald", - geo_level="national", - ), - ), - ) - - rows_by_variable = {row.variable: row for row in report.rows} - traditional = rows_by_variable["traditional_401k_contributions"] - roth = rows_by_variable["roth_401k_contributions"] - self_employed = rows_by_variable["self_employed_pension_contribution_ald"] - - assert {row.expected_source for row in report.rows} == {"IRS_SOI"} - assert traditional.expected_source_table == "IRS SOI Form W-2 Statistics Table 4.B" - assert traditional.expected_arch_variable == "traditional_401k_contributions" - assert traditional.expected_entity == "person" - assert roth.expected_source_table == "IRS SOI Form W-2 Statistics Table 4.B" - assert roth.expected_arch_variable == "roth_401k_contributions" - assert roth.expected_entity == "person" - assert self_employed.expected_source_table == ( - "IRS SOI Publication 1304 Table 1.4" - ) - assert self_employed.expected_arch_variable == ( - "self_employed_pension_contribution_ald" - ) - assert self_employed.expected_entity == "tax_unit" - assert {row.gap_category for row in report.rows} == {"ready_primary_loader"} - - -def test_arch_target_gap_queue_points_agi_person_counts_to_soi_table_2(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "person_count", - geo_level="state", - domain_variable="adjusted_gross_income", - ), - ), - ) - - row = report.rows[0] - assert row.expected_source == "IRS_SOI" - assert row.expected_source_table == "IRS SOI Historic Table 2" - assert row.expected_arch_variable == "tax_filer_individual_count" - assert row.expected_target_type == "COUNT" - assert row.expected_entity == "person" - assert row.loader_status == "missing_arch_target_record" - assert row.gap_category == 
"ready_primary_loader" - - -def test_arch_target_gap_queue_points_state_income_tax_to_census_stc(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=(PolicyEngineUSTargetCell("state_income_tax", geo_level="state"),), - ) - - row = report.rows[0] - assert row.expected_source == "CENSUS_STC" - assert row.expected_source_table == "Census State Tax Collections item T40" - assert row.expected_arch_variable == "state_individual_income_tax_collections" - assert row.expected_target_type == "AMOUNT" - assert row.expected_entity == "tax_unit" - assert row.loader_status == "missing_arch_target_record" - assert row.gap_category == "ready_primary_loader" - - -def test_arch_target_gap_queue_points_itemized_deductions_to_soi_table_2(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell("salt_deduction", geo_level="national"), - PolicyEngineUSTargetCell("interest_deduction", geo_level="national"), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="salt,tax_unit_itemizes", - ), - ), - ) - - rows_by_variable = {row.variable: row for row in report.rows} - rows_by_cell = {(row.variable, row.domain_variable): row for row in report.rows} - salt_row = rows_by_variable["salt_deduction"] - assert salt_row.expected_source == "IRS_SOI" - assert salt_row.expected_source_table == "IRS SOI Publication 1304 Table 2.1" - assert salt_row.expected_arch_variable == "limited_state_local_taxes_amount" - assert salt_row.expected_target_type == "AMOUNT" - assert salt_row.expected_entity == "tax_unit" - assert salt_row.loader_status == 
"missing_arch_target_record" - assert salt_row.gap_category == "ready_primary_loader" - - interest_row = rows_by_variable["interest_deduction"] - assert interest_row.expected_source == "IRS_SOI" - assert interest_row.expected_source_table == "IRS SOI Publication 1304 Table 2.1" - assert interest_row.expected_arch_variable == "interest_paid_deduction_amount" - assert interest_row.expected_target_type == "AMOUNT" - assert interest_row.expected_entity == "tax_unit" - assert interest_row.loader_status == "missing_arch_target_record" - assert interest_row.gap_category == "ready_primary_loader" - - salt_count_row = rows_by_cell[("tax_unit_count", "salt,tax_unit_itemizes")] - assert salt_count_row.expected_source == "IRS_SOI" - assert salt_count_row.expected_source_table == ( - "IRS SOI itemized deduction or credit tables" - ) - assert salt_count_row.expected_arch_variable == "salt_claims" - assert salt_count_row.expected_target_type == "COUNT" - assert salt_count_row.expected_entity == "tax_unit" - assert salt_count_row.loader_status == "missing_arch_target_record" - assert salt_count_row.gap_category == "ready_primary_loader" - - -def test_arch_target_gap_queue_points_income_tax_positive_to_soi_liability(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell("income_tax_positive", geo_level="national"), - ), - ) - - row = report.rows[0] - assert row.expected_source == "IRS_SOI" - assert row.expected_source_table == ( - "IRS SOI Publication 1304 Table 1.1 or Historic Table 2" - ) - assert row.expected_arch_variable == "income_tax_liability" - assert row.expected_target_type == "AMOUNT" - assert row.expected_entity == "tax_unit" - assert row.loader_status == "missing_arch_target_record" - assert row.gap_category == "ready_primary_loader" - - -def 
test_arch_target_gap_queue_deprioritizes_survey_or_model_inputs(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell("rent", geo_level="national"), - PolicyEngineUSTargetCell( - "person_count", - geo_level="national", - domain_variable="ssn_card_type", - ), - ), - ) - - assert report.by_gap_category == {"survey_or_model_input_deprioritized": 2} - assert {row.gap_category for row in report.rows} == { - "survey_or_model_input_deprioritized" - } - assert {row.agent_task_kind for row in report.rows} == { - "defer_or_review_non_primary_source" - } - assert all( - "survey/model-input proxy deprioritized" in row.notes for row in report.rows - ) - - -def test_arch_target_gap_queue_classifies_loaded_wrong_geography(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="adjusted_gross_income", - ), - ), - ) - - row = report.rows[0] - assert row.expected_source == "IRS_SOI" - assert row.expected_arch_variable == "tax_unit_count" - assert row.loader_status == "loaded_arch_variable_missing_geography" - assert row.gap_category == "ready_rollup_or_geography" - assert row.agent_task_kind == "add_arch_rollup_or_geography_records" - - -def test_arch_target_gap_queue_can_include_covered_rows(tmp_path): - db_path = tmp_path / "arch_targets.db" - _create_arch_targets_db(db_path) - - provider = ArchSQLiteTargetProvider(db_path) - report = summarize_arch_target_gap_queue( - provider, - period=2024, - profile_name="custom", - include_covered=True, - target_cells=( - 
PolicyEngineUSTargetCell( - "adjusted_gross_income", - geo_level="national", - domain_variable=None, - ), - ), - ) - - assert report.row_count == 1 - row = report.rows[0] - assert row.covered is True - assert row.target_ids == (4,) - assert row.expected_filters == () - assert row.loader_status == "covered" - assert row.gap_category == "covered" - assert row.agent_task_kind == "none" - - -def test_arch_target_gap_queue_cli_writes_csv(tmp_path): - db_path = tmp_path / "arch_targets.db" - output_path = tmp_path / "gaps.csv" - _create_arch_targets_db(db_path) - - exit_code = main_gaps( - [ - "--arch-targets-db", - str(db_path), - "--period", - "2024", - "--profile", - "pe_native_broad", - "--format", - "csv", - "--output", - str(output_path), - ] - ) - - assert exit_code == 0 - text = output_path.read_text() - assert text.startswith("priority,profile_name,period,variable") - assert "gap_category" in text - assert "employment_income" in text - assert "missing_arch_target_record" in text - - -def test_arch_target_refresh_cli_discovers_artifact_and_writes_snapshot(tmp_path): - artifact_root = tmp_path / "artifacts" - artifact_root.mkdir() - db_path = artifact_root / "arch_targets_fixture.db" - output_dir = tmp_path / "snapshot" - _create_arch_targets_db(db_path) - - exit_code = main_refresh( - [ - "--artifact-root", - str(artifact_root), - "--period", - "2024", - "--profile", - "pe_native_broad", - "--output-dir", - str(output_dir), - ] - ) - - assert exit_code == 0 - - coverage_path = output_dir / "pe_native_broad_2024_coverage.json" - gaps_json_path = output_dir / "pe_native_broad_2024_gaps.json" - gaps_csv_path = output_dir / "pe_native_broad_2024_gaps.csv" - summary_path = output_dir / "pe_native_broad_2024_summary.md" - - coverage = json.loads(coverage_path.read_text()) - gaps = json.loads(gaps_json_path.read_text()) - gaps_csv = gaps_csv_path.read_text() - summary = summary_path.read_text() - - assert coverage["target_cell_count"] == 220 - assert 
coverage["covered_cell_count"] == 4 - assert gaps["uncovered_row_count"] == 216 - assert gaps_csv.startswith("priority,profile_name,period,variable") - assert "Coverage rate" in summary - assert str(db_path.resolve()) in summary diff --git a/tests/targets/test_arch_facts.py b/tests/targets/test_arch_facts.py deleted file mode 100644 index 1b484b85..00000000 --- a/tests/targets/test_arch_facts.py +++ /dev/null @@ -1,4410 +0,0 @@ -from __future__ import annotations - -import json -import sqlite3 -from pathlib import Path -from typing import Any - -import pytest -from microplex.targets import TargetQuery - -import microplex_us.targets.arch as arch_module -from microplex_us.pipelines.us import USMicroplexBuildConfig, USMicroplexPipeline -from microplex_us.policyengine.target_profiles import PolicyEngineUSTargetCell -from microplex_us.targets import ( - ArchCompositeSQLiteTargetProvider, - ArchConsumerFactJSONLTargetProvider, - ArchFactSQLiteTargetProvider, - ArchSQLiteTargetProvider, - resolve_arch_sqlite_target_provider, - summarize_arch_target_gap_queue, - summarize_arch_target_profile_coverage, -) -from microplex_us.targets.arch import main_parity, main_smoke - - -def _create_value_constraint_target_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE strata ( - id INTEGER PRIMARY KEY, - name TEXT, - jurisdiction TEXT, - definition_hash TEXT - ); - - CREATE TABLE stratum_constraints ( - id INTEGER PRIMARY KEY, - stratum_id INTEGER NOT NULL, - variable TEXT NOT NULL, - operator TEXT NOT NULL, - value TEXT NOT NULL - ); - - CREATE TABLE targets ( - id INTEGER PRIMARY KEY, - stratum_id INTEGER NOT NULL, - variable TEXT NOT NULL, - period INTEGER NOT NULL, - value REAL NOT NULL, - target_type TEXT NOT NULL, - geographic_level TEXT, - source TEXT NOT NULL, - source_table TEXT, - source_url TEXT, - notes TEXT - ); - """ - ) - conn.executemany( - """ - INSERT INTO strata (id, name, jurisdiction, definition_hash) - VALUES (?, ?, 
?, ?) - """, - [ - (1, "US All Filers", "US", "all"), - (2, "US Filers AGI 1_to_5k", "US", "1_to_5k"), - ], - ) - conn.executemany( - """ - INSERT INTO stratum_constraints (stratum_id, variable, operator, value) - VALUES (?, ?, ?, ?) - """, - [ - (1, "is_tax_filer", "==", "1"), - (2, "is_tax_filer", "==", "1"), - (2, "adjusted_gross_income", ">=", "1"), - (2, "adjusted_gross_income", "<", "5000"), - ], - ) - conn.executemany( - """ - INSERT INTO targets ( - id, - stratum_id, - variable, - period, - value, - target_type, - geographic_level, - source, - source_table, - source_url, - notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - 1, - 1, - "tax_unit_count", - 2023, - 160_602_107, - "COUNT", - "NATIONAL", - "IRS_SOI", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - None, - ), - ( - 2, - 1, - "adjusted_gross_income", - 2023, - 15_286_017_359_000, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - None, - ), - ( - 3, - 1, - "income_tax_liability", - 2023, - 2_147_909_818_000, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - None, - ), - ( - 4, - 2, - "tax_unit_count", - 2023, - 7_357_751, - "COUNT", - "NATIONAL", - "IRS_SOI", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - None, - ), - ( - 5, - 2, - "adjusted_gross_income", - 2023, - 20_372_694_000, - "AMOUNT", - "NATIONAL", - "IRS_SOI", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - None, - ), - ], - ) - conn.commit() - conn.close() - - -def _create_arch_fact_db(path: Path) -> None: - conn = sqlite3.connect(path) - conn.executescript( - """ - CREATE TABLE aggregate_facts ( - fact_key TEXT PRIMARY KEY, - source_record_id TEXT, - value_numeric REAL, - value_text TEXT, - value_json TEXT NOT NULL, - period_value TEXT NOT NULL, - geography_level TEXT NOT 
NULL, - geography_id TEXT NOT NULL, - geography_name TEXT, - measure_concept TEXT NOT NULL, - measure_source_concept TEXT, - measure_concept_relation TEXT, - measure_concept_authority TEXT, - measure_concept_evidence_url TEXT, - measure_concept_evidence_notes TEXT, - measure_legal_vintage TEXT, - measure_unit TEXT NOT NULL, - aggregation_method TEXT NOT NULL, - domain TEXT NOT NULL, - filters_json TEXT NOT NULL, - label TEXT, - source_name TEXT, - source_table TEXT, - source_url TEXT, - source_method_notes TEXT - ); - - CREATE TABLE aggregate_constraints ( - fact_key TEXT NOT NULL, - ordinal INTEGER NOT NULL, - variable TEXT NOT NULL, - operator TEXT NOT NULL, - value_text TEXT, - value_numeric REAL, - value_json TEXT NOT NULL, - unit TEXT, - role TEXT NOT NULL, - label TEXT, - PRIMARY KEY (fact_key, ordinal) - ); - - CREATE TABLE fact_source_cells ( - fact_key TEXT NOT NULL, - source_cell_key TEXT NOT NULL, - ordinal INTEGER NOT NULL, - PRIMARY KEY (fact_key, source_cell_key) - ); - - CREATE TABLE fact_source_rows ( - fact_key TEXT NOT NULL, - source_row_key TEXT NOT NULL, - ordinal INTEGER NOT NULL, - PRIMARY KEY (fact_key, source_row_key) - ); - """ - ) - - def fact( - key: str, - *, - concept: str, - value: float, - aggregation: str, - income_range: str, - unit: str, - source_concept: str | None = None, - ) -> tuple[Any, ...]: - return ( - key, - f"irs_soi.ty2023.table_1_1.{income_range}.{concept.rsplit('.', 1)[-1]}", - value, - str(int(value)) if float(value).is_integer() else str(value), - json.dumps(value), - "2023", - "country", - "0100000US", - "United States", - concept, - source_concept, - "exact" if source_concept else None, - "arch-us" if source_concept else None, - "https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim)" - if source_concept - else None, - "IRS SOI Table 1.1 reports adjusted gross income.", - "tax_year_2023" if source_concept else None, - unit, - aggregation, - "all_individual_income_tax_returns", - 
json.dumps({"filing_status": "all", "income_range": income_range}), - f"{income_range} {concept}", - "irs_soi", - "Publication 1304 Table 1.1", - "https://www.irs.gov/pub/irs-soi/23in11si.xls", - "Source-package aggregate fact fixture.", - ) - - conn.executemany( - """ - INSERT INTO aggregate_facts ( - fact_key, - source_record_id, - value_numeric, - value_text, - value_json, - period_value, - geography_level, - geography_id, - geography_name, - measure_concept, - measure_source_concept, - measure_concept_relation, - measure_concept_authority, - measure_concept_evidence_url, - measure_concept_evidence_notes, - measure_legal_vintage, - measure_unit, - aggregation_method, - domain, - filters_json, - label, - source_name, - source_table, - source_url, - source_method_notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - fact( - "arch.fact.v1:all-count", - concept="irs_soi.individual_income_tax_returns", - value=160_602_107, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:all-agi", - concept="us:statutes/26/62#adjusted_gross_income", - source_concept="irs_soi.adjusted_gross_income", - value=15_286_017_359_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:all-tax", - concept="irs_soi.total_income_tax", - value=2_147_909_818_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:1-to-5k-count", - concept="irs_soi.individual_income_tax_returns", - value=7_357_751, - aggregation="count", - income_range="1_to_5k", - unit="count", - ), - fact( - "arch.fact.v1:1-to-5k-agi", - concept="us:statutes/26/62#adjusted_gross_income", - source_concept="irs_soi.adjusted_gross_income", - value=20_372_694_000, - aggregation="sum", - income_range="1_to_5k", - unit="usd", - ), - ], - ) - conn.executemany( - """ - INSERT INTO aggregate_constraints ( - fact_key, - ordinal, - variable, - operator, - value_text, - 
value_numeric, - value_json, - unit, - role, - label - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - key, - ordinal, - "us:statutes/26/62#adjusted_gross_income", - operator, - str(value), - float(value), - json.dumps(value), - "usd", - "filter", - "Adjusted gross income bound", - ) - for key in ("arch.fact.v1:1-to-5k-count", "arch.fact.v1:1-to-5k-agi") - for ordinal, operator, value in ((0, ">=", 1), (1, "<", 5000)) - ], - ) - conn.executemany( - """ - INSERT INTO fact_source_cells (fact_key, source_cell_key, ordinal) - VALUES (?, ?, ?) - """, - [ - ("arch.fact.v1:all-agi", "arch.source_cell.v1:agi", 0), - ("arch.fact.v1:all-count", "arch.source_cell.v1:count", 0), - ], - ) - conn.execute( - """ - INSERT INTO fact_source_rows (fact_key, source_row_key, ordinal) - VALUES (?, ?, ?) - """, - ("arch.fact.v1:all-agi", "arch.source_row.v1:all", 0), - ) - conn.commit() - conn.close() - - -def _insert_arch_table_1_1_reference_totals( - path: Path, - *, - year: int, - return_count: float, - adjusted_gross_income: float, -) -> None: - conn = sqlite3.connect(path) - - def fact( - key: str, - *, - concept: str, - value: float, - aggregation: str, - unit: str, - source_concept: str | None = None, - ) -> tuple[Any, ...]: - return ( - key, - f"irs_soi.ty{year}.table_1_1.all.{concept.rsplit('.', 1)[-1]}", - value, - str(int(value)) if float(value).is_integer() else str(value), - json.dumps(value), - str(year), - "country", - "0100000US", - "United States", - concept, - source_concept, - "exact" if source_concept else None, - "arch-us" if source_concept else None, - "https://uscode.house.gov/view.xhtml?req=(title:26%20section:62%20edition:prelim)" - if source_concept - else None, - "IRS SOI Table 1.1 reports adjusted gross income.", - f"tax_year_{year}" if source_concept else None, - unit, - aggregation, - "all_individual_income_tax_returns", - json.dumps({"filing_status": "all", "income_range": "all"}), - f"{year} all {concept}", - "irs_soi", - "Publication 1304 Table 
1.1", - f"https://www.irs.gov/pub/irs-soi/{str(year)[-2:]}in11si.xls", - "Source-package aggregate fact aging reference fixture.", - ) - - conn.executemany( - """ - INSERT INTO aggregate_facts ( - fact_key, - source_record_id, - value_numeric, - value_text, - value_json, - period_value, - geography_level, - geography_id, - geography_name, - measure_concept, - measure_source_concept, - measure_concept_relation, - measure_concept_authority, - measure_concept_evidence_url, - measure_concept_evidence_notes, - measure_legal_vintage, - measure_unit, - aggregation_method, - domain, - filters_json, - label, - source_name, - source_table, - source_url, - source_method_notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - fact( - f"arch.fact.v1:{year}-all-count", - concept="irs_soi.individual_income_tax_returns", - value=return_count, - aggregation="count", - unit="count", - ), - fact( - f"arch.fact.v1:{year}-all-agi", - concept="us:statutes/26/62#adjusted_gross_income", - source_concept="irs_soi.adjusted_gross_income", - value=adjusted_gross_income, - aggregation="sum", - unit="usd", - ), - ], - ) - conn.commit() - conn.close() - - -def _insert_arch_table_1_4_facts(path: Path) -> None: - conn = sqlite3.connect(path) - - def fact( - key: str, - *, - concept: str, - value: float, - aggregation: str, - income_range: str, - unit: str, - source_concept: str | None = None, - concept_relation: str | None = None, - ) -> tuple[Any, ...]: - slug = concept.split("#")[-1].rsplit(".", 1)[-1].replace(":", "_") - return ( - key, - f"irs_soi.ty2023.table_1_4.{income_range}.{slug}", - value, - str(int(value)) if float(value).is_integer() else str(value), - json.dumps(value), - "2023", - "country", - "0100000US", - "United States", - concept, - source_concept, - concept_relation, - "arch-us" if concept_relation else None, - 
"https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-complete-report-publication-1304-basic-tables-part-1" - if concept_relation - else None, - "SOI Table 1.4 source concept alignment fixture." - if concept_relation - else None, - "tax_year_2023" if concept_relation else None, - unit, - aggregation, - "all_individual_income_tax_returns", - json.dumps({"filing_status": "all", "income_range": income_range}), - f"{income_range} {concept}", - "irs_soi", - "Publication 1304 Table 1.4", - "https://www.irs.gov/pub/irs-soi/23in14ar.xls", - "Source-package aggregate fact fixture.", - ) - - conn.executemany( - """ - INSERT INTO aggregate_facts ( - fact_key, - source_record_id, - value_numeric, - value_text, - value_json, - period_value, - geography_level, - geography_id, - geography_name, - measure_concept, - measure_source_concept, - measure_concept_relation, - measure_concept_authority, - measure_concept_evidence_url, - measure_concept_evidence_notes, - measure_legal_vintage, - measure_unit, - aggregation_method, - domain, - filters_json, - label, - source_name, - source_table, - source_url, - source_method_notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - fact( - "arch.fact.v1:t14-all-wages-returns", - concept="irs_soi.returns_with_total_wages", - value=132_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-wages-amount", - concept="us:statutes/26/62#input.wages", - source_concept="irs_soi.total_wages", - concept_relation="broad_match", - value=10_500_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:t14-all-capital-gains-returns", - concept="irs_soi.returns_with_taxable_net_capital_gains", - value=27_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-capital-gains-amount", - concept="irs_soi.taxable_net_capital_gains", - value=1_100_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:t14-all-ira-returns", - concept="irs_soi.returns_with_taxable_ira_distributions", - value=18_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-ira-amount", - concept="irs_soi.taxable_ira_distributions", - value=420_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:t14-all-pension-returns", - concept="irs_soi.returns_with_taxable_pension_income", - value=30_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-pension-amount", - concept="irs_soi.taxable_pension_income", - value=740_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:t14-all-uc-returns", - concept="irs_soi.returns_with_unemployment_compensation", - value=7_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-uc-amount", - concept="irs_soi.unemployment_compensation", - value=62_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - 
"arch.fact.v1:t14-all-taxable-ss-returns", - concept="irs_soi.returns_with_taxable_social_security_benefits", - value=29_000_000, - aggregation="count", - income_range="all", - unit="count", - ), - fact( - "arch.fact.v1:t14-all-taxable-ss-amount", - concept="irs_soi.taxable_social_security_benefits", - value=510_000_000_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - fact( - "arch.fact.v1:t14-1-to-5k-wages-amount", - concept="us:statutes/26/62#input.wages", - source_concept="irs_soi.total_wages", - concept_relation="broad_match", - value=4_200_000_000, - aggregation="sum", - income_range="1_to_5k", - unit="usd", - ), - ], - ) - conn.executemany( - """ - INSERT INTO aggregate_constraints ( - fact_key, - ordinal, - variable, - operator, - value_text, - value_numeric, - value_json, - unit, - role, - label - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - [ - ( - "arch.fact.v1:t14-1-to-5k-wages-amount", - ordinal, - "us:statutes/26/62#adjusted_gross_income", - operator, - str(value), - float(value), - json.dumps(value), - "usd", - "filter", - "Adjusted gross income bound", - ) - for ordinal, operator, value in ((0, ">=", 1), (1, "<", 5000)) - ], - ) - conn.executemany( - """ - INSERT INTO fact_source_cells (fact_key, source_cell_key, ordinal) - VALUES (?, ?, ?) - """, - [ - ( - "arch.fact.v1:t14-all-wages-amount", - "arch.source_cell.v1:t14-wages-amount", - 0, - ), - ( - "arch.fact.v1:t14-all-wages-returns", - "arch.source_cell.v1:t14-wages-returns", - 0, - ), - ], - ) - conn.execute( - """ - INSERT INTO fact_source_rows (fact_key, source_row_key, ordinal) - VALUES (?, ?, ?) 
- """, - ( - "arch.fact.v1:t14-all-wages-amount", - "arch.source_row.v1:t14-all", - 0, - ), - ) - conn.commit() - conn.close() - - -def _write_consumer_fact_jsonl(path: Path) -> None: - def row( - key: str, - *, - semantic_key: str, - concept: str, - value: float, - aggregation: str, - income_range: str, - unit: str, - source_concept: str | None = None, - ) -> dict[str, Any]: - observed_concept = source_concept or concept - constraints = [] - if income_range == "1_to_5k": - constraints = [ - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": ">=", - "value": 1, - "unit": "usd", - "role": "filter", - }, - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": "<", - "value": 5000, - "unit": "usd", - "role": "filter", - }, - ] - payload: dict[str, Any] = { - "schema_version": "arch.consumer_fact.v1", - "aggregate_fact_key": key, - "semantic_fact_key": semantic_key, - "legacy_fact_key": key.replace("aggregate_fact.v2", "fact.v1"), - "value": value, - "value_type": "integer", - "period": {"type": "tax_year", "value": 2023}, - "geography": { - "id": "0100000US", - "level": "country", - "vintage": "2020_census", - }, - "entity": {"name": "tax_unit", "role": "filing_unit"}, - "aggregation": {"method": aggregation}, - "observed_measure": { - "source_concept": observed_concept, - "source_measure_id": observed_concept.rsplit(".", 1)[-1], - "source_name": "irs_soi", - "source_table": "Publication 1304 Table 1.1", - "unit": unit, - }, - "dimensions": {"filing_status": "all", "income_range": income_range}, - "universe_constraints": { - "domain": "all_individual_income_tax_returns", - "constraints": constraints, - }, - "source": { - "source_name": "irs_soi", - "source_table": "Publication 1304 Table 1.1", - "url": "https://www.irs.gov/pub/irs-soi/23in11si.xls", - "method_notes": "Consumer-contract fact fixture.", - }, - "lineage": { - "source_record_id": ( - f"irs_soi.ty2023.table_1_1.{income_range}." 
- f"{observed_concept.rsplit('.', 1)[-1]}" - ), - "source_cell_keys": [f"arch.source_cell.v1:{key.rsplit(':', 1)[-1]}"], - "source_row_keys": [f"arch.source_row.v1:{income_range}"], - }, - "label": f"{income_range} {concept}", - } - if source_concept is not None: - payload["concept_alignment"] = { - "concept_alignment_key": "arch.concept_alignment.v2:agi", - "source_concept": source_concept, - "canonical_concept": concept, - "relation": "exact", - "authority": "arch-us", - "evidence_url": ( - "https://uscode.house.gov/view.xhtml?" - "req=(title:26%20section:62%20edition:prelim)" - ), - "evidence_notes": "IRS SOI Table 1.1 reports adjusted gross income.", - "legal_vintage": "tax_year_2023", - } - return payload - - rows = [ - row( - "arch.aggregate_fact.v2:all-count", - semantic_key="arch.semantic_fact.v2:all-count", - concept="irs_soi.individual_income_tax_returns", - value=160_602_107, - aggregation="count", - income_range="all", - unit="count", - ), - row( - "arch.aggregate_fact.v2:all-agi", - semantic_key="arch.semantic_fact.v2:all-agi", - concept="us:statutes/26/62#adjusted_gross_income", - source_concept="irs_soi.adjusted_gross_income", - value=15_286_017_359_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - row( - "arch.aggregate_fact.v2:all-tax", - semantic_key="arch.semantic_fact.v2:all-tax", - concept="irs_soi.total_income_tax", - value=2_147_909_818_000, - aggregation="sum", - income_range="all", - unit="usd", - ), - row( - "arch.aggregate_fact.v2:1-to-5k-count", - semantic_key="arch.semantic_fact.v2:1-to-5k-count", - concept="irs_soi.individual_income_tax_returns", - value=7_357_751, - aggregation="count", - income_range="1_to_5k", - unit="count", - ), - row( - "arch.aggregate_fact.v2:1-to-5k-agi", - semantic_key="arch.semantic_fact.v2:1-to-5k-agi", - concept="us:statutes/26/62#adjusted_gross_income", - source_concept="irs_soi.adjusted_gross_income", - value=20_372_694_000, - aggregation="sum", - income_range="1_to_5k", - unit="usd", - 
), - ] - path.write_text("\n".join(json.dumps(item, sort_keys=True) for item in rows) + "\n") - - -def _consumer_fact( - key: str, - *, - concept: str, - domain: str, - source_name: str, - source_table: str, - value: float, - period: dict[str, Any] | None = None, - geography: dict[str, Any] | None = None, - constraints: tuple[dict[str, Any], ...] = (), - unit: str = "count", -) -> dict[str, Any]: - return { - "schema_version": "arch.consumer_fact.v1", - "aggregate_fact_key": f"arch.aggregate_fact.v2:{key}", - "semantic_fact_key": f"arch.semantic_fact.v2:{key}", - "value": value, - "period": period or {"type": "calendar_year", "value": 2024}, - "geography": geography - or {"level": "country", "id": "0100000US", "name": "United States"}, - "observed_measure": { - "source_concept": concept, - "source_measure_id": concept.rsplit(".", 1)[-1], - "source_name": source_name, - "source_table": source_table, - "unit": unit, - }, - "universe_constraints": { - "domain": domain, - "constraints": list(constraints), - }, - "source": { - "source_name": source_name, - "source_table": source_table, - "url": f"https://example.test/{key}", - "method_notes": "US admin source-family fixture.", - }, - "lineage": { - "source_record_id": f"{source_name}.{key}", - "source_cell_keys": [f"arch.source_cell.v1:{key}"], - "source_row_keys": [f"arch.source_row.v1:{key}"], - }, - "label": key, - } - - -def _target_filter_tuples(target: Any) -> set[tuple[str, str, str]]: - return { - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in target.filters - } - - -def _normalize_target_behavior(target_set) -> list[tuple[Any, ...]]: - rows = [] - for target in target_set.targets: - filters = tuple( - sorted( - ( - str(target_filter.feature), - str( - getattr(target_filter.operator, "value", target_filter.operator) - ), - str(target_filter.value), - ) - for target_filter in target.filters - ) - ) - 
rows.append( - ( - str(target.entity.value), - str(getattr(target.aggregation, "value", target.aggregation)), - target.measure, - round(float(target.value), 6), - int(target.period), - str(target.source), - target.metadata["variable"], - target.metadata["geo_level"], - filters, - ) - ) - return sorted(rows) - - -def test_arch_fact_provider_matches_value_constraint_soi_targets( - tmp_path: Path, -) -> None: - value_db = tmp_path / "value_targets.db" - fact_db = tmp_path / "arch_facts.db" - _create_value_constraint_target_db(value_db) - _create_arch_fact_db(fact_db) - - query = TargetQuery(period=2023) - value_targets = ArchSQLiteTargetProvider(value_db).load_target_set(query) - fact_targets = ArchFactSQLiteTargetProvider(fact_db).load_target_set(query) - - assert _normalize_target_behavior(fact_targets) == _normalize_target_behavior( - value_targets - ) - - -def test_arch_fact_provider_skips_unsupported_source_package_facts( - tmp_path: Path, -) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - conn = sqlite3.connect(fact_db) - conn.execute( - """ - INSERT INTO aggregate_facts ( - fact_key, - source_record_id, - value_numeric, - value_text, - value_json, - period_value, - geography_level, - geography_id, - geography_name, - measure_concept, - measure_source_concept, - measure_concept_relation, - measure_concept_authority, - measure_concept_evidence_url, - measure_concept_evidence_notes, - measure_legal_vintage, - measure_unit, - aggregation_method, - domain, - filters_json, - label, - source_name, - source_table, - source_url, - source_method_notes - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, - ( - "arch.fact.v1:bea-defined-contribution-pensions", - "bea_nipa.nipa.7.20.line_5", - 274_439_000_000, - "274439000000", - json.dumps(274_439_000_000), - "2023", - "country", - "0100000US", - "United States", - "bea_nipa.defined_contribution_employer_contributions", - "bea_nipa.W351RC", - "source_label", - "bea", - "https://apps.bea.gov/iTable/", - "BEA NIPA pension contribution source-package fixture.", - None, - "usd", - "sum", - "defined_contribution_pension_plans", - json.dumps({}), - "Defined contribution employer contributions", - "bea_nipa", - "NIPA Table 7.20", - "https://apps.bea.gov/iTable/", - "Unsupported source package fact fixture.", - ), - ) - conn.commit() - conn.close() - - target_set = ArchFactSQLiteTargetProvider(fact_db).load_target_set( - TargetQuery(period=2023) - ) - - assert target_set.targets - assert all( - target.metadata.get("arch_concept") - != "bea_nipa.defined_contribution_employer_contributions" - for target in target_set.targets - ) - - -def test_arch_consumer_fact_jsonl_provider_matches_value_constraint_soi_targets( - tmp_path: Path, -) -> None: - value_db = tmp_path / "value_targets.db" - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _create_value_constraint_target_db(value_db) - _write_consumer_fact_jsonl(consumer_jsonl) - - query = TargetQuery(period=2023) - value_targets = ArchSQLiteTargetProvider(value_db).load_target_set(query) - consumer_targets = ArchConsumerFactJSONLTargetProvider( - consumer_jsonl - ).load_target_set(query) - - assert _normalize_target_behavior(consumer_targets) == _normalize_target_behavior( - value_targets - ) - - -def test_arch_consumer_fact_jsonl_provider_skips_unsupported_source_package_facts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "soi-wages", - concept="irs_soi.total_wages", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.1", - period={"type": 
"tax_year", "value": 2024}, - value=10_000_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-defined-contribution-pensions", - concept="bea_nipa.defined_contribution_employer_contributions", - domain="defined_contribution_pension_plans", - source_name="bea_nipa", - source_table="NIPA Table 7.20", - period={"type": "calendar_year", "value": 2024}, - value=274_439_000_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.metadata["arch_variable"] == "wages_salaries_amount" - assert target.measure == "employment_income" - assert target.metadata["arch_concept"] == "irs_soi.total_wages" - - -def test_arch_fact_provider_preserves_fact_provenance(tmp_path: Path) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - - target_set = ArchFactSQLiteTargetProvider(fact_db).load_target_set( - TargetQuery( - period=2023, - provider_filters={ - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "national", - "domain_variable": None, - } - ] - }, - ) - ) - - all_agi = next( - target - for target in target_set.targets - if target.metadata["arch_aggregate_fact_key"] == "arch.fact.v1:all-agi" - ) - assert all_agi.metadata["arch_semantic_fact_key"].startswith( - "arch.semantic_fact.v1|us:statutes/26/62#adjusted_gross_income" - ) - assert all_agi.metadata["arch_source_record_id"].startswith( - "irs_soi.ty2023.table_1_1.all" - ) - assert all_agi.metadata["arch_source_cell_keys"] == ["arch.source_cell.v1:agi"] - assert all_agi.metadata["arch_source_row_keys"] == ["arch.source_row.v1:all"] - assert all_agi.metadata["arch_source_concept"] == "irs_soi.adjusted_gross_income" - assert all_agi.metadata["arch_concept_relation"] == "exact" - assert 
all_agi.metadata["unit"] == "usd" - - -def test_arch_consumer_fact_jsonl_provider_preserves_contract_keys( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2023) - ) - - all_agi = next( - target - for target in target_set.targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.aggregate_fact.v2:all-agi" - ) - assert all_agi.metadata["arch_semantic_fact_key"] == "arch.semantic_fact.v2:all-agi" - assert all_agi.metadata["arch_source_record_id"].startswith( - "irs_soi.ty2023.table_1_1.all" - ) - assert all_agi.metadata["arch_source_cell_keys"] == ["arch.source_cell.v1:all-agi"] - assert all_agi.metadata["arch_source_row_keys"] == ["arch.source_row.v1:all"] - assert all_agi.metadata["arch_source_concept"] == "irs_soi.adjusted_gross_income" - assert all_agi.metadata["arch_concept_relation"] == "exact" - assert all_agi.metadata["unit"] == "usd" - - -def test_arch_consumer_fact_jsonl_provider_maps_income_tax_after_credits_returns( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - rows = [json.loads(line) for line in consumer_jsonl.read_text().splitlines()] - row = json.loads(json.dumps(rows[0])) - row["aggregate_fact_key"] = "arch.aggregate_fact.v2:all-income-tax-returns" - row["semantic_fact_key"] = "arch.semantic_fact.v2:all-income-tax-returns" - row["legacy_fact_key"] = "arch.fact.v1:all-income-tax-returns" - row["value"] = 111_545_061 - row["observed_measure"] = { - **row["observed_measure"], - "source_concept": "irs_soi.returns_with_income_tax_after_credits", - "source_measure_id": "income_tax_after_credits_returns", - "unit": "count", - } - row["lineage"]["source_record_id"] = ( - "irs_soi.ty2023.table_1_1.all.income_tax_after_credits_returns" - ) - consumer_jsonl.write_text(json.dumps(row, 
sort_keys=True) + "\n") - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2023) - ) - target = target_set.targets[0] - filters = { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in target.filters - } - - assert target.metadata["arch_variable"] == "income_tax_liability_returns" - assert target.metadata["variable"] == "tax_unit_count" - assert target.aggregation.value == "count" - assert filters == { - ("income_tax", ">", "0"), - ("tax_unit_is_filer", "==", "1"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_tax_exempt_interest( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - template = json.loads(consumer_jsonl.read_text().splitlines()[0]) - rows = [] - for suffix, concept, measure_id, value, unit in ( - ( - "qualified-dividends-returns", - "irs_soi.returns_with_qualified_dividends", - "qualified_dividends_returns", - 38_000_000, - "count", - ), - ( - "qualified-dividends-amount", - "irs_soi.qualified_dividends", - "qualified_dividends_amount", - 350_000_000_000, - "usd", - ), - ( - "returns", - "irs_soi.returns_with_tax_exempt_interest", - "tax_exempt_interest_returns", - 6_837_120, - "count", - ), - ( - "amount", - "irs_soi.tax_exempt_interest", - "tax_exempt_interest_amount", - 89_000_000_000, - "usd", - ), - ): - row = json.loads(json.dumps(template)) - row["aggregate_fact_key"] = f"arch.aggregate_fact.v2:tax-exempt-{suffix}" - row["semantic_fact_key"] = f"arch.semantic_fact.v2:tax-exempt-{suffix}" - row["legacy_fact_key"] = f"arch.fact.v1:tax-exempt-{suffix}" - row["period"] = {"type": "tax_year", "value": 2022} - row["value"] = value - row["source"] = {**row["source"], "source_table": "Historic Table 2"} - row["observed_measure"] = { - **row["observed_measure"], - "source_concept": concept, - 
"source_measure_id": measure_id, - "source_table": "Historic Table 2", - "unit": unit, - } - row["aggregation"] = {"method": "count" if unit == "count" else "sum"} - row["lineage"]["source_record_id"] = ( - f"irs_soi.ty2022.historic_table_2.us.all.{measure_id}" - ) - rows.append(row) - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - returns = targets_by_arch_variable["tax_exempt_interest_returns"] - amount = targets_by_arch_variable["tax_exempt_interest_amount"] - qualified_returns = targets_by_arch_variable["qualified_dividends_returns"] - qualified_amount = targets_by_arch_variable["qualified_dividends_amount"] - - assert returns.metadata["variable"] == "tax_unit_count" - assert returns.aggregation.value == "count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in returns.filters - } == { - ("tax_exempt_interest_income", ">", "0"), - ("tax_unit_is_filer", "==", "1"), - } - assert amount.metadata["variable"] == "tax_exempt_interest_income" - assert amount.measure == "tax_exempt_interest_income" - assert qualified_returns.metadata["variable"] == "tax_unit_count" - assert qualified_returns.aggregation.value == "count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in qualified_returns.filters - } == { - ("qualified_dividend_income", ">", "0"), - ("tax_unit_is_filer", "==", "1"), - } - assert qualified_amount.metadata["variable"] == "qualified_dividend_income" - assert qualified_amount.measure == "qualified_dividend_income" - - -def 
test_arch_consumer_fact_jsonl_provider_maps_schedule_c_self_employment( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "soi-schedule-c-returns", - concept="irs_soi.returns_with_schedule_c_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2", - period={"type": "tax_year", "value": 2022}, - value=28_000_000, - unit="count", - ), - _consumer_fact( - "soi-schedule-c-income", - concept="irs_soi.schedule_c_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2", - period={"type": "tax_year", "value": 2022}, - value=512_000_000_000, - unit="usd", - ), - _consumer_fact( - "soi-partnership-scorp-returns", - concept="irs_soi.returns_with_partnership_scorp_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2", - period={"type": "tax_year", "value": 2022}, - value=12_000_000, - unit="count", - ), - _consumer_fact( - "soi-partnership-scorp-income", - concept="irs_soi.partnership_scorp_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2", - period={"type": "tax_year", "value": 2022}, - value=1_200_000_000_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - returns = targets_by_arch_variable["schedule_c_income_returns"] - amount = targets_by_arch_variable["schedule_c_income_amount"] - partnership_returns = targets_by_arch_variable["partnership_scorp_income_returns"] - partnership_amount = targets_by_arch_variable["partnership_scorp_income_amount"] - - assert 
returns.metadata["variable"] == "tax_unit_count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in returns.filters - } == { - ("self_employment_income", ">", "0"), - ("tax_unit_is_filer", "==", "1"), - } - assert amount.metadata["variable"] == "self_employment_income" - assert amount.measure == "self_employment_income" - assert ( - partnership_returns.metadata["variable"] == "tax_unit_count" - ) - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in partnership_returns.filters - } == { - ("tax_unit_is_filer", "==", "1"), - ("tax_unit_partnership_s_corp_income", ">", "0"), - } - assert ( - partnership_amount.metadata["variable"] - == "tax_unit_partnership_s_corp_income" - ) - assert partnership_amount.measure == "tax_unit_partnership_s_corp_income" - - -def test_arch_consumer_fact_jsonl_provider_maps_historic_table_2_concepts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - template = json.loads(consumer_jsonl.read_text().splitlines()[0]) - rows = [] - for index, (concept, measure_id, value) in enumerate( - ( - ( - "irs_soi.returns_with_premium_tax_credit", - "premium_tax_credit_returns", - 7_841_370, - ), - ("irs_soi.earned_income_credit", "eitc_amount", 59_204_588_000), - ( - "irs_soi.tax_filer_individuals", - "tax_filer_individual_count", - 293_617_150, - ), - ), - start=1, - ): - row = json.loads(json.dumps(template)) - row["aggregate_fact_key"] = f"arch.aggregate_fact.v2:historic-table-2-{index}" - row["semantic_fact_key"] = f"arch.semantic_fact.v2:historic-table-2-{index}" - row["legacy_fact_key"] = f"arch.fact.v1:historic-table-2-{index}" - row["period"] = {"type": "tax_year", "value": 2022} - row["value"] = value - row["source"] = {**row["source"], 
"source_table": "Historic Table 2"} - row["observed_measure"] = { - **row["observed_measure"], - "source_concept": concept, - "source_measure_id": measure_id, - "source_table": "Historic Table 2", - "unit": "usd" if concept == "irs_soi.earned_income_credit" else "count", - } - row["aggregation"] = { - "method": "sum" if concept == "irs_soi.earned_income_credit" else "count" - } - row["lineage"]["source_record_id"] = ( - f"irs_soi.ty2022.historic_table_2.us.all.{measure_id}" - ) - rows.append(row) - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - premium_tax_credit = targets_by_arch_variable["aca_ptc_returns"] - assert premium_tax_credit.metadata["variable"] == "tax_unit_count" - assert premium_tax_credit.aggregation.value == "count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in premium_tax_credit.filters - } == { - ("aca_ptc", ">", "0"), - ("tax_unit_is_filer", "==", "1"), - } - - eitc = targets_by_arch_variable["eitc_amount"] - assert eitc.metadata["variable"] == "eitc" - assert eitc.measure == "eitc" - assert eitc.aggregation.value == "sum" - - tax_filer_individuals = targets_by_arch_variable["tax_filer_individual_count"] - assert tax_filer_individuals.metadata["variable"] == "person_count" - assert tax_filer_individuals.aggregation.value == "count" - - -def test_arch_consumer_fact_jsonl_provider_maps_table_2_1_itemized_details( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - period = {"type": "tax_year", "value": 2023} - source_table = "Publication 1304 Table 2.1" - domain = 
"individual_income_tax_returns_with_itemized_deductions" - rows = [ - _consumer_fact( - "soi-charitable-deduction", - concept="irs_soi.contributions_deduction", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=211_975_123_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-charitable-returns", - concept="irs_soi.returns_with_contributions_deduction", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=11_747_949, - period=period, - ), - _consumer_fact( - "soi-interest-deduction", - concept="irs_soi.interest_paid_deduction", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=208_176_768_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-state-local-total", - concept="irs_soi.state_and_local_taxes", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=331_823_221_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-state-local-income-sales", - concept="irs_soi.state_local_income_or_sales_taxes", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=218_543_083_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-real-estate-taxes", - concept="irs_soi.real_estate_taxes", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=108_606_373_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-mortgage-financial", - concept="irs_soi.home_mortgage_interest_paid_to_financial_institutions", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=167_675_863_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-mortgage-individual", - concept="irs_soi.home_mortgage_interest_paid_to_individuals", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=3_688_924_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-deductible-points", - concept="irs_soi.deductible_points", - 
domain=domain, - source_name="irs_soi", - source_table=source_table, - value=1_027_127_000, - period=period, - unit="usd", - ), - _consumer_fact( - "soi-investment-interest", - concept="irs_soi.investment_interest_expense_deduction", - domain=domain, - source_name="irs_soi", - source_table=source_table, - value=35_768_354_000, - period=period, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - target_set = provider.load_target_set(TargetQuery(period=2023)) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target - for target in target_set.targets - if target.metadata.get("arch_variable") is not None - } - targets_by_measure = { - str(target.measure): target - for target in target_set.targets - if target.measure is not None - } - - charitable = targets_by_arch_variable["charitable_amount"] - assert charitable.measure == "charitable_deduction" - assert charitable.entity.value == "tax_unit" - assert charitable.value == 211_975_123_000 - assert ("itemized_deductions", ">", "0") in _target_filter_tuples(charitable) - - charitable_count = targets_by_arch_variable["charitable_returns"] - assert charitable_count.metadata["variable"] == "tax_unit_count" - assert charitable_count.aggregation.value == "count" - assert ("charitable_deduction", ">", "0") in _target_filter_tuples( - charitable_count - ) - assert ("itemized_deductions", ">", "0") in _target_filter_tuples( - charitable_count - ) - - assert targets_by_arch_variable["interest_paid_deduction_amount"].measure == ( - "interest_deduction" - ) - assert "total_state_local_taxes_amount" not in targets_by_arch_variable - state_local_income_sales = targets_by_arch_variable[ - "state_local_income_or_sales_tax_amount" - ] - assert state_local_income_sales.measure == "state_and_local_sales_or_income_tax" - assert 
targets_by_arch_variable["real_estate_taxes_amount"].measure == ( - "real_estate_taxes" - ) - salt = targets_by_measure["salt"] - assert salt.measure == "salt" - assert salt.value == 327_149_456_000 - assert salt.metadata["variable"] == "salt" - assert targets_by_arch_variable["mortgage_interest_paid_amount"].measure == ( - "deductible_mortgage_interest" - ) - assert targets_by_arch_variable["home_mortgage_personal_seller_amount"].measure == ( - "deductible_mortgage_interest" - ) - assert targets_by_arch_variable["deductible_points_amount"].measure == ( - "deductible_mortgage_interest" - ) - assert targets_by_arch_variable["investment_interest_paid_amount"].measure == ( - "investment_interest_expense" - ) - - coverage = summarize_arch_target_profile_coverage( - provider, - period=2023, - profile_name="custom", - target_cells=( - PolicyEngineUSTargetCell( - "charitable_deduction", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "charitable_deduction", - geo_level="national", - domain_variable="charitable_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "tax_unit_count", - geo_level="national", - domain_variable="charitable_deduction,tax_unit_itemizes", - ), - PolicyEngineUSTargetCell( - "interest_deduction", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "deductible_mortgage_interest", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "state_and_local_sales_or_income_tax", - geo_level="national", - ), - PolicyEngineUSTargetCell( - "salt", - geo_level="national", - domain_variable="salt,tax_unit_itemizes", - ), - ), - ) - assert coverage.covered_cell_count == coverage.target_cell_count - - -def test_arch_consumer_fact_jsonl_provider_maps_state_soi_rows( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "state-ca-agi-50k-75k", - concept="irs_soi.adjusted_gross_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - 
source_table="Historic Table 2 state AGI facts", - period={"type": "tax_year", "value": 2022}, - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=123_456_000_000, - unit="usd", - constraints=( - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": ">=", - "value": 50_000, - "unit": "usd", - "role": "filter", - }, - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": "<", - "value": 75_000, - "unit": "usd", - "role": "filter", - }, - ), - ), - _consumer_fact( - "state-ca-eitc-amount", - concept="irs_soi.earned_income_credit", - domain="individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state EITC totals", - period={"type": "tax_year", "value": 2022}, - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=5_770_703_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - agi = targets_by_arch_variable["adjusted_gross_income"] - assert agi.metadata["variable"] == "adjusted_gross_income" - assert agi.metadata["geo_level"] == "state" - assert agi.metadata["geography_id"] == "0400000US06" - assert agi.measure == "adjusted_gross_income" - assert agi.aggregation.value == "sum" - assert _target_filter_tuples(agi) == { - ("tax_unit_is_filer", "==", "1"), - ("adjusted_gross_income", ">=", "50000"), - ("adjusted_gross_income", "<", "75000"), - ("state_fips", "==", "06"), - } - - eitc = targets_by_arch_variable["eitc_amount"] - assert eitc.metadata["variable"] == "eitc" - assert eitc.metadata["geo_level"] == "state" - assert eitc.measure == "eitc" - assert eitc.aggregation.value == "sum" - assert _target_filter_tuples(eitc) == { - 
("tax_unit_is_filer", "==", "1"), - ("state_fips", "==", "06"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_acs_district_age_rows( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "acs-cd-al01-age-0-4", - concept="census_acs.person_count", - domain="total_population", - source_name="census_acs", - source_table="ACS S0101 congressional district age", - period={"type": "calendar_year", "value": 2024}, - geography={ - "level": "congressional_district", - "id": "5001900US0101", - "name": "Congressional District 1 (119th Congress), Alabama", - }, - value=39_908, - constraints=( - { - "variable": "age", - "operator": ">=", - "value": 0, - "unit": "years", - "role": "filter", - }, - { - "variable": "age", - "operator": "<", - "value": 5, - "unit": "years", - "role": "filter", - }, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["CENSUS_ACS"], - "target_cells": [ - { - "variable": "person_count", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": "age", - }, - ], - }, - ) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.value == 39_908 - assert target.metadata["variable"] == "person_count" - assert target.metadata["geo_level"] == "district" - assert target.metadata["source"] == "CENSUS_ACS" - assert _target_filter_tuples(target) == { - ("age", ">=", "0"), - ("age", "<", "5"), - ("congressional_district_geoid", "==", "0101"), - } - - -def test_arch_consumer_fact_jsonl_provider_normalizes_117th_district_geos( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "soi-cd-al01-agi", - concept="us:statutes/26/62#adjusted_gross_income", - 
domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="SOI Congressional District Data 2022", - period={"type": "tax_year", "value": 2022}, - geography={ - "level": "congressional_district", - "id": "5001700US0101", - "name": "Alabama Congressional District 1", - }, - value=22_915_824_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery( - period=2022, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": None, - }, - ], - }, - ) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.value == 22_915_824_000 - assert target.metadata["variable"] == "adjusted_gross_income" - assert target.metadata["geo_level"] == "district" - assert _target_filter_tuples(target) == { - ("tax_unit_is_filer", "==", "1"), - ("congressional_district_geoid", "==", "0101"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_acs_state_age_sex_rows( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - row = _consumer_fact( - "acs-ca-female-40-44", - concept="census_acs.person_count", - domain="total_population", - source_name="census_acs", - source_table="ACS B01001 state female age", - period={"type": "calendar_year", "value": 2023}, - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=1_300_307, - constraints=( - { - "variable": "age", - "operator": ">=", - "value": 40, - "unit": "years", - "role": "filter", - }, - { - "variable": "age", - "operator": "<", - "value": 45, - "unit": "years", - "role": "filter", - }, - { - "variable": "sex", - "operator": "==", - "value": "female", - "role": "filter", - }, - ), - ) - consumer_jsonl.write_text(json.dumps(row, 
sort_keys=True) + "\n") - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2023) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.value == 1_300_307 - assert target.metadata["variable"] == "person_count" - assert target.metadata["geo_level"] == "state" - assert _target_filter_tuples(target) == { - ("age", ">=", "40"), - ("age", "<", "45"), - ("is_female", "==", "1"), - ("state_fips", "==", "06"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_acs_district_snap_rows( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - geography = { - "level": "congressional_district", - "id": "5001900US0101", - "name": "Congressional District 1 (119th Congress), Alabama", - } - rows = [ - _consumer_fact( - "acs-cd-al01-households-total", - concept="census_acs.household_count", - domain="households", - source_name="census_acs", - source_table="ACS S2201 congressional district SNAP households", - period={"type": "calendar_year", "value": 2024}, - geography=geography, - value=300_636, - ), - _consumer_fact( - "acs-cd-al01-households-snap", - concept="census_acs.household_count", - domain="households", - source_name="census_acs", - source_table="ACS S2201 congressional district SNAP households", - period={"type": "calendar_year", "value": 2024}, - geography=geography, - value=34_742, - constraints=( - { - "variable": "snap_receipt_status", - "operator": "==", - "value": "receiving_food_stamps_snap", - "role": "filter", - }, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["CENSUS_ACS"], - "target_cells": [ - { - "variable": "household_count", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": "snap", - }, - ], 
- }, - ) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.value == 34_742 - assert target.metadata["variable"] == "household_count" - assert target.metadata["geo_level"] == "district" - assert target.metadata["source"] == "CENSUS_ACS" - assert _target_filter_tuples(target) == { - ("congressional_district_geoid", "==", "0101"), - ("snap", ">", "0"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_state_broad_soi_concepts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - geography = {"level": "state", "id": "0400000US06", "name": "California"} - rows = [ - _consumer_fact( - "state-ca-qualified-dividends", - concept="irs_soi.qualified_dividends", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=93_000_000_000, - unit="usd", - ), - _consumer_fact( - "state-ca-schedule-c-returns", - concept="irs_soi.returns_with_schedule_c_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=3_617_080, - ), - _consumer_fact( - "state-ca-partnership-scorp", - concept="irs_soi.partnership_scorp_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=125_930_370_000, - unit="usd", - ), - _consumer_fact( - "state-ca-medical-dental", - concept="irs_soi.medical_dental_expense_deduction", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=11_456_144_000, - unit="usd", - ), - _consumer_fact( - 
"state-ca-qbi-returns", - concept="irs_soi.returns_with_qualified_business_income_deduction", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=499_080, - ), - _consumer_fact( - "state-ca-qbi", - concept="irs_soi.qualified_business_income_deduction", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=4_400_400_000, - unit="usd", - ), - _consumer_fact( - "state-ca-rental-returns", - concept="irs_soi.returns_with_rental_royalty_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=1_315_410, - ), - _consumer_fact( - "state-ca-rental", - concept="irs_soi.rental_royalty_income", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=14_331_993_000, - unit="usd", - ), - _consumer_fact( - "state-ca-ctc-returns", - concept="irs_soi.returns_with_child_tax_credit", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=4_626_510, - ), - _consumer_fact( - "state-ca-ctc", - concept="irs_soi.child_tax_credit", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=9_724_583_000, - unit="usd", - ), - _consumer_fact( - "state-ca-actc-returns", - 
concept="irs_soi.returns_with_additional_child_tax_credit", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=1_933_500, - ), - _consumer_fact( - "state-ca-actc", - concept="irs_soi.additional_child_tax_credit", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Historic Table 2 state broad totals", - period={"type": "tax_year", "value": 2022}, - geography=geography, - value=3_605_628_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - qualified_dividends = targets_by_arch_variable["qualified_dividends_amount"] - assert qualified_dividends.metadata["variable"] == "qualified_dividend_income" - assert qualified_dividends.measure == "qualified_dividend_income" - assert _target_filter_tuples(qualified_dividends) == { - ("tax_unit_is_filer", "==", "1"), - ("state_fips", "==", "06"), - } - - schedule_c_returns = targets_by_arch_variable["schedule_c_income_returns"] - assert schedule_c_returns.metadata["variable"] == "tax_unit_count" - assert schedule_c_returns.aggregation.value == "count" - assert ("self_employment_income", ">", "0") in _target_filter_tuples( - schedule_c_returns - ) - - partnership = targets_by_arch_variable["partnership_scorp_income_amount"] - assert ( - partnership.metadata["variable"] == "tax_unit_partnership_s_corp_income" - ) - assert partnership.measure == "tax_unit_partnership_s_corp_income" - - medical = targets_by_arch_variable["medical_dental_expense_amount"] - assert medical.metadata["variable"] == "medical_expense_deduction" - assert medical.measure == 
"medical_expense_deduction" - - qbi = targets_by_arch_variable["qbi_amount"] - assert qbi.metadata["variable"] == "qualified_business_income_deduction" - assert qbi.measure == "qualified_business_income_deduction" - - qbi_claims = targets_by_arch_variable["qbi_claims"] - assert qbi_claims.metadata["variable"] == "tax_unit_count" - assert qbi_claims.aggregation.value == "count" - assert ( - "qualified_business_income_deduction", - ">", - "0", - ) in _target_filter_tuples(qbi_claims) - - rental = targets_by_arch_variable["rental_royalty_income_amount"] - assert rental.metadata["variable"] == "rental_income" - assert rental.measure == "rental_income" - - rental_returns = targets_by_arch_variable["rental_royalty_income_returns"] - assert rental_returns.metadata["variable"] == "tax_unit_count" - assert rental_returns.aggregation.value == "count" - assert ("rental_income", ">", "0") in _target_filter_tuples( - rental_returns - ) - - ctc = targets_by_arch_variable["ctc_amount"] - assert ctc.metadata["variable"] == "non_refundable_ctc" - assert ctc.measure == "non_refundable_ctc" - - ctc_claims = targets_by_arch_variable["ctc_claims"] - assert ctc_claims.metadata["variable"] == "tax_unit_count" - assert ctc_claims.aggregation.value == "count" - assert ("non_refundable_ctc", ">", "0") in _target_filter_tuples( - ctc_claims - ) - - actc = targets_by_arch_variable["actc_amount"] - assert actc.metadata["variable"] == "refundable_ctc" - assert actc.measure == "refundable_ctc" - - actc_claims = targets_by_arch_variable["actc_claims"] - assert actc_claims.metadata["variable"] == "tax_unit_count" - assert actc_claims.aggregation.value == "count" - assert ("refundable_ctc", ">", "0") in _target_filter_tuples( - actc_claims - ) - - -def test_arch_consumer_fact_jsonl_provider_maps_soi_alimony_concepts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "soi-alimony-received-returns", - 
concept="irs_soi.returns_with_alimony_received", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.4", - period={"type": "tax_year", "value": 2023}, - value=183_582, - ), - _consumer_fact( - "soi-alimony-received-amount", - concept="irs_soi.alimony_received", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.4", - period={"type": "tax_year", "value": 2023}, - value=6_686_429_000, - unit="usd", - ), - _consumer_fact( - "soi-alimony-paid-returns", - concept="irs_soi.returns_with_alimony_paid", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.4", - period={"type": "tax_year", "value": 2023}, - value=278_541, - ), - _consumer_fact( - "soi-alimony-paid-amount", - concept="irs_soi.alimony_paid", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.4", - period={"type": "tax_year", "value": 2023}, - value=7_497_135_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2023) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - received_amount = targets_by_arch_variable["alimony_received_amount"] - assert received_amount.metadata["variable"] == "alimony_income" - assert received_amount.measure == "alimony_income" - - received_returns = targets_by_arch_variable["alimony_received_returns"] - assert received_returns.metadata["variable"] == "tax_unit_count" - assert received_returns.aggregation.value == "count" - assert ("alimony_income", ">", "0") in _target_filter_tuples( - received_returns - ) - - paid_amount = targets_by_arch_variable["alimony_paid_amount"] - assert 
paid_amount.metadata["variable"] == "alimony_expense" - assert paid_amount.measure == "alimony_expense" - - paid_returns = targets_by_arch_variable["alimony_paid_returns"] - assert paid_returns.metadata["variable"] == "tax_unit_count" - assert paid_returns.aggregation.value == "count" - assert ("alimony_expense", ">", "0") in _target_filter_tuples(paid_returns) - - -def test_arch_consumer_fact_jsonl_provider_maps_eitc_by_agi_and_children( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - row = _consumer_fact( - "eitc-three-child-50k-75k-returns", - concept="irs_soi.returns_with_total_earned_income_credit", - domain="individual_income_tax_returns_with_earned_income_credit", - source_name="irs_soi", - source_table="Publication 1304 Table 2.5 EITC by AGI and qualifying children", - period={"type": "tax_year", "value": 2022}, - value=97_411, - constraints=( - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": ">=", - "value": 50_000, - "unit": "usd", - "role": "filter", - }, - { - "variable": "us:statutes/26/62#adjusted_gross_income", - "operator": "<", - "value": 75_000, - "unit": "usd", - "role": "filter", - }, - { - "variable": "us.tax.earned_income_credit_qualifying_children", - "operator": "==", - "value": 3, - "unit": "count", - "role": "filter", - }, - ), - ) - consumer_jsonl.write_text(json.dumps(row, sort_keys=True) + "\n") - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2022) - ) - target = target_set.targets[0] - - assert target.metadata["arch_variable"] == "eitc_claims" - assert target.metadata["variable"] == "tax_unit_count" - assert target.aggregation.value == "count" - assert _target_filter_tuples(target) == { - ("eitc", ">", "0"), - ("adjusted_gross_income", ">=", "50000"), - ("adjusted_gross_income", "<", "75000"), - ("eitc_child_count", "==", "3"), - } - - -def test_arch_consumer_fact_coverage_accepts_eitc_child_count_totals( - tmp_path: 
Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "eitc-one-child-total-returns", - concept="irs_soi.returns_with_total_earned_income_credit", - domain="individual_income_tax_returns_with_earned_income_credit", - source_name="irs_soi", - source_table=( - "Publication 1304 Table 2.5 EITC by AGI and qualifying children" - ), - period={"type": "tax_year", "value": 2022}, - value=8_490_417, - constraints=( - { - "variable": "us.tax.earned_income_credit_qualifying_children", - "operator": "==", - "value": 1, - "unit": "count", - "role": "filter", - }, - ), - ), - _consumer_fact( - "eitc-one-child-total-amount", - concept="irs_soi.total_earned_income_credit", - domain="individual_income_tax_returns_with_earned_income_credit", - source_name="irs_soi", - source_table=( - "Publication 1304 Table 2.5 EITC by AGI and qualifying children" - ), - period={"type": "tax_year", "value": 2022}, - value=21_182_747_000, - unit="usd", - constraints=( - { - "variable": "us.tax.earned_income_credit_qualifying_children", - "operator": "==", - "value": 1, - "unit": "count", - "role": "filter", - }, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - - report = summarize_arch_target_profile_coverage( - provider, - period=2022, - profile_name="custom", - target_cells=( - { - "variable": "eitc", - "geo_level": "national", - "domain_variable": "eitc_child_count", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "eitc_child_count", - }, - ), - ) - - assert report.covered_cell_count == 2 - - -def test_arch_consumer_fact_jsonl_provider_maps_us_admin_source_families( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "kff-aca-effectuated", - concept="cms_aca.marketplace_effectuated_enrollment", - 
domain="aca_marketplace_effectuated_enrollment", - source_name="kff", - source_table="Marketplace Effectuated Enrollment", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=1_795_695, - ), - _consumer_fact( - "cms-medicaid-monthly", - concept="cms_medicaid.total_medicaid_enrollment", - domain="medicaid_chip_enrollment", - source_name="cms_medicaid", - source_table="Monthly Medicaid and CHIP Enrollment", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - period={"type": "month", "value": "2024-12"}, - value=13_500_000, - ), - _consumer_fact( - "cms-nhe-medicaid", - concept="cms_nhe.medicaid_title_xix_expenditures", - domain="national_health_expenditures", - source_name="cms_nhe", - source_table="National Health Expenditures", - value=931_692_000_000, - unit="usd", - ), - _consumer_fact( - "snap-benefits", - concept="usda_snap.total_benefits", - domain="supplemental_nutrition_assistance_program", - source_name="usda_snap", - source_table="SNAP fiscal year benefits", - value=100_000_000_000, - unit="usd", - ), - _consumer_fact( - "snap-households", - concept="usda_snap.average_monthly_households", - domain="supplemental_nutrition_assistance_program", - source_name="usda_snap", - source_table="SNAP fiscal year participation", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=2_100_000, - ), - _consumer_fact( - "tanf-cash", - concept="hhs_acf_tanf.cash_assistance_expenditures", - domain="tanf_cash_assistance", - source_name="hhs_acf_tanf", - source_table="TANF Financial Data", - period={"type": "fiscal_year", "value": 2024}, - value=7_788_317_475, - unit="usd", - ), - _consumer_fact( - "tanf-total-families", - concept="hhs_acf_tanf.average_monthly_tanf_total_families", - domain="tanf_caseload", - source_name="hhs_acf_tanf", - source_table="TANF Caseload Data 2024", - period={"type": "fiscal_year", "value": 2024}, - value=841_209, - ), - _consumer_fact( - "liheap-households", - 
concept="hhs_acf_liheap.households_served_by_state_programs", - domain="liheap_state_programs", - source_name="hhs_acf_liheap", - source_table="LIHEAP FY2024 National Profile (All States)", - period={"type": "fiscal_year", "value": 2024}, - value=5_876_646, - constraints=( - {"variable": "program", "operator": "==", "value": "liheap"}, - { - "variable": "administering_entity", - "operator": "==", - "value": "state_programs", - }, - ), - ), - _consumer_fact( - "stc-income-tax", - concept="census_stc.individual_income_tax_collections", - domain="state_government_tax_collections", - source_name="census_stc", - source_table="FY2024 STC Flat File item T40", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - period={"type": "fiscal_year", "value": 2024}, - value=123_101_651_000, - unit="usd", - ), - _consumer_fact( - "ssa-retirement", - concept="ssa.annual_oasdi_or_ssi_payment_amount", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="Annual Statistical Supplement", - value=1_111_728_000_000, - unit="usd", - constraints=( - { - "variable": "us_social_security_and_ssi.program_payment_type", - "operator": "==", - "value": "social_security_retirement_benefits", - }, - ), - ), - _consumer_fact( - "ssa-ssi", - concept="ssa.annual_oasdi_or_ssi_payment_amount", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="Annual Statistical Supplement", - value=63_079_493_000, - unit="usd", - constraints=( - { - "variable": "us_social_security_and_ssi.program_payment_type", - "operator": "==", - "value": "ssi_payments", - }, - ), - ), - _consumer_fact( - "pep-age", - concept="census_pep.resident_population", - domain="resident_population", - source_name="census_pep", - source_table="Annual Estimates by Age and Sex", - value=18_599_314, - constraints=( - {"variable": "age", "operator": ">=", "value": 0, "unit": "years"}, - {"variable": "age", "operator": "<", "value": 5, "unit": "years"}, - ), - ), - 
_consumer_fact( - "aca-oep-average-aptc", - concept="cms_aca.average_monthly_aptc", - domain="aca_marketplace_qhp_selections", - source_name="cms_aca", - source_table="OEP State-Level Public Use File", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=526, - unit="usd", - ), - _consumer_fact( - "w2-traditional-401k", - concept="irs_soi.form_w2_401k_elective_deferrals", - domain="form_w2_items", - source_name="irs_soi", - source_table="Form W-2 Statistics Table 4.B", - period={"type": "tax_year", "value": 2024}, - value=277_859_181_000, - unit="usd", - ), - _consumer_fact( - "w2-roth-401k", - concept="irs_soi.form_w2_designated_roth_401k_contributions", - domain="form_w2_items", - source_name="irs_soi", - source_table="Form W-2 Statistics Table 4.B", - period={"type": "tax_year", "value": 2024}, - value=32_302_509_000, - unit="usd", - ), - _consumer_fact( - "soi-keogh", - concept="irs_soi.payments_to_keogh_plan", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.4", - period={"type": "tax_year", "value": 2024}, - value=30_130_848_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - { - "variable": "person_count", - "geo_level": "state", - "domain_variable": "aca_ptc", - }, - { - "variable": "person_count", - "geo_level": "state", - "domain_variable": "medicaid_enrolled", - }, - {"variable": "medicaid", "geo_level": "national", "domain_variable": None}, - {"variable": "snap", "geo_level": "national", "domain_variable": None}, - { - "variable": "household_count", - "geo_level": "state", - "domain_variable": "snap", - }, - {"variable": "tanf", "geo_level": "national", "domain_variable": None}, - { - "variable": 
"spm_unit_count", - "geo_level": "national", - "domain_variable": "tanf", - }, - { - "variable": "household_count", - "geo_level": "national", - "domain_variable": "spm_unit_energy_subsidy_reported", - }, - { - "variable": "state_income_tax", - "geo_level": "state", - "domain_variable": None, - }, - { - "variable": "social_security_retirement", - "geo_level": "national", - "domain_variable": None, - }, - {"variable": "ssi", "geo_level": "national", "domain_variable": None}, - { - "variable": "person_count", - "geo_level": "national", - "domain_variable": "age", - }, - { - "variable": "traditional_401k_contributions", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "roth_401k_contributions", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "self_employed_pension_contribution_ald", - "geo_level": "national", - "domain_variable": None, - }, - ), - ) - - assert report.target_cell_count == 15 - assert report.covered_cell_count == 15 - - target_set = provider.load_target_set(TargetQuery(period=2024)) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - assert ( - targets_by_arch_variable["aca_marketplace_enrollment"].metadata["variable"] - == "person_count" - ) - assert ( - targets_by_arch_variable["medicaid_total_enrollment"].metadata["variable"] - == "person_count" - ) - assert targets_by_arch_variable["medicaid_benefits"].measure == "medicaid" - assert targets_by_arch_variable["snap_benefits"].measure == "snap" - assert ( - targets_by_arch_variable["snap_household_count"].metadata["variable"] - == "household_count" - ) - assert targets_by_arch_variable["tanf_cash_assistance"].measure == "tanf" - assert ( - targets_by_arch_variable["tanf_family_count"].metadata["variable"] - == "spm_unit_count" - ) - liheap_target = targets_by_arch_variable["liheap_household_count"] - assert liheap_target.metadata["variable"] == "household_count" - assert { - 
(target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in liheap_target.filters - } == {("spm_unit_energy_subsidy_reported", ">", 0)} - assert ( - targets_by_arch_variable["state_individual_income_tax_collections"].measure - == "state_income_tax" - ) - assert ( - targets_by_arch_variable["social_security_retirement_benefits"].measure - == "social_security_retirement" - ) - social_security_retirement = targets_by_arch_variable[ - "social_security_retirement_benefits" - ] - assert _target_filter_tuples(social_security_retirement) == set() - assert "program_payment_type" not in social_security_retirement.required_features - assert targets_by_arch_variable["ssi_payments"].measure == "ssi" - ssi_payments = targets_by_arch_variable["ssi_payments"] - assert _target_filter_tuples(ssi_payments) == set() - assert "program_payment_type" not in ssi_payments.required_features - traditional_401k = targets_by_arch_variable["traditional_401k_contributions"] - assert traditional_401k.measure == "traditional_401k_contributions" - assert traditional_401k.entity.value == "person" - roth_401k = targets_by_arch_variable["roth_401k_contributions"] - assert roth_401k.measure == "roth_401k_contributions" - assert roth_401k.entity.value == "person" - self_employed_pension = targets_by_arch_variable[ - "self_employed_pension_contribution_ald" - ] - assert self_employed_pension.measure == "self_employed_pension_contribution_ald" - assert self_employed_pension.entity.value == "tax_unit" - assert "aca_average_monthly_aptc" not in targets_by_arch_variable - - -def test_arch_consumer_fact_jsonl_provider_maps_medicare_part_b_premiums( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "cms-medicare-part-b-premiums", - concept="cms_medicare.part_b_premium_income", - domain="medicare_financing", - source_name="cms_medicare", - source_table="2025 Medicare Trustees Report Table III.C3", - 
period={"type": "calendar_year", "value": 2024}, - value=139_837_000_000, - unit="usd", - constraints=( - {"variable": "amount_basis", "operator": "==", "value": "actual"}, - {"variable": "medicare.part", "operator": "==", "value": "part_b"}, - { - "variable": "medicare.financing_component", - "operator": "==", - "value": "premiums_from_enrollees", - }, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - target = target_set.targets[0] - assert target.metadata["arch_variable"] == "medicare_part_b_premiums" - assert target.metadata["variable"] == "medicare_part_b_premiums" - assert target.measure == "medicare_part_b_premiums" - assert target.entity.value == "person" - assert target.filters == () - - -def test_arch_consumer_fact_jsonl_provider_uses_ssa_payment_type_as_variable( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - payment_types = { - "social_security_benefits": "social_security", - "social_security_dependents_benefits": "social_security_dependents", - "social_security_disability_benefits": "social_security_disability", - "social_security_retirement_benefits": "social_security_retirement", - "social_security_survivors_benefits": "social_security_survivors", - "ssi_payments": "ssi", - } - rows = [ - _consumer_fact( - f"ssa-{payment_type}", - concept="ssa.annual_oasdi_or_ssi_payment_amount", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="Annual Statistical Supplement", - value=1_000_000_000, - unit="usd", - constraints=( - { - "variable": "us_social_security_and_ssi.program_payment_type", - "operator": "==", - "value": payment_type, - }, - ), - ) - for payment_type in payment_types - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = 
ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - assert set(targets_by_arch_variable) == set(payment_types) - for arch_variable, measure in payment_types.items(): - target = targets_by_arch_variable[arch_variable] - assert target.measure == measure - assert target.metadata["variable"] == measure - assert _target_filter_tuples(target) == set() - assert "program_payment_type" not in target.required_features - - -def test_arch_consumer_fact_jsonl_provider_maps_ssi_detail_targets( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "ssa-ssi-aged-recipients", - concept="ssa.ssi_recipient_count", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="SSI Annual Statistical Report 2024", - period={"type": "calendar_year", "value": 2024}, - value=1_160_608, - unit="count", - constraints=( - {"variable": "ssi_category", "operator": "==", "value": "aged"}, - ), - ), - _consumer_fact( - "ssa-ca-ssi-payments", - concept="ssa.ssi_payment_amount", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="SSI Annual Statistical Report 2024", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - period={"type": "calendar_year", "value": 2024}, - value=12_800_000_000, - unit="usd", - ), - _consumer_fact( - "ssa-ca-ssi-disabled-recipients", - concept="ssa.ssi_recipient_count", - domain="social_security_and_ssi_payments", - source_name="ssa", - source_table="SSI Annual Statistical Report 2024", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - period={"type": "calendar_year", "value": 2024}, - value=877_000, - unit="count", - constraints=( - {"variable": "ssi_category", "operator": "==", "value": "disabled"}, - ), - ), - ] - consumer_jsonl.write_text( - 
"\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - def find_target( - arch_variable: str, - required_filters: set[tuple[str, str, object]], - ): - for target in target_set.targets: - if target.metadata["arch_variable"] != arch_variable: - continue - filters = { - ( - target_filter.feature, - target_filter.operator.value, - target_filter.value, - ) - for target_filter in target.filters - } - if required_filters.issubset(filters): - return target - raise AssertionError( - f"Missing {arch_variable} target with filters {required_filters}" - ) - - aged_count = find_target( - "ssi_recipients", - {("is_ssi_aged", ">", 0), ("ssi", ">", 0)}, - ) - assert aged_count.measure is None - assert aged_count.entity.value == "person" - assert aged_count.value == pytest.approx(1_160_608) - assert aged_count.metadata["arch_variable"] == "ssi_recipients" - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in aged_count.filters - } == {("is_ssi_aged", ">", 0), ("ssi", ">", 0)} - - ca_payments = find_target( - "ssi_total_payments", - {("state_fips", "==", "06")}, - ) - assert ca_payments.measure == "ssi" - assert ca_payments.entity.value == "person" - assert ca_payments.value == pytest.approx(12_800_000_000) - assert ca_payments.metadata["arch_variable"] == "ssi_total_payments" - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter in ca_payments.filters - } == {("state_fips", "==", "06")} - - ca_disabled_count = find_target( - "ssi_recipients", - { - ("is_ssi_disabled", ">", 0), - ("ssi", ">", 0), - ("state_fips", "==", "06"), - }, - ) - assert ca_disabled_count.measure is None - assert ca_disabled_count.value == pytest.approx(877_000) - assert { - (target_filter.feature, target_filter.operator.value, target_filter.value) - for target_filter 
in ca_disabled_count.filters - } == { - ("is_ssi_disabled", ">", 0), - ("ssi", ">", 0), - ("state_fips", "==", "06"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_fed_household_net_worth( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "fed-z1-net-worth", - concept="federal_reserve.z1.households_nonprofits_net_worth", - domain="household_balance_sheet", - source_name="federal_reserve", - source_table="Z.1 B.101 Households and nonprofit organizations", - period={"type": "calendar_year", "value": 2024}, - value=169_619_200_000_000, - unit="usd", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - target = target_set.targets[0] - assert target.metadata["arch_variable"] == "net_worth_amount" - assert target.metadata["variable"] == "net_worth" - assert target.measure == "net_worth" - assert target.entity.value == "household" - assert target.filters == () - - -def test_arch_consumer_fact_jsonl_provider_maps_decennial_sld_facts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "census-cd119-sldu-population", - concept="census_decennial.resident_population", - domain="resident_population", - source_name="census_decennial", - source_table="2020 Census CD119 California SLD P1", - geography={ - "level": "state_legislative_district_upper", - "id": "610U900US06001", - "name": "State Senate District 1", - }, - value=943_108, - ), - _consumer_fact( - "census-cd119-sldl-households", - concept="census_decennial.occupied_housing_units", - domain="households", - source_name="census_decennial", - source_table="2020 Census CD119 California SLD H3", - geography={ - "level": "state_legislative_district_lower", - "id": "620L900US06080", - "name": "Assembly District 80", - 
}, - value=154_291, - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - { - "variable": "person_count", - "geo_level": "sldu", - "geographic_id": "CA-SLDU-001", - "domain_variable": None, - }, - { - "variable": "household_count", - "geo_level": "sldl", - "geographic_id": "CA-SLDL-080", - "domain_variable": None, - }, - ), - ) - - assert report.covered_cell_count == 2 - target_set = provider.load_target_set(TargetQuery(period=2024)) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - population = targets_by_arch_variable["population"] - households = targets_by_arch_variable["household_count"] - - assert population.value == 943_108 - assert population.metadata["source"] == "CENSUS_DECENNIAL" - assert population.metadata["geo_level"] == "sldu" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in population.filters - } == {("sldu_id", "==", "CA-SLDU-001")} - assert households.value == 154_291 - assert households.metadata["geo_level"] == "sldl" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in households.filters - } == {("sldl_id", "==", "CA-SLDL-080")} - - -def test_arch_consumer_fact_jsonl_provider_maps_acs_cd_age_population( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "acs-cd119-age-population", - concept="census_acs.person_count", - domain="total_population", - source_name="census_acs", - source_table="ACS 2024 1-year subject table S0101", - geography={ - "level": 
"congressional_district", - "id": "5001900US0101", - "name": "Alabama Congressional District 1", - }, - value=39_908, - constraints=( - {"variable": "age", "operator": ">=", "value": 0, "unit": "years"}, - {"variable": "age", "operator": "<", "value": 5, "unit": "years"}, - ), - ), - _consumer_fact( - "acs-cd119-households", - concept="census_acs.household_count", - domain="households", - source_name="census_acs", - source_table="ACS 2024 1-year subject table S2201", - geography={ - "level": "congressional_district", - "id": "5001900US0101", - "name": "Alabama Congressional District 1", - }, - value=300_636, - ), - _consumer_fact( - "acs-cd119-snap-households", - concept="census_acs.household_count", - domain="households", - source_name="census_acs", - source_table="ACS 2024 1-year subject table S2201", - geography={ - "level": "congressional_district", - "id": "5001900US0101", - "name": "Alabama Congressional District 1", - }, - value=34_742, - constraints=( - { - "variable": "snap_receipt_status", - "operator": "==", - "value": "receiving_food_stamps_snap", - }, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - { - "variable": "person_count", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": "age", - }, - { - "variable": "household_count", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": None, - }, - { - "variable": "household_count", - "geo_level": "district", - "geographic_id": "0101", - "domain_variable": "snap", - }, - ), - ) - - assert report.covered_cell_count == 3 - target_set = provider.load_target_set(TargetQuery(period=2024)) - targets_by_key = { - target.metadata["arch_source_record_id"]: target - for target in target_set.targets - } - target = 
targets_by_key["census_acs.acs-cd119-age-population"] - households = targets_by_key["census_acs.acs-cd119-households"] - snap_households = targets_by_key["census_acs.acs-cd119-snap-households"] - - assert target.metadata["source"] == "CENSUS_ACS" - assert target.metadata["arch_variable"] == "population" - assert target.metadata["variable"] == "person_count" - assert target.metadata["geo_level"] == "district" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in target.filters - } == { - ("age", ">=", "0"), - ("age", "<", "5"), - ("congressional_district_geoid", "==", "0101"), - } - assert households.metadata["arch_variable"] == "household_count" - assert households.metadata["variable"] == "household_count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in households.filters - } == {("congressional_district_geoid", "==", "0101")} - assert snap_households.metadata["arch_variable"] == "household_count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in snap_households.filters - } == { - ("congressional_district_geoid", "==", "0101"), - ("snap", ">", "0"), - } - - -def test_arch_consumer_fact_jsonl_provider_maps_census_population_projection( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "census-popproj-age-0", - concept="census.population_projection", - domain="population_projection", - source_name="census_population_projections", - source_table="2023 National Population Projections Main Series", - period={"type": "calendar_year", "value": 2024}, - value=3_636_897, - constraints=( - {"variable": "age", "operator": ">=", "value": 0, "unit": "years"}, - 
{"variable": "age", "operator": "<", "value": 1, "unit": "years"}, - ), - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - target = target_set.targets[0] - assert target.metadata["arch_variable"] == "population" - assert target.metadata["variable"] == "person_count" - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in target.filters - } == {("age", ">=", "0"), ("age", "<", "1")} - - -def test_arch_consumer_fact_jsonl_provider_normalizes_legacy_sld_ids( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "legacy-sldu-population", - concept="census_decennial.resident_population", - domain="resident_population", - source_name="census_decennial", - source_table="Legacy SLD fixture", - geography={ - "level": "state_senate_district", - "id": "CA-SD-1", - "name": "State Senate District 1", - }, - value=943_108, - ), - _consumer_fact( - "legacy-sldl-households", - concept="census_decennial.occupied_housing_units", - domain="households", - source_name="census_decennial", - source_table="Legacy SLD fixture", - geography={ - "level": "state_house_district", - "id": "NY-AD-65", - "name": "Assembly District 65", - }, - value=154_291, - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - { - "variable": "person_count", - "geo_level": "sldu", - "geographic_id": "06001", - "domain_variable": None, - }, - { - "variable": "household_count", - "geo_level": "sldl", - "geographic_id": "36065", - 
"domain_variable": None, - }, - ), - ) - - assert report.covered_cell_count == 2 - target_set = provider.load_target_set(TargetQuery(period=2024)) - targets_by_arch_variable = { - target.metadata["arch_variable"]: target for target in target_set.targets - } - - assert { - (target_filter.feature, str(target_filter.value)) - for target_filter in targets_by_arch_variable["population"].filters - } == {("sldu_id", "CA-SLDU-001")} - assert { - (target_filter.feature, str(target_filter.value)) - for target_filter in targets_by_arch_variable["household_count"].filters - } == {("sldl_id", "NY-SLDL-065")} - - -def test_arch_consumer_fact_jsonl_provider_maps_bea_full_population_amounts( - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr( - arch_module, - "ARCH_NATIONAL_ROLLUP_STATE_FIPS", - frozenset({"06", "36"}), - ) - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - rows = [ - _consumer_fact( - "bea-nipa-wages", - concept="bea_nipa.wages_and_salaries", - domain="personal_income", - source_name="bea", - source_table="NIPA annual total wages and salaries", - value=11_000_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-proprietors", - concept=( - "bea_nipa.proprietors_income_with_inventory_valuation_and_capital_consumption_adjustments" - ), - domain="personal_income", - source_name="bea", - source_table="NIPA annual personal income components", - value=2_000_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-us-wages", - concept="bea_regional.wages_and_salaries", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - value=12_300_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-us-proprietors", - concept="bea_regional.proprietors_income", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - value=2_020_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ca-wages", - concept="bea_regional.wages_and_salaries", - 
domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=1_500_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ca-supplements", - concept="bea_regional.supplements_to_wages_and_salaries", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=300_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ca-contributions", - concept="bea_regional.contributions_for_government_social_insurance", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=200_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ca-residence", - concept="bea_regional.residence_adjustment", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=40_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ny-wages", - concept="bea_regional.wages_and_salaries", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US36", "name": "New York"}, - value=2_000_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ny-supplements", - concept="bea_regional.supplements_to_wages_and_salaries", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US36", "name": "New York"}, - value=400_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ny-contributions", - concept="bea_regional.contributions_for_government_social_insurance", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US36", "name": "New York"}, - value=100_000_000_000, - 
unit="usd", - ), - _consumer_fact( - "bea-regional-ny-residence", - concept="bea_regional.residence_adjustment", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US36", "name": "New York"}, - value=-50_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-regional-ca-proprietors", - concept="bea_regional.proprietors_income", - domain="personal_income", - source_name="bea", - source_table="SAINC5N", - geography={"level": "state", "id": "0400000US06", "name": "California"}, - value=180_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-dividends", - concept="bea_nipa.personal_dividend_income", - domain="personal_income", - source_name="bea", - source_table="NIPA annual personal income components", - value=2_100_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-rental", - concept=( - "bea_nipa.rental_income_of_persons_with_capital_consumption_adjustment" - ), - domain="personal_income", - source_name="bea", - source_table="NIPA annual personal income components", - value=1_000_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-social-security", - concept="bea_nipa.social_security_benefits", - domain="personal_current_transfer_receipts", - source_name="bea", - source_table="NIPA annual personal income components", - value=1_500_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-medicaid", - concept="bea_nipa.medicaid_benefits", - domain="personal_current_transfer_receipts", - source_name="bea", - source_table="NIPA annual personal income components", - value=900_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-ui", - concept="bea_nipa.unemployment_insurance_benefits", - domain="personal_current_transfer_receipts", - source_name="bea", - source_table="NIPA annual personal income components", - value=30_000_000_000, - unit="usd", - ), - _consumer_fact( - "bea-nipa-saving-rate", - concept="bea_nipa.personal_saving_rate", - 
domain="personal_income", - source_name="bea", - source_table="NIPA annual personal income disposition", - value=3.8, - unit="percent", - ), - ] - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - provider = ArchConsumerFactJSONLTargetProvider(consumer_jsonl) - - report = summarize_arch_target_profile_coverage( - provider, - period=2024, - profile_name="custom", - target_cells=( - { - "variable": "employment_income_before_lsr", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "employment_income_before_lsr", - "geo_level": "state", - "domain_variable": None, - }, - { - "variable": "self_employment_income", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "self_employment_income", - "geo_level": "state", - "domain_variable": None, - }, - { - "variable": "dividend_income", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "rental_income", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "social_security", - "geo_level": "national", - "domain_variable": None, - }, - {"variable": "medicaid", "geo_level": "national", "domain_variable": None}, - { - "variable": "unemployment_compensation", - "geo_level": "national", - "domain_variable": None, - }, - ), - ) - - assert report.target_cell_count == 9 - assert report.covered_cell_count == 7 - - target_set = provider.load_target_set(TargetQuery(period=2024)) - targets_by_source_record = { - target.metadata["arch_source_record_id"]: target - for target in target_set.targets - } - assert set(targets_by_source_record) == { - "bea.bea-nipa-wages", - "bea.bea-nipa-proprietors", - "microplex.derived.bea_state_wages.2024.06", - "microplex.derived.bea_state_wages.2024.36", - "bea.bea-regional-ca-proprietors", - "bea.bea-nipa-dividends", - "bea.bea-nipa-rental", - "bea.bea-nipa-social-security", - "bea.bea-nipa-medicaid", - "bea.bea-nipa-ui", - } - assert 
targets_by_source_record["bea.bea-nipa-wages"].measure == ( - "employment_income_before_lsr" - ) - assert ( - targets_by_source_record["bea.bea-nipa-wages"].metadata["arch_variable"] - == "employment_income_before_lsr_amount" - ) - assert targets_by_source_record["bea.bea-nipa-wages"].filters == () - assert targets_by_source_record["bea.bea-nipa-proprietors"].measure == ( - "proprietors_income_amount" - ) - assert targets_by_source_record["bea.bea-nipa-proprietors"].metadata[ - "arch_concept" - ] == ( - "bea_nipa.proprietors_income_with_inventory_valuation_and_capital_consumption_adjustments" - ) - assert targets_by_source_record["bea.bea-nipa-proprietors"].filters == () - ca_state_wages = targets_by_source_record[ - "microplex.derived.bea_state_wages.2024.06" - ] - ny_state_wages = targets_by_source_record[ - "microplex.derived.bea_state_wages.2024.36" - ] - assert ca_state_wages.measure == ( - "employment_income_before_lsr" - ) - assert ca_state_wages.metadata["arch_variable"] == ( - "employment_income_before_lsr_amount" - ) - ca_adjusted = 1_500_000_000_000 + 40_000_000_000 * ( - 1_500_000_000_000 / 2_000_000_000_000 - ) - ny_adjusted = 2_000_000_000_000 - 50_000_000_000 * ( - 2_000_000_000_000 / 2_500_000_000_000 - ) - scale = 11_000_000_000_000 / (ca_adjusted + ny_adjusted) - assert ca_state_wages.value == pytest.approx(ca_adjusted * scale) - assert ny_state_wages.value == pytest.approx(ny_adjusted * scale) - assert ca_state_wages.value + ny_state_wages.value == pytest.approx( - 11_000_000_000_000 - ) - assert targets_by_source_record["bea.bea-regional-ca-proprietors"].measure == ( - "proprietors_income_amount" - ) - assert { - ( - target_filter.feature, - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in ca_state_wages.filters - } == {("state_fips", "==", "06")} - assert targets_by_source_record["bea.bea-nipa-dividends"].source == "BEA" - assert "bea.bea-regional-us-wages" not in 
targets_by_source_record - assert "bea.bea-regional-ca-wages" not in targets_by_source_record - assert "bea.bea-regional-ca-supplements" not in targets_by_source_record - assert "bea.bea-regional-ca-contributions" not in targets_by_source_record - assert "bea.bea-regional-ca-residence" not in targets_by_source_record - assert "bea.bea-regional-us-proprietors" not in targets_by_source_record - assert not provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={"variables": ("self_employment_income",)}, - ) - ).targets - assert all( - target.metadata["arch_variable"] != "personal_saving_rate" - for target in target_set.targets - ) - - -def test_arch_consumer_fact_jsonl_provider_skips_cbo_projection_concepts( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - control = _consumer_fact( - "soi-wages", - concept="irs_soi.total_wages", - domain="all_individual_income_tax_returns", - source_name="irs_soi", - source_table="Publication 1304 Table 1.1", - period={"type": "tax_year", "value": 2024}, - value=10_000_000_000_000, - unit="usd", - ) - cbo_concepts = ( - "cbo.adjusted_gross_income_projection", - "cbo.wages_and_salaries_projection", - ( - "cbo.taxable_interest_and_ordinary_dividends_excluding_qualified_" - "dividends_projection" - ), - "cbo.qualified_dividend_income_projection", - "cbo.net_capital_gain_projection", - "cbo.net_business_income_projection", - ) - rows = [control] - for concept in cbo_concepts: - row = _consumer_fact( - concept.rsplit(".", 1)[-1], - concept=concept, - domain="individual_income_tax_returns", - source_name="cbo", - source_table=( - "Revenue Projections, by Category, February 2026, " - "sheet 3.Individual Income Tax Details" - ), - period={"type": "tax_year", "value": 2024}, - value=1_000_000_000, - unit="usd", - ) - row["concept_alignment"] = { - "canonical_concept": concept, - "source_concept": concept.replace("_projection", ""), - "relation": "source_label", - "authority": "cbo", - 
"evidence_notes": "Projection fixture.", - } - rows.append(row) - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - target_set = ArchConsumerFactJSONLTargetProvider(consumer_jsonl).load_target_set( - TargetQuery(period=2024) - ) - - assert len(target_set.targets) == 1 - target = target_set.targets[0] - assert target.metadata["arch_variable"] == "wages_salaries_amount" - assert target.measure == "employment_income" - assert target.metadata["arch_source_concept"] == "irs_soi.total_wages" - - -def test_arch_target_smoke_cli_reports_consumer_fact_jsonl_counts( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - - exit_code = main_smoke( - [ - "--arch-targets-db", - str(consumer_jsonl), - "--period", - "2023", - "--expected-target-count", - "5", - "--no-compose-model-year-targets", - ] - ) - payload = json.loads(capsys.readouterr().out) - - assert exit_code == 0 - assert payload["valid"] - assert payload["target_count"] == 5 - assert payload["by_source"] == {"IRS_SOI": 5} - assert payload["by_variable"] == { - "adjusted_gross_income": 2, - "income_tax": 1, - "tax_unit_count": 2, - } - assert payload["errors"] == [] - assert payload["sample_targets"][0]["metadata"]["arch_aggregate_fact_key"] - - -def test_arch_target_smoke_cli_rejects_unexpected_target_count( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - - exit_code = main_smoke( - [ - "--arch-targets-db", - str(consumer_jsonl), - "--period", - "2023", - "--expected-target-count", - "6", - "--no-compose-model-year-targets", - ] - ) - payload = json.loads(capsys.readouterr().out) - - assert exit_code == 1 - assert not payload["valid"] - assert payload["target_count"] == 5 - assert payload["errors"] == [ - { - "code": 
"unexpected_target_count", - "message": "Expected 6 targets, loaded 5.", - } - ] - - -def test_arch_target_parity_cli_accepts_matching_consumer_fact_jsonl( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - value_db = tmp_path / "value_targets.db" - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _create_value_constraint_target_db(value_db) - _write_consumer_fact_jsonl(consumer_jsonl) - - exit_code = main_parity( - [ - "--incumbent-arch-targets-db", - str(value_db), - "--candidate-arch-targets-db", - str(consumer_jsonl), - "--period", - "2023", - "--no-compose-model-year-targets", - ] - ) - payload = json.loads(capsys.readouterr().out) - - assert exit_code == 0 - assert payload["valid"] - assert payload["counts"] == { - "candidate_only_count": 0, - "candidate_target_count": 5, - "duplicate_identity_count": 0, - "incumbent_only_count": 0, - "incumbent_target_count": 5, - "matched_count": 5, - "value_mismatch_count": 0, - } - assert payload["errors"] == [] - assert payload["rows"][0]["status"] == "matched" - - -def test_arch_target_parity_cli_rejects_value_mismatch( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - value_db = tmp_path / "value_targets.db" - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _create_value_constraint_target_db(value_db) - _write_consumer_fact_jsonl(consumer_jsonl) - rows = [json.loads(line) for line in consumer_jsonl.read_text().splitlines()] - rows[1]["value"] += 1_000 - consumer_jsonl.write_text( - "\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n" - ) - - exit_code = main_parity( - [ - "--incumbent-arch-targets-db", - str(value_db), - "--candidate-arch-targets-db", - str(consumer_jsonl), - "--period", - "2023", - "--no-compose-model-year-targets", - ] - ) - payload = json.loads(capsys.readouterr().out) - - assert exit_code == 1 - assert not payload["valid"] - assert payload["counts"]["matched_count"] == 4 - assert payload["counts"]["value_mismatch_count"] == 1 - assert 
payload["errors"][0]["code"] == "value_mismatch" - assert payload["errors"][0]["absolute_delta"] == 1_000 - - -def test_arch_target_parity_cli_rejects_duplicate_candidate_identity( - tmp_path: Path, - capsys: pytest.CaptureFixture[str], -) -> None: - value_db = tmp_path / "value_targets.db" - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _create_value_constraint_target_db(value_db) - _write_consumer_fact_jsonl(consumer_jsonl) - lines = consumer_jsonl.read_text().splitlines() - consumer_jsonl.write_text("\n".join([*lines, lines[0]]) + "\n") - - exit_code = main_parity( - [ - "--incumbent-arch-targets-db", - str(value_db), - "--candidate-arch-targets-db", - str(consumer_jsonl), - "--period", - "2023", - "--no-compose-model-year-targets", - ] - ) - payload = json.loads(capsys.readouterr().out) - - assert exit_code == 1 - assert not payload["valid"] - assert payload["counts"]["duplicate_identity_count"] == 1 - assert payload["errors"][0]["code"] == "duplicate_identity" - assert payload["errors"][0]["candidate_target_count"] == 2 - - -def test_arch_fact_provider_composes_latest_source_facts_to_model_year( - tmp_path: Path, -) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - - target_set = ArchFactSQLiteTargetProvider(fact_db).load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "adjusted_gross_income", - "geo_level": "national", - "domain_variable": None, - } - ], - }, - ) - ) - - all_agi = next( - target - for target in target_set.targets - if target.metadata["arch_aggregate_fact_key"] == "arch.fact.v1:all-agi" - ) - assert all_agi.period == 2024 - assert all_agi.value == 15_286_017_359_000 - assert all_agi.metadata["arch_source_period"] == 2023 - assert all_agi.metadata["arch_model_period"] == 2024 - assert all_agi.metadata["arch_aging_amount_factor"] == 1 - assert all_agi.metadata["arch_aging_amount_method"] == ( - 
"source_fact_carry_forward_no_amount_reference" - ) - - -def test_arch_composite_source_facts_age_across_artifacts( - tmp_path: Path, -) -> None: - table_1_1_db = tmp_path / "arch_table_1_1.db" - table_1_4_db = tmp_path / "arch_table_1_4.db" - _create_arch_fact_db(table_1_1_db) - _insert_arch_table_1_1_reference_totals( - table_1_1_db, - year=2022, - return_count=160_602_107 / 1.1, - adjusted_gross_income=15_286_017_359_000 / 1.1, - ) - _create_arch_fact_db(table_1_4_db) - _insert_arch_table_1_4_facts(table_1_4_db) - provider = resolve_arch_sqlite_target_provider((table_1_1_db, table_1_4_db)) - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - provider_filters={ - "sources": ["IRS_SOI"], - "target_cells": [ - { - "variable": "employment_income", - "geo_level": "national", - "domain_variable": "employment_income", - } - ], - }, - ) - ) - - wages = next( - target - for target in target_set.targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.fact.v1:t14-all-wages-amount" - ) - assert wages.period == 2024 - assert wages.value == 10_500_000_000_000 * 1.1 - assert wages.metadata["arch_source_period"] == 2023 - assert wages.metadata["arch_aging_amount_factor"] == 1.1 - assert wages.metadata["arch_aging_amount_method"] == ( - "soi_total_agi_last_growth_extrapolation" - ) - assert wages.metadata["arch_source_db_path"] == str(table_1_4_db) - - -def test_arch_provider_resolver_detects_source_fact_schema(tmp_path: Path) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - - provider = resolve_arch_sqlite_target_provider(fact_db) - - assert isinstance(provider, ArchFactSQLiteTargetProvider) - - -def test_arch_provider_resolver_detects_consumer_fact_jsonl(tmp_path: Path) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - - provider = resolve_arch_sqlite_target_provider(consumer_jsonl) - - assert isinstance(provider, ArchConsumerFactJSONLTargetProvider) - - 
-def test_arch_provider_resolver_combines_multiple_source_fact_dbs( - tmp_path: Path, -) -> None: - table_1_1_db = tmp_path / "arch_table_1_1.db" - table_1_4_db = tmp_path / "arch_table_1_4.db" - _create_arch_fact_db(table_1_1_db) - _create_arch_fact_db(table_1_4_db) - _insert_arch_table_1_4_facts(table_1_4_db) - - provider = resolve_arch_sqlite_target_provider( - (str(table_1_1_db), str(table_1_4_db)) - ) - target_set = provider.load_target_set(TargetQuery(period=2023)) - - assert isinstance(provider, ArchCompositeSQLiteTargetProvider) - assert len(target_set.targets) == 18 - assert len({target.name for target in target_set.targets}) == 18 - assert {target.metadata["target_id"] for target in target_set.targets} == set( - range(1, 19) - ) - assert all( - "arch_source_db_path" in target.metadata for target in target_set.targets - ) - - -def test_us_pipeline_arch_target_provider_accepts_source_fact_db( - tmp_path: Path, -) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - arch_targets_db=str(fact_db), - calibration_target_source="arch", - ) - ) - - provider, source = pipeline._resolve_calibration_target_provider() - - assert source == "arch" - assert isinstance(provider, ArchFactSQLiteTargetProvider) - - -def test_us_pipeline_arch_target_provider_accepts_consumer_fact_jsonl( - tmp_path: Path, -) -> None: - consumer_jsonl = tmp_path / "consumer_facts.jsonl" - _write_consumer_fact_jsonl(consumer_jsonl) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - arch_targets_db=str(consumer_jsonl), - calibration_target_source="arch", - ) - ) - - provider, source = pipeline._resolve_calibration_target_provider() - - assert source == "arch" - assert isinstance(provider, ArchConsumerFactJSONLTargetProvider) - - -def test_us_pipeline_arch_target_provider_accepts_multiple_source_fact_dbs( - tmp_path: Path, -) -> None: - table_1_1_db = tmp_path / "arch_table_1_1.db" - table_1_4_db = 
tmp_path / "arch_table_1_4.db" - _create_arch_fact_db(table_1_1_db) - _create_arch_fact_db(table_1_4_db) - _insert_arch_table_1_4_facts(table_1_4_db) - pipeline = USMicroplexPipeline( - USMicroplexBuildConfig( - arch_targets_db=(str(table_1_1_db), str(table_1_4_db)), - calibration_target_source="arch", - ) - ) - - provider, source = pipeline._resolve_calibration_target_provider() - target_set = provider.load_target_set(TargetQuery(period=2023)) - - assert source == "arch" - assert isinstance(provider, ArchCompositeSQLiteTargetProvider) - assert len(target_set.targets) == 18 - - -def test_arch_fact_provider_maps_soi_table_1_4_income_source_facts( - tmp_path: Path, -) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - _insert_arch_table_1_4_facts(fact_db) - - target_set = ArchFactSQLiteTargetProvider(fact_db).load_target_set( - TargetQuery(period=2023) - ) - table_1_4_targets = [ - target - for target in target_set.targets - if target.metadata["source_table"] == "Publication 1304 Table 1.4" - ] - - arch_variables = {target.metadata["arch_variable"] for target in table_1_4_targets} - assert arch_variables >= { - "wages_salaries_returns", - "wages_salaries_amount", - "net_capital_gains_returns", - "net_capital_gains_amount", - "taxable_ira_distributions_returns", - "taxable_ira_distributions_amount", - "taxable_pension_income_returns", - "taxable_pension_income_amount", - "unemployment_compensation_returns", - "unemployment_compensation_amount", - "taxable_social_security_returns", - "taxable_social_security_amount", - } - - wages_amount = next( - target - for target in table_1_4_targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.fact.v1:t14-all-wages-amount" - ) - assert wages_amount.measure == "employment_income" - assert getattr(wages_amount.aggregation, "value", wages_amount.aggregation) == "sum" - assert getattr(wages_amount.entity, "value", wages_amount.entity) == "person" - assert wages_amount.metadata["variable"] 
== "employment_income" - assert wages_amount.metadata["arch_source_concept"] == "irs_soi.total_wages" - assert wages_amount.metadata["arch_concept_relation"] == "broad_match" - assert wages_amount.metadata["arch_source_cell_keys"] == [ - "arch.source_cell.v1:t14-wages-amount" - ] - - wages_returns = next( - target - for target in table_1_4_targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.fact.v1:t14-all-wages-returns" - ) - assert wages_returns.measure is None - assert ( - getattr(wages_returns.aggregation, "value", wages_returns.aggregation) - == "count" - ) - assert getattr(wages_returns.entity, "value", wages_returns.entity) == "tax_unit" - assert wages_returns.metadata["variable"] == "tax_unit_count" - assert ( - "employment_income", - ">", - "0", - ) in { - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in wages_returns.filters - } - - capital_gains_amount = next( - target - for target in table_1_4_targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.fact.v1:t14-all-capital-gains-amount" - ) - assert ( - "net_capital_gains", - ">", - "0", - ) in { - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in capital_gains_amount.filters - } - - bracket_wages = next( - target - for target in table_1_4_targets - if target.metadata["arch_aggregate_fact_key"] - == "arch.fact.v1:t14-1-to-5k-wages-amount" - ) - assert { - ( - str(target_filter.feature), - str(getattr(target_filter.operator, "value", target_filter.operator)), - str(target_filter.value), - ) - for target_filter in bracket_wages.filters - } >= { - ("adjusted_gross_income", ">=", "1"), - ("adjusted_gross_income", "<", "5000"), - } - - -def test_arch_fact_profile_coverage_accepts_soi_table_1_4_facts( - tmp_path: Path, -) -> None: - fact_db = tmp_path / 
"arch_facts.db" - _create_arch_fact_db(fact_db) - _insert_arch_table_1_4_facts(fact_db) - provider = ArchFactSQLiteTargetProvider(fact_db) - - report = summarize_arch_target_profile_coverage( - provider, - period=2023, - profile_name="custom", - target_cells=( - { - "variable": "employment_income", - "geo_level": "national", - "domain_variable": "employment_income", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "employment_income", - }, - { - "variable": "taxable_social_security", - "geo_level": "national", - "domain_variable": "taxable_social_security", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "taxable_social_security", - }, - ), - ) - - assert report.target_cell_count == 4 - assert report.covered_cell_count == 4 - assert report.coverage_rate == 1 - - -def test_arch_composite_profile_coverage_combines_table_1_1_and_1_4( - tmp_path: Path, -) -> None: - table_1_1_db = tmp_path / "arch_table_1_1.db" - table_1_4_db = tmp_path / "arch_table_1_4.db" - _create_arch_fact_db(table_1_1_db) - _create_arch_fact_db(table_1_4_db) - _insert_arch_table_1_4_facts(table_1_4_db) - provider = resolve_arch_sqlite_target_provider((table_1_1_db, table_1_4_db)) - - report = summarize_arch_target_profile_coverage( - provider, - period=2023, - profile_name="custom", - target_cells=( - { - "variable": "adjusted_gross_income", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "income_tax", - "geo_level": "national", - "domain_variable": None, - }, - { - "variable": "employment_income", - "geo_level": "national", - "domain_variable": "employment_income", - }, - { - "variable": "tax_unit_count", - "geo_level": "national", - "domain_variable": "employment_income", - }, - ), - ) - - assert report.target_cell_count == 4 - assert report.covered_cell_count == 4 - assert report.coverage_rate == 1 - - -def test_arch_fact_gap_queue_uses_source_fact_loaded_catalog( - tmp_path: Path, -) 
-> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - _insert_arch_table_1_4_facts(fact_db) - provider = ArchFactSQLiteTargetProvider(fact_db) - - report = summarize_arch_target_gap_queue( - provider, - period=2023, - profile_name="custom", - target_cells=( - { - "variable": "employment_income", - "geo_level": "state", - "domain_variable": "employment_income", - }, - ), - ) - - assert report.row_count == 1 - assert report.rows[0].expected_arch_variable == "wages_salaries_amount" - assert report.rows[0].loader_status == "loaded_arch_variable_missing_geography" - - -def test_arch_fact_gap_queue_expected_filters_normalize_geography_ids( - tmp_path: Path, -) -> None: - fact_db = tmp_path / "arch_facts.db" - _create_arch_fact_db(fact_db) - provider = ArchFactSQLiteTargetProvider(fact_db) - - report = summarize_arch_target_gap_queue( - provider, - period=2023, - profile_name="custom", - target_cells=( - { - "variable": "person_count", - "geo_level": "state", - "geographic_id": "06", - "domain_variable": None, - }, - { - "variable": "person_count", - "geo_level": "sldu", - "geographic_id": "06001", - "domain_variable": None, - }, - { - "variable": "household_count", - "geo_level": "sldl", - "geographic_id": "36065", - "domain_variable": None, - }, - ), - ) - - filters_by_level = { - row.geo_level: { - item["feature"]: item["value"] - for item in row.expected_filters - if item["kind"] == "geography" - } - for row in report.rows - } - - assert filters_by_level == { - "state": {"state_fips": "06"}, - "sldu": {"sldu_id": "CA-SLDU-001"}, - "sldl": {"sldl_id": "NY-SLDL-065"}, - } diff --git a/tests/targets/test_census_blocks.py b/tests/targets/test_census_blocks.py deleted file mode 100644 index 7cee0594..00000000 --- a/tests/targets/test_census_blocks.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Tests for Census block-derived target providers.""" - -from __future__ import annotations - -import pandas as pd -from microplex.core import EntityType -from 
microplex.targets import TargetAggregation, TargetFilter, TargetQuery - -from microplex_us.targets.census_blocks import ( - CENSUS_BLOCK_POPULATION_SOURCE, - CensusBlockPopulationTargetProvider, - build_census_block_population_targets, -) - - -def _sample_blocks() -> pd.DataFrame: - return pd.DataFrame( - { - "geoid": [ - "060010201001000", - "060010201001001", - "060030101001000", - "360610001001000", - ], - "state_fips": ["06", "06", "06", "36"], - "county": ["001", "001", "003", "061"], - "tract": ["020100", "020100", "010100", "000100"], - "population": [10, 20, 5, 7], - "cd_id": ["CA-12", "CA-12", "CA-03", "NY-10"], - "sldu_id": ["CA-SD-09", "CA-SD-09", "CA-SD-01", "NY-SD-30"], - "sldl_id": ["CA-HD-18", "CA-HD-18", "CA-HD-05", "NY-AD-65"], - "cbsa_code": ["41860", "41860", None, "35620"], - "spm_metro_area": ["41860", "41860", "", "35620"], - } - ) - - -def test_build_census_block_population_targets_rolls_parent_geographies() -> None: - targets = build_census_block_population_targets( - _sample_blocks(), - geo_levels=("national", "state", "county", "tract", "cd", "sldu", "sldl"), - ) - - by_name = {target.name: target for target in targets} - - assert by_name["census_block_population_national"].value == 42 - assert by_name["census_block_population_state_06"].value == 35 - assert by_name["census_block_population_county_06001"].value == 30 - assert by_name["census_block_population_tract_06001020100"].value == 30 - assert by_name["census_block_population_cd_CA_12"].value == 30 - assert by_name["census_block_population_sldu_CA_SLDU_009"].value == 30 - assert by_name["census_block_population_sldl_CA_SLDL_018"].value == 30 - - county = by_name["census_block_population_county_06001"] - assert county.entity is EntityType.PERSON - assert county.aggregation is TargetAggregation.COUNT - assert county.source == CENSUS_BLOCK_POPULATION_SOURCE - assert county.filters == ( - TargetFilter(feature="county_fips", operator="==", value="06001"), - ) - assert 
county.metadata["variable"] == "person_count" - assert county.metadata["geo_level"] == "county" - assert county.metadata["geographic_id"] == "06001" - assert county.metadata["block_rollup"] is True - - -def test_census_block_provider_filters_by_geo_level_and_id() -> None: - provider = CensusBlockPopulationTargetProvider(block_probabilities=_sample_blocks()) - - target_set = provider.load_target_set( - TargetQuery( - provider_filters={ - "geo_levels": ["county", "cd"], - "geographic_ids": ["06001", "CA-03"], - "variables": ["person_count"], - }, - ) - ) - - targets = sorted(target_set.targets, key=lambda target: target.name) - - assert [target.name for target in targets] == [ - "census_block_population_cd_CA_03", - "census_block_population_county_06001", - ] - assert [target.value for target in targets] == [5, 30] - - -def test_census_block_provider_normalizes_legacy_sld_ids() -> None: - provider = CensusBlockPopulationTargetProvider(block_probabilities=_sample_blocks()) - - target_set = provider.load_target_set( - TargetQuery( - provider_filters={ - "geo_levels": ["sldu", "sldl"], - "geographic_ids": ["CA-SD-09", "NY-AD-65"], - } - ) - ) - by_name = {target.name: target for target in target_set.targets} - - assert by_name["census_block_population_sldu_CA_SLDU_009"].value == 30 - assert by_name["census_block_population_sldl_NY_SLDL_065"].value == 7 - - -def test_census_block_targets_use_geo_level_to_normalize_bare_sld_ids() -> None: - targets = build_census_block_population_targets( - _sample_blocks(), - geo_levels=("sldu", "sldl"), - geographic_ids=("06009", "36065"), - ) - by_name = {target.name: target for target in targets} - - assert by_name["census_block_population_sldu_CA_SLDU_009"].value == 30 - assert by_name["census_block_population_sldl_NY_SLDL_065"].value == 7 - - provider = CensusBlockPopulationTargetProvider(block_probabilities=_sample_blocks()) - target_set = provider.load_target_set( - TargetQuery( - provider_filters={ - "geo_levels": ["sldu", 
"sldl"], - "geographic_ids": ["06009", "36065"], - } - ) - ) - provider_by_name = {target.name: target for target in target_set.targets} - - assert provider_by_name["census_block_population_sldu_CA_SLDU_009"].value == 30 - assert provider_by_name["census_block_population_sldl_NY_SLDL_065"].value == 7 - - -def test_census_block_targets_resolve_all_before_bare_sld_filter_expansion() -> None: - targets = build_census_block_population_targets( - _sample_blocks(), - geo_levels=("all",), - geographic_ids=("06009",), - ) - - assert { - target.name: target.value - for target in targets - if target.metadata["geo_level"] == "sldu" - } == {"census_block_population_sldu_CA_SLDU_009": 30} - - -def test_census_block_provider_ignores_unrelated_variables() -> None: - provider = CensusBlockPopulationTargetProvider(block_probabilities=_sample_blocks()) - - target_set = provider.load_target_set( - TargetQuery(provider_filters={"variables": ["household_count"]}) - ) - - assert target_set.targets == [] diff --git a/tests/targets/test_supabase.py b/tests/targets/test_supabase.py deleted file mode 100644 index aa8a56be..00000000 --- a/tests/targets/test_supabase.py +++ /dev/null @@ -1,446 +0,0 @@ -"""Tests for loading US calibration targets from Supabase.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any - -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import FilterOperator, TargetAggregation, TargetQuery - -from microplex_us.calibration_harness import CalibrationHarness -from microplex_us.supabase_targets import ( - SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY, - SUPABASE_TARGET_TYPE_KEY, - SUPABASE_VARIABLE_KEY, - SupabaseTargetLoader, - SupabaseTargetProvider, -) -from microplex_us.target_registry import ( - US_TARGET_CATEGORY_KEY, - US_TARGET_LEVEL_KEY, - TargetCategory, - TargetLevel, -) - -SUPABASE_URL = "https://test.supabase.co" -SUPABASE_KEY = "test-key" - - -@dataclass -class 
MockResponse: - payload: list[dict[str, Any]] - - def json(self) -> list[dict[str, Any]]: - return self.payload - - def raise_for_status(self) -> None: - return None - - -@pytest.fixture -def loader() -> SupabaseTargetLoader: - return SupabaseTargetLoader(SUPABASE_URL, SUPABASE_KEY) - - -@pytest.fixture -def provider() -> SupabaseTargetProvider: - return SupabaseTargetProvider(SUPABASE_URL, SUPABASE_KEY) - - -@pytest.fixture -def request_queue(monkeypatch: pytest.MonkeyPatch): - calls = [] - responses: list[MockResponse] = [] - - def fake_get( - url: str, - *, - headers: dict[str, str], - params: dict[str, Any], - timeout: int, - ) -> MockResponse: - calls.append( - { - "url": url, - "headers": headers, - "params": params, - "timeout": timeout, - } - ) - return responses.pop(0) - - def queue(*payloads: list[dict[str, Any]]) -> list[dict[str, Any]]: - responses[:] = [MockResponse(payload) for payload in payloads] - return calls - - monkeypatch.setattr("microplex_us.supabase_targets.requests.get", fake_get) - return queue - - -def test_missing_service_key_raises(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv("POLICYENGINE_SUPABASE_SERVICE_KEY", raising=False) - - with pytest.raises(ValueError, match="POLICYENGINE_SUPABASE_SERVICE_KEY"): - SupabaseTargetLoader(SUPABASE_URL) - - -def test_load_all_targets(loader: SupabaseTargetLoader, request_queue) -> None: - calls = request_queue( - [ - { - "id": "t1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - { - "id": "t2", - "variable": "snap_spending", - "value": 103100000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "USDA SNAP", "institution": "USDA"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - ] - ) - - targets = loader.load_all() - - assert [target["variable"] for target in targets] == 
[ - "employment_income", - "snap_spending", - ] - assert calls[0]["url"] == f"{SUPABASE_URL}/rest/v1/targets" - assert calls[0]["params"]["limit"] == 1000 - assert calls[0]["params"]["offset"] == 0 - - -def test_load_by_institution( - loader: SupabaseTargetLoader, - request_queue, -) -> None: - request_queue( - [{"id": "src-1", "institution": "IRS", "name": "IRS SOI"}], - [ - { - "id": "t1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - } - ], - ) - - targets = loader.load_by_institution("IRS") - - assert len(targets) == 1 - assert targets[0]["source"]["institution"] == "IRS" - - -def test_load_by_period(loader: SupabaseTargetLoader, request_queue) -> None: - calls = request_queue( - [ - { - "id": "t1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - } - ] - ) - - targets = loader.load_by_period(2024) - - assert len(targets) == 1 - assert calls[0]["params"]["period"] == "eq.2024" - - -def test_cps_column_mapping(loader: SupabaseTargetLoader) -> None: - mapping = loader.get_cps_column_map() - - assert mapping["employment_income"] == "employment_income" - assert mapping["self_employment_income"] == "self_employment_income" - assert mapping["dividend_income"] == "dividend_income" - assert mapping["snap_spending"] == "snap" - assert mapping["ssi_spending"] == "ssi" - assert mapping["eitc_spending"] == "eitc" - - -def test_build_continuous_targets( - loader: SupabaseTargetLoader, - request_queue, -) -> None: - request_queue( - [ - { - "id": "t1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": 
"National", "jurisdiction": "us"}, - }, - { - "id": "t2", - "variable": "snap_spending", - "value": 103100000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "USDA SNAP", "institution": "USDA"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - ] - ) - - constraints = loader.build_calibration_constraints() - - assert constraints["employment_income"] == 9022400000000 - assert constraints["snap"] == 103100000000 - - -def test_build_state_targets( - loader: SupabaseTargetLoader, - request_queue, -) -> None: - request_queue( - [ - { - "id": "t1", - "variable": "medicaid_enrollment", - "value": 14000000, - "target_type": "count", - "period": 2024, - "source": {"name": "CMS Medicaid", "institution": "HHS"}, - "stratum": {"name": "California", "jurisdiction": "us-ca"}, - } - ] - ) - - constraints = loader.build_calibration_constraints(include_states=True) - - assert constraints["medicaid_ca"] == 14000000 - - -def test_get_summary(loader: SupabaseTargetLoader, request_queue) -> None: - request_queue( - [ - { - "id": "t1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - { - "id": "t2", - "variable": "person_count", - "value": 330000000, - "target_type": "count", - "period": 2024, - "source": {"name": "Census", "institution": "Census"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - ] - ) - - summary = loader.get_summary() - - assert summary == { - "total": 2, - "by_institution": {"IRS": 1, "Census": 1}, - "by_variable": {"employment_income": 1, "person_count": 1}, - "by_type": {"amount": 1, "count": 1}, - } - - -def test_target_from_row_builds_national_sum_spec( - provider: SupabaseTargetProvider, -) -> None: - spec = provider.target_from_row( - { - "id": "target-1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": 
"amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - } - ) - - assert spec.name == "employment_income" - assert spec.entity is EntityType.PERSON - assert spec.aggregation is TargetAggregation.SUM - assert spec.measure == "employment_income" - assert spec.filters == () - assert spec.value == 9022400000000 - assert spec.source == "IRS SOI" - assert spec.metadata[SUPABASE_VARIABLE_KEY] == "employment_income" - assert spec.metadata[SUPABASE_TARGET_TYPE_KEY] == "amount" - assert spec.metadata[SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY] is True - assert spec.metadata[US_TARGET_CATEGORY_KEY] == TargetCategory.INCOME.value - assert spec.metadata[US_TARGET_LEVEL_KEY] == TargetLevel.NATIONAL.value - - -def test_target_from_row_builds_state_count_spec( - provider: SupabaseTargetProvider, -) -> None: - spec = provider.target_from_row( - { - "id": "target-2", - "variable": "medicaid_enrollment", - "value": 14000000, - "target_type": "count", - "period": 2024, - "source": {"name": "CMS Medicaid", "institution": "HHS"}, - "stratum": {"name": "California", "jurisdiction": "us-ca"}, - } - ) - - assert spec.name == "medicaid_enrollment_us_ca" - assert spec.entity is EntityType.PERSON - assert spec.aggregation is TargetAggregation.COUNT - assert spec.measure is None - assert spec.filters[0].feature == "medicaid" - assert spec.filters[0].operator is FilterOperator.GT - assert spec.filters[0].value == 0 - assert spec.filters[1].feature == "state_fips" - assert spec.filters[1].operator is FilterOperator.EQ - assert spec.filters[1].value == "06" - assert spec.required_features == ("medicaid", "state_fips") - assert spec.metadata[US_TARGET_CATEGORY_KEY] == TargetCategory.HEALTH.value - assert spec.metadata[US_TARGET_LEVEL_KEY] == TargetLevel.STATE.value - - -def test_target_from_row_keeps_unsupported_variables_classifiable( - provider: SupabaseTargetProvider, -) -> None: - spec = provider.target_from_row( - 
{ - "id": "target-3", - "variable": "unknown_cash_income", - "value": 100, - "target_type": "amount", - "period": 2024, - "source": {"name": "Unknown", "institution": "Other"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - } - ) - - assert spec.measure == "unknown_cash_income" - assert spec.required_features == ("unknown_cash_income",) - assert spec.metadata[SUPABASE_SUPPORTED_BY_COLUMN_MAP_KEY] is False - - -def test_load_target_set_filters_rows_with_core_query( - provider: SupabaseTargetProvider, - request_queue, -) -> None: - calls = request_queue( - [ - { - "id": "target-1", - "variable": "employment_income", - "value": 9022400000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", "institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - { - "id": "target-2", - "variable": "snap_spending", - "value": 103100000000, - "target_type": "amount", - "period": 2024, - "source": {"name": "USDA SNAP", "institution": "USDA"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - { - "id": "target-3", - "variable": "unknown_cash_income", - "value": 100, - "target_type": "amount", - "period": 2024, - "source": {"name": "Unknown", "institution": "Other"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - ] - ) - - target_set = provider.load_target_set( - TargetQuery( - period=2024, - entity=EntityType.PERSON, - metadata_filters={US_TARGET_CATEGORY_KEY: TargetCategory.INCOME.value}, - provider_filters={"include_unsupported": False}, - ) - ) - - assert [target.name for target in target_set.targets] == ["employment_income"] - assert calls[0]["params"]["period"] == "eq.2024" - - -def test_calibration_harness_can_use_supabase_target_provider( - provider: SupabaseTargetProvider, - request_queue, -) -> None: - request_queue( - [ - { - "id": "target-1", - "variable": "employment_income", - "value": 30, - "target_type": "amount", - "period": 2024, - "source": {"name": "IRS SOI", 
"institution": "IRS"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - { - "id": "target-2", - "variable": "unknown_cash_income", - "value": 100, - "target_type": "amount", - "period": 2024, - "source": {"name": "Unknown", "institution": "Other"}, - "stratum": {"name": "National", "jurisdiction": "us"}, - }, - ] - ) - harness = CalibrationHarness(target_provider=provider) - frame = pd.DataFrame( - { - "employment_income": [10.0, 20.0], - "weight": [1.0, 1.0], - } - ) - - result = harness.run_experiment( - frame, - "supabase_income", - categories=[TargetCategory.INCOME], - only_available=True, - period=2024, - provider_filters={"include_unsupported": False}, - entity=EntityType.PERSON, - verbose=False, - ) - - assert result.targets_used == ["employment_income"] - assert result.errors == {"employment_income": 0.0} diff --git a/tests/test_block_synthesis.py b/tests/test_block_synthesis.py deleted file mode 100644 index 10079533..00000000 --- a/tests/test_block_synthesis.py +++ /dev/null @@ -1,119 +0,0 @@ -"""US-specific block-geography integration around the core hierarchical synthesizer.""" - -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest -from microplex.hierarchical import HierarchicalSynthesizer, HouseholdSchema - -from microplex_us.geography import BlockGeography, derive_geographies - - -@pytest.fixture -def block_probabilities() -> pd.DataFrame: - data_path = Path(__file__).resolve().parents[2] / "microplex" / "data" / "block_probabilities.parquet" - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - return pd.read_parquet(data_path) - - -@pytest.fixture -def sample_cps_data() -> tuple[pd.DataFrame, pd.DataFrame]: - rng = np.random.default_rng(42) - n_households = 100 - - households = pd.DataFrame( - { - "household_id": range(n_households), - "n_persons": rng.choice([1, 2, 3, 4, 5], size=n_households, p=[0.3, 0.3, 0.2, 0.15, 0.05]), - 
"state_fips": rng.choice([6, 48, 12, 36], size=n_households), - "tenure": rng.choice([1, 2], size=n_households), - "hh_weight": rng.uniform(100, 1000, size=n_households), - } - ) - households["n_adults"] = np.clip( - households["n_persons"] - rng.integers(0, 2, size=n_households), - 1, - None, - ) - households["n_children"] = households["n_persons"] - households["n_adults"] - - people: list[dict[str, float | int]] = [] - for _, household in households.iterrows(): - for person_idx in range(int(household["n_persons"])): - is_adult = person_idx < household["n_adults"] - people.append( - { - "household_id": household["household_id"], - "person_id": len(people), - "age": int(rng.integers(25, 65)) if is_adult else int(rng.integers(0, 18)), - "sex": int(rng.choice([1, 2])), - "income": float(rng.uniform(0, 100000)) if is_adult else 0.0, - "employment_status": int(rng.choice([1, 2, 3])) if is_adult else 0, - "education": int(rng.choice([1, 2, 3, 4])) if is_adult else 0, - "relationship_to_head": 0 if person_idx == 0 else int(rng.choice([1, 2, 3])), - } - ) - - return households, pd.DataFrame(people) - - -def _schema() -> HouseholdSchema: - return HouseholdSchema( - hh_vars=["n_persons", "n_adults", "n_children", "state_fips", "tenure"], - person_vars=["age", "sex", "income", "employment_status", "education", "relationship_to_head"], - ) - - -def test_generate_includes_block_geoid(sample_cps_data, block_probabilities: pd.DataFrame) -> None: - households, persons = sample_cps_data - synthesizer = HierarchicalSynthesizer( - schema=_schema(), - block_probabilities=block_probabilities, - random_state=42, - ) - - synthesizer.fit(households, persons, hh_weight_col="hh_weight", epochs=5, verbose=False) - synthetic_households, _ = synthesizer.generate(n_households=50, verbose=False) - - assert "block_geoid" in synthetic_households.columns - assert all(synthetic_households["block_geoid"].str.len() == 15) - - -def test_derive_geographies_post_hoc(sample_cps_data, block_probabilities: 
pd.DataFrame) -> None: - households, persons = sample_cps_data - synthesizer = HierarchicalSynthesizer( - schema=_schema(), - block_probabilities=block_probabilities, - random_state=42, - ) - - synthesizer.fit(households, persons, hh_weight_col="hh_weight", epochs=5, verbose=False) - synthetic_households, _ = synthesizer.generate(n_households=50, verbose=False) - - geographies = derive_geographies( - synthetic_households["block_geoid"], - include_cd=True, - include_sld=True, - block_data=block_probabilities, - ) - - assert set(["tract_geoid", "county_fips", "cd_id", "sldu_id", "sldl_id"]).issubset( - geographies.columns - ) - - -def test_block_geography_integration_uses_real_block_table( - block_probabilities: pd.DataFrame, -) -> None: - geography = BlockGeography(lazy_load=True) - geography._data = block_probabilities - - sample_block = block_probabilities["geoid"].iloc[0] - geos = geography.get_all_geographies(sample_block) - - assert geos["county_fips"][:2] == geos["state_fips"] - assert geos["tract_geoid"].startswith(geos["county_fips"]) diff --git a/tests/test_calibration_harness.py b/tests/test_calibration_harness.py deleted file mode 100644 index bdd5cc44..00000000 --- a/tests/test_calibration_harness.py +++ /dev/null @@ -1,164 +0,0 @@ -"""Tests for the canonical-target calibration harness.""" - -import numpy as np -import pandas as pd -from microplex.core import EntityType -from microplex.targets import ( - StaticTargetProvider, - TargetAggregation, - TargetFilter, - TargetSet, - TargetSpec, -) - -from microplex_us.calibration_harness import CalibrationHarness -from microplex_us.target_registry import ( - TargetCategory, - TargetGroup, - TargetRegistry, -) - - -def _make_registry() -> TargetRegistry: - geography_target = TargetSpec( - name="ca_people", - entity=EntityType.PERSON, - value=2.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter(feature="state_fips", operator="==", value="06"),), - metadata={ - "us_category": 
"geography", - "us_level": "state", - "us_group": "people", - "available_in_cps": True, - "requires_imputation": False, - }, - ) - income_target = TargetSpec( - name="ca_income", - entity=EntityType.PERSON, - value=30.0, - period=2024, - measure="employment_income", - aggregation=TargetAggregation.SUM, - filters=(TargetFilter(feature="state_fips", operator="==", value="06"),), - metadata={ - "us_category": "income", - "us_level": "state", - "us_group": "people", - "available_in_cps": True, - "requires_imputation": False, - }, - ) - return TargetRegistry( - groups={ - "people": TargetGroup( - name="people", - category=TargetCategory.GEOGRAPHY, - targets=[geography_target, income_target], - ), - }, - build_defaults=False, - ) - - -class TestCalibrationHarness: - def test_get_target_vector_uses_canonical_target_spec(self): - harness = CalibrationHarness(registry=_make_registry()) - df = pd.DataFrame( - { - "state_fips": ["06", "06", "08"], - "employment_income": [10.0, 20.0, 5.0], - "weight": [1.0, 1.0, 1.0], - } - ) - targets = harness.registry.get_all_targets() - - design_matrix, target_vector, target_names = harness.get_target_vector( - df, - targets, - entity=EntityType.PERSON, - ) - - np.testing.assert_allclose(design_matrix[:, 0], np.array([1.0, 1.0, 0.0])) - np.testing.assert_allclose(design_matrix[:, 1], np.array([10.0, 20.0, 0.0])) - np.testing.assert_allclose(target_vector, np.array([2.0, 30.0])) - assert target_names == ["ca_people", "ca_income"] - - def test_run_experiment_filters_to_selected_canonical_targets(self): - harness = CalibrationHarness(registry=_make_registry()) - df = pd.DataFrame( - { - "state_fips": ["06", "06", "08"], - "employment_income": [10.0, 20.0, 5.0], - "weight": [1.0, 1.0, 1.0], - } - ) - - result = harness.run_experiment( - df, - "people_only", - groups=["people"], - only_available=True, - entity=EntityType.PERSON, - verbose=False, - ) - - assert result.targets_used == ["ca_people", "ca_income"] - 
np.testing.assert_allclose(result.weights, np.ones(3)) - - def test_run_experiment_can_use_core_target_provider(self): - targets = _make_registry().get_all_targets() + [ - TargetSpec( - name="future_income", - entity=EntityType.PERSON, - value=50.0, - period=2025, - measure="employment_income", - aggregation=TargetAggregation.SUM, - metadata={ - "us_category": "income", - "us_level": "national", - "us_group": "future", - "available_in_cps": True, - "requires_imputation": False, - }, - ) - ] - provider = StaticTargetProvider(TargetSet(targets)) - harness = CalibrationHarness(target_provider=provider) - df = pd.DataFrame( - { - "state_fips": ["06", "06", "08"], - "employment_income": [10.0, 20.0, 5.0], - "weight": [1.0, 1.0, 1.0], - } - ) - - result = harness.run_experiment( - df, - "provider_people_only", - groups=["people"], - only_available=True, - period=2024, - entity=EntityType.PERSON, - verbose=False, - ) - - assert result.targets_used == ["ca_people", "ca_income"] - - def test_print_target_coverage_can_use_core_target_provider(self, capsys): - provider = StaticTargetProvider(TargetSet(_make_registry().get_all_targets())) - harness = CalibrationHarness(target_provider=provider) - df = pd.DataFrame( - { - "state_fips": ["06", "06", "08"], - "employment_income": [10.0, 20.0, 5.0], - "weight": [1.0, 1.0, 1.0], - } - ) - - harness.print_target_coverage(df, entity=EntityType.PERSON) - - assert "Available (2 targets)" in capsys.readouterr().out diff --git a/tests/test_capital_gains_lots.py b/tests/test_capital_gains_lots.py deleted file mode 100644 index 118947e1..00000000 --- a/tests/test_capital_gains_lots.py +++ /dev/null @@ -1,198 +0,0 @@ -from __future__ import annotations - -import sqlite3 - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.capital_gains_lots import ( - CAPITAL_GAINS_LOT_COLUMNS, - SyntheticCapitalGainsLotConfig, - generate_synthetic_capital_gains_lots, - read_capital_gains_lots_sqlite, - 
synthetic_capital_gains_lot_metadata, - validate_capital_gains_lot_anchors, - write_capital_gains_lots_sqlite, -) -from microplex_us.pipelines.artifacts import _maybe_write_capital_gains_lot_artifact -from microplex_us.pipelines.us import ( - USMicroplexBuildConfig, - USMicroplexBuildResult, - USMicroplexTargets, -) -from microplex_us.policyengine.us import PolicyEngineUSEntityTableBundle - - -def test_generate_synthetic_capital_gains_lots_preserves_person_anchors(): - persons = pd.DataFrame( - { - "person_id": [1, 2, 3], - "tax_unit_id": [10, 20, 20], - "household_id": [100, 200, 200], - "long_term_capital_gains_before_response": [150_000.0, -20_000.0, 0.0], - } - ) - config = SyntheticCapitalGainsLotConfig(random_seed=7, max_lots_per_person=3) - - lots = generate_synthetic_capital_gains_lots( - persons, - period=2026, - config=config, - ) - - assert list(lots.columns) == list(CAPITAL_GAINS_LOT_COLUMNS) - assert set(lots["person_id"]) == {1, 2} - assert lots.groupby("person_id")["gain_or_loss"].sum().to_dict() == pytest.approx( - {1: 150_000.0, 2: -20_000.0} - ) - np.testing.assert_allclose( - lots["sale_proceeds"] - lots["basis"], - lots["gain_or_loss"], - ) - assert set(lots["asset_type"]) == {"unknown"} - assert (lots["sale_time"] == 2026.5).all() - np.testing.assert_allclose( - lots["sale_time"] - lots["holding_period"], - lots["purchase_time"], - ) - validate_capital_gains_lot_anchors(persons, lots) - - -def test_synthetic_capital_gains_lots_are_deterministic(): - persons = pd.DataFrame( - { - "person_id": [1, 2], - "tax_unit_id": [10, 20], - "household_id": [100, 200], - "long_term_capital_gains_before_response": [250_000.0, 125_000.0], - } - ) - config = SyntheticCapitalGainsLotConfig(random_seed=99) - - first = generate_synthetic_capital_gains_lots(persons, period=2026, config=config) - second = generate_synthetic_capital_gains_lots(persons, period=2026, config=config) - - pd.testing.assert_frame_equal(first, second) - - shuffled = persons.sample(frac=1.0, 
random_state=2).reset_index(drop=True) - shuffled_lots = generate_synthetic_capital_gains_lots( - shuffled, - period=2026, - config=config, - ) - - pd.testing.assert_frame_equal(first, shuffled_lots) - - -def test_validate_capital_gains_lot_anchors_allows_float_roundoff(): - persons = pd.DataFrame( - { - "person_id": [1], - "long_term_capital_gains_before_response": [6_005.71], - } - ) - lots = pd.DataFrame( - { - "person_id": [1, 1], - "gain_or_loss": [3_000.0, 3_005.7100076293945], - } - ) - - validate_capital_gains_lot_anchors(persons, lots) - - -def test_validate_capital_gains_lot_anchors_rejects_material_mismatch(): - persons = pd.DataFrame( - { - "person_id": [1], - "long_term_capital_gains_before_response": [6_005.71], - } - ) - lots = pd.DataFrame({"person_id": [1], "gain_or_loss": [6_005.72]}) - - with pytest.raises(ValueError, match="do not reconcile"): - validate_capital_gains_lot_anchors(persons, lots) - - -def test_write_and_read_capital_gains_lots_sqlite(tmp_path): - persons = pd.DataFrame( - { - "person_id": [1], - "tax_unit_id": [10], - "household_id": [100], - "long_term_capital_gains_before_response": [15_000.0], - } - ) - config = SyntheticCapitalGainsLotConfig(random_seed=1) - lots = generate_synthetic_capital_gains_lots(persons, period=2026, config=config) - db_path = tmp_path / "capital_gains_lots.db" - - write_capital_gains_lots_sqlite( - lots, - db_path, - metadata=synthetic_capital_gains_lot_metadata(config, period=2026), - ) - restored = read_capital_gains_lots_sqlite(db_path) - - assert restored["gain_or_loss"].sum() == pytest.approx(15_000.0) - with sqlite3.connect(db_path) as conn: - metadata = dict( - conn.execute("SELECT key, value FROM capital_gains_lot_metadata") - ) - index_rows = conn.execute( - """ - SELECT name - FROM sqlite_master - WHERE type = 'index' - ORDER BY name - """ - ).fetchall() - assert "config" in metadata - assert "limitations" in metadata - assert ("idx_capital_gains_lots_person_period",) in index_rows - assert 
("idx_capital_gains_lots_tax_unit_period",) in index_rows - - -def test_capital_gains_lot_artifact_sidecar_is_config_gated(tmp_path): - persons = pd.DataFrame( - { - "person_id": [1, 2], - "tax_unit_id": [10, 20], - "household_id": [100, 200], - "long_term_capital_gains_before_response": [25_000.0, 0.0], - } - ) - result = USMicroplexBuildResult( - config=USMicroplexBuildConfig( - capital_gains_lots_enabled=True, - capital_gains_lots_max_lots_per_person=2, - policyengine_dataset_year=2026, - ), - seed_data=pd.DataFrame(), - synthetic_data=pd.DataFrame(), - calibrated_data=pd.DataFrame(), - targets=USMicroplexTargets(marginal={}, continuous={}), - calibration_summary={}, - policyengine_tables=PolicyEngineUSEntityTableBundle( - households=pd.DataFrame({"household_id": [100, 200]}), - persons=persons, - ), - ) - - path, summary = _maybe_write_capital_gains_lot_artifact(result, tmp_path) - - assert path == tmp_path / "capital_gains_lots.sqlite" - assert path.exists() - assert summary == { - "enabled": True, - "written": True, - "path": "capital_gains_lots.sqlite", - "person_rows": 2, - "nonzero_person_rows": 1, - "lot_rows": 2, - "source_gain_column": "long_term_capital_gains_before_response", - "max_lots_per_person": 2, - } - restored = read_capital_gains_lots_sqlite(path) - validate_capital_gains_lot_anchors(persons, restored) diff --git a/tests/test_cps_source_provider.py b/tests/test_cps_source_provider.py deleted file mode 100644 index 062fa11d..00000000 --- a/tests/test_cps_source_provider.py +++ /dev/null @@ -1,870 +0,0 @@ -"""Tests for CPS source-provider implementations.""" - -import zipfile - -import pandas as pd -import polars as pl -from microplex.core import EntityType, SourceArchetype, SourceProvider, SourceQuery - -from microplex_us.data_sources import CPSASECParquetSourceProvider -from microplex_us.data_sources.cps import ( - CPS_ASEC_PROCESSED_CACHE_VERSION, - PERSON_CACHE_REQUIRED_COLUMNS, - CPSASECSourceProvider, - _attach_cps_ssn_card_type, - 
_cps_age_band_key, - _sample_households_and_persons, - get_available_years, - load_cps_asec, - processed_cps_asec_cache_path, -) - - -def test_cps_asec_available_years_include_latest_survey(): - assert max(get_available_years()) == 2025 - - -def test_cps_parquet_source_provider_loads_observation_frame(tmp_path): - households = pd.DataFrame( - { - "household_id": [1, 2], - "state_fips": [6, 36], - "household_weight": [1.0, 2.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1, 1, 2], - "person_number": [1, 2, 1], - "age": [34, 12, 52], - "weight": [1.0, 1.0, 2.0], - } - ) - households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - - provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame(SourceQuery(period=2024)) - - assert isinstance(provider, SourceProvider) - assert set(frame.tables) == {EntityType.HOUSEHOLD, EntityType.PERSON} - assert frame.tables[EntityType.PERSON]["person_id"].tolist() == [ - "1:1", - "1:2", - "2:1", - ] - assert frame.tables[EntityType.HOUSEHOLD]["year"].tolist() == [2024, 2024] - assert frame.source.archetype is SourceArchetype.HOUSEHOLD_INCOME - - -def test_cps_parquet_source_provider_derives_canonical_income_alias(tmp_path): - households = pd.DataFrame( - { - "household_id": [1], - "state_fips": [6], - "household_weight": [1.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1, 1], - "person_number": [1, 2], - "age": [44, 41], - "weight": [1.0, 1.0], - "wage_income": [50_000.0, 10_000.0], - "self_employment_income": [5_000.0, 0.0], - "interest_income": [100.0, 20.0], - "dividend_income": [50.0, 10.0], - "social_security": [0.0, 3_000.0], - "pension_income": [1_000.0, 4_000.0], - "taxable_pension_income": [800.0, 3_000.0], - "ssi": [1_000.0, 2_000.0], - } - ) - households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / 
"cps_asec_persons.parquet", index=False) - - provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame(SourceQuery(period=2024)) - - assert "income" in frame.source.observations[1].variable_names - assert frame.tables[EntityType.PERSON]["income"].tolist() == [56_150.0, 17_030.0] - - -def test_cps_parquet_source_provider_derives_tax_unit_roles_from_tax_id(tmp_path): - households = pd.DataFrame( - { - "household_id": [1], - "state_fips": [6], - "household_weight": [1.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1, 1, 1, 1], - "person_number": [1, 2, 3, 4], - "spouse_person_number": [2, 1, 0, 0], - "family_relationship": [1, 2, 3, 1], - "tax_unit_id": [100, 100, 100, 101], - "age": [40, 38, 10, 22], - "weight": [1.0, 1.0, 1.0, 1.0], - } - ) - households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - - provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame(SourceQuery(period=2024)) - result = ( - frame.tables[EntityType.PERSON] - .sort_values("person_number") - .reset_index(drop=True) - ) - - assert result["tax_unit_id"].tolist() == [100, 100, 100, 101] - assert result["tax_unit_is_joint"].tolist() == [1.0, 1.0, 1.0, 0.0] - assert result["tax_unit_count_dependents"].tolist() == [1.0, 1.0, 1.0, 0.0] - assert result["is_tax_unit_head"].tolist() == [1.0, 0.0, 0.0, 1.0] - assert result["is_tax_unit_spouse"].tolist() == [0.0, 1.0, 0.0, 0.0] - assert result["is_tax_unit_dependent"].tolist() == [0.0, 0.0, 1.0, 0.0] - - -def test_attach_cps_ssn_card_type_derives_pe_style_categories(): - persons = pl.DataFrame( - { - "household_id": [1, 2, 3, 4], - "person_number": [1, 1, 1, 1], - "age": [30, 40, 28, 35], - "weight": [1.0, 1.0, 1.0, 1.0], - } - ) - households = pl.DataFrame( - { - "household_id": [1, 2, 3, 4], - "household_weight": [1.0, 1.0, 1.0, 1.0], - } - ) - persons_raw = 
pl.DataFrame( - { - "PRCITSHP": [1, 5, 5, 5], - "PEINUSYR": [0, 20, 20, 20], - "PENATVTY": [57, 303, 303, 303], - "A_HSCOL": [0, 0, 0, 0], - "A_AGE": [30, 40, 28, 35], - "A_MARITL": [0, 0, 0, 0], - "A_SPOUSE": [0, 0, 0, 0], - "MCARE": [0, 1, 0, 0], - "CAID": [0, 0, 0, 0], - "PEN_SC1": [0, 0, 0, 0], - "PEN_SC2": [0, 0, 0, 0], - "RESNSS1": [0, 0, 0, 0], - "RESNSS2": [0, 0, 0, 0], - "IHSFLG": [0, 0, 0, 0], - "CHAMPVA": [0, 0, 0, 0], - "MIL": [0, 0, 0, 0], - "PEIO1COW": [0, 0, 0, 0], - "A_MJOCC": [0, 0, 0, 0], - "SS_YN": [0, 0, 0, 0], - "SPM_ID": [11, 22, 33, 44], - "SPM_CAPHOUSESUB": [0.0, 0.0, 0.0, 0.0], - "PEAFEVER": [0, 0, 0, 0], - "SSI_YN": [0, 0, 0, 0], - "WSAL_VAL": [0.0, 0.0, 20_000.0, 0.0], - "SEMP_VAL": [0.0, 0.0, 0.0, 0.0], - } - ) - - result = _attach_cps_ssn_card_type( - persons=persons, - households=households, - persons_raw=persons_raw, - ) - - assert result["ssn_card_type"].to_list() == [ - "CITIZEN", - "OTHER_NON_CITIZEN", - "NON_CITIZEN_VALID_EAD", - "NONE", - ] - - -def test_attach_cps_ssn_card_type_falls_back_to_citizen_when_raw_fields_missing(): - persons = pl.DataFrame( - { - "household_id": [1, 2], - "person_number": [1, 1], - "age": [30, 40], - "weight": [1.0, 1.0], - } - ) - households = pl.DataFrame( - { - "household_id": [1, 2], - "household_weight": [1.0, 1.0], - } - ) - persons_raw = pl.DataFrame( - { - "PRCITSHP": [1, 5], - "PEINUSYR": [0, 20], - } - ) - - result = _attach_cps_ssn_card_type( - persons=persons, - households=households, - persons_raw=persons_raw, - ) - - assert result["ssn_card_type"].to_list() == ["CITIZEN", "CITIZEN"] - - -def test_cps_parquet_source_provider_supports_household_sampling(tmp_path): - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "household_weight": [1.0, 2.0, 3.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1, 1, 2, 3], - "person_number": [1, 2, 1, 1], - "age": [34, 12, 52, 40], - "weight": [1.0, 1.0, 2.0, 3.0], - } - ) - 
households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - - provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame( - SourceQuery( - period=2024, - provider_filters={"sample_n": 2, "random_seed": 0}, - ) - ) - - assert len(frame.tables[EntityType.HOUSEHOLD]) == 2 - assert frame.tables[EntityType.PERSON]["household_id"].nunique() == 2 - - -def test_cps_parquet_source_provider_sampling_respects_household_weights(tmp_path): - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "state_fips": [6, 36, 48], - "household_weight": [0.0, 0.0, 100.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1, 2, 3], - "person_number": [1, 1, 1], - "age": [34, 52, 40], - "weight": [0.0, 0.0, 100.0], - } - ) - households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - - provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame( - SourceQuery( - period=2024, - provider_filters={"sample_n": 1, "random_seed": 0}, - ) - ) - - assert frame.tables[EntityType.HOUSEHOLD]["household_id"].tolist() == [3] - assert frame.tables[EntityType.PERSON]["household_id"].tolist() == [3] - - -def test_cps_parquet_source_provider_applies_generic_atomic_variable_semantics( - tmp_path, -): - households = pd.DataFrame( - { - "household_id": [1], - "state_fips": [6], - "household_weight": [1.0], - } - ) - persons = pd.DataFrame( - { - "household_id": [1], - "person_number": [1], - "age": [34], - "weight": [1.0], - "qualified_dividend_income": [30.0], - "non_qualified_dividend_income": [12.0], - "dividend_income": [42.0], - "ordinary_dividend_income": [42.0], - } - ) - households.to_parquet(tmp_path / "cps_asec_households.parquet", index=False) - persons.to_parquet(tmp_path / "cps_asec_persons.parquet", index=False) - - 
provider = CPSASECParquetSourceProvider(data_dir=tmp_path, year=2024) - frame = provider.load_frame(SourceQuery(period=2024)) - descriptor = frame.source - - assert not descriptor.is_authoritative_for("dividend_income") - assert not descriptor.allows_conditioning_on("dividend_income") - assert not descriptor.is_authoritative_for("ordinary_dividend_income") - assert descriptor.is_authoritative_for("qualified_dividend_income") - assert descriptor.allows_conditioning_on("qualified_dividend_income") - - -def test_load_cps_asec_rebuilds_stale_processed_cache_without_state_fips(tmp_path): - stale_processed = pl.DataFrame( - { - "household_id": [1, 1, 2], - "person_number": [1, 2, 1], - "age": [34, 12, 52], - "weight": [1.0, 1.0, 2.0], - "year": [2023, 2023, 2023], - } - ) - stale_processed.write_parquet(tmp_path / "cps_asec_2023_processed.parquet") - - person_rows = pd.DataFrame( - { - "PH_SEQ": [1, 1, 2], - "GESTFIPS": [6, 6, 36], - "A_LINENO": [1, 2, 1], - "A_AGE": [34, 12, 52], - "A_FNLWGT": [100, 100, 200], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", person_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - - assert "state_fips" in dataset.persons.columns - assert sorted(dataset.households["state_fips"].to_list()) == [6, 36] - - -def test_load_cps_asec_caches_household_geography_on_persons(tmp_path): - person_rows = pd.DataFrame( - { - "PH_SEQ": [1, 1, 2], - "A_LINENO": [1, 2, 1], - "A_SPOUSE": [2, 1, 0], - "A_AGE": [34, 12, 52], - "A_FNLWGT": [100, 100, 200], - "TAX_ID": [100, 100, 200], - "SPM_ID": [10, 10, 20], - "A_MARITL": [1, 1, 6], - "PRDTRACE": [4, 4, 1], - "PRDTHSP": [0, 1, 0], - "PEHSPNON": [2, 1, 2], - "PEDISDRS": [0, 1, 0], - "PEDISEAR": [0, 0, 0], - "PEDISEYE": [0, 0, 0], - "PEDISOUT": [0, 0, 0], - "PEDISPHY": [0, 0, 0], - "PEDISREM": [0, 0, 0], - "RESNSS1": [0, 2, 0], - "RESNSS2": [0, 0, 0], - "SS_VAL": [0, 9000, 0], - "WICYN": [1, 
2, 0], - "NOW_MRK": [1, 0, 0], - "NOW_GRP": [0, 1, 0], - } - ) - household_rows = pd.DataFrame( - { - "H_SEQ": [1, 2], - "GESTFIPS": [6, 36], - "GTCO": [1, 61], - "HSUP_WGT": [100, 200], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", person_rows.to_csv(index=False)) - archive.writestr("hhpub23.csv", household_rows.to_csv(index=False)) - - first = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - cached_persons = pl.read_parquet( - processed_cps_asec_cache_path(year=2023, cache_dir=tmp_path) - ) - second = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - - assert "state_fips" in first.persons.columns - assert "county_fips" in first.persons.columns - assert "cps_race" in first.persons.columns - assert "is_hispanic" in first.persons.columns - assert "is_disabled" in first.persons.columns - assert "social_security_disability" in first.persons.columns - assert "social_security_retirement" in first.persons.columns - assert "social_security_survivors" in first.persons.columns - assert "social_security_dependents" in first.persons.columns - assert "receives_wic" in first.persons.columns - assert "spm_unit_pre_subsidy_childcare_expenses" in first.persons.columns - assert "has_marketplace_health_coverage" in first.persons.columns - assert "has_esi" in first.persons.columns - assert "tax_unit_id" in first.persons.columns - assert "spm_unit_id" in first.persons.columns - assert "spouse_person_number" in first.persons.columns - assert "marital_unit_id" in first.persons.columns - assert "is_surviving_spouse" in first.persons.columns - assert "is_separated" in first.persons.columns - assert cached_persons["state_fips"].to_list() == [6, 6, 36] - assert cached_persons["county_fips"].to_list() == [1, 1, 61] - assert cached_persons["cps_race"].to_list() == [4, 4, 1] - assert cached_persons["is_hispanic"].to_list() == [False, True, False] - assert cached_persons["is_disabled"].to_list() == 
[False, True, False] - assert cached_persons["social_security_disability"].to_list() == [0.0, 9000.0, 0.0] - assert cached_persons["social_security_retirement"].to_list() == [0.0, 0.0, 0.0] - assert cached_persons["social_security_survivors"].to_list() == [0.0, 0.0, 0.0] - assert cached_persons["social_security_dependents"].to_list() == [0.0, 0.0, 0.0] - assert cached_persons["receives_wic"].to_list() == [True, False, False] - assert cached_persons["spm_unit_pre_subsidy_childcare_expenses"].to_list() == [ - 0.0, - 0.0, - 0.0, - ] - assert cached_persons["has_marketplace_health_coverage"].to_list() == [ - True, - False, - False, - ] - assert cached_persons["has_esi"].to_list() == [False, True, False] - assert cached_persons["tax_unit_id"].to_list() == [100, 100, 200] - assert cached_persons["spm_unit_id"].to_list() == [10, 10, 20] - assert cached_persons["spouse_person_number"].to_list() == [2, 1, 0] - assert cached_persons["is_surviving_spouse"].to_list() == [False, False, False] - assert cached_persons["is_separated"].to_list() == [False, False, True] - assert cached_persons["marital_unit_id"].to_list() == [1, 1, 2] - assert second.source.endswith( - f"cps_asec_2023_processed_v{CPS_ASEC_PROCESSED_CACHE_VERSION}.parquet" - ) - assert sorted(second.households["state_fips"].to_list()) == [6, 36] - assert sorted(second.households["county_fips"].to_list()) == [1, 61] - - -def test_load_cps_asec_derives_policyengine_value_inputs(tmp_path): - person_rows = pd.DataFrame( - { - "PH_SEQ": [1, 1], - "A_LINENO": [1, 2], - "A_AGE": [34, 62], - "A_FNLWGT": [100, 100], - "OI_OFF": [20, 12], - "OI_VAL": [1200, 800], - "CSP_VAL": [300, -1], - "CHSP_VAL": [700, -1], - "DIS_VAL1": [500, 400], - "DIS_SC1": [2, 1], - "DIS_VAL2": [50, 25], - "DIS_SC2": [3, 2], - "RESNSS1": [2, 1], - "RESNSS2": [0, 0], - "SS_VAL": [1200, 800], - "MCARE": [1, 2], - "MCAID": [2, 1], - "WICYN": [1, 2], - "SPM_CAPHOUSESUB": [700, 0], - "SPM_ENGVAL": [90, -1], - "SPM_CAPWKCCXPNS": [1200, -1], - 
"SPM_CHILDCAREXPNS": [1500, -1], - "PHIP_VAL": [900, -1], - "POTC_VAL": [120, -1], - "PMED_VAL": [450, -1], - "PEMCPREM": [600, -1], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", person_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - persons = ( - dataset.persons.to_pandas() - .sort_values(["household_id", "person_number"]) - .reset_index(drop=True) - ) - - assert persons["alimony_income"].tolist() == [1200, 0] - assert persons["child_support_received"].tolist() == [300, 0] - assert persons["child_support_expense"].tolist() == [700, 0] - assert persons["disability_benefits"].tolist() == [550, 25] - assert persons["social_security_disability"].tolist() == [1200, 0] - assert persons["social_security_retirement"].tolist() == [0, 800] - assert persons["social_security_survivors"].tolist() == [0, 0] - assert persons["social_security_dependents"].tolist() == [0, 0] - assert persons["has_medicare"].tolist() == [True, False] - assert persons["takes_up_medicare_if_eligible"].tolist() == [True, False] - assert persons["has_medicaid"].tolist() == [False, True] - assert persons["receives_wic"].tolist() == [True, False] - assert persons["receives_housing_assistance"].tolist() == [True, False] - assert persons["takes_up_housing_assistance_if_eligible"].tolist() == [True, False] - assert persons["spm_unit_energy_subsidy"].tolist() == [90, 0] - assert persons["spm_unit_capped_housing_subsidy_reported"].tolist() == [700, 0] - assert persons["spm_unit_capped_work_childcare_expenses"].tolist() == [1200, 0] - assert persons["spm_unit_pre_subsidy_childcare_expenses"].tolist() == [1500, 0] - assert persons["health_insurance_premiums_without_medicare_part_b"].tolist() == [ - 900, - 0, - ] - assert persons["over_the_counter_health_expenses"].tolist() == [120, 0] - assert persons["other_medical_expenses"].tolist() == [450, 0] - assert 
persons["medicare_part_b_premiums"].tolist() == [600, 0] - - -def test_load_cps_asec_falls_back_last_year_income_to_current_earnings(tmp_path): - # The prior-year-earnings lookback (EITC/CTC prior-year election) expired, - # so last-year income is a placeholder set to current-year earnings - # (WSAL_VAL / SEMP_VAL) with no prior-ASEC dependency. - # previous_year_income_available tracks whether the row has any earnings. - current_person_rows = pd.DataFrame( - { - "PERIDNUM": ["A", "B", "C", "D"], - "PH_SEQ": [1, 1, 2, 2], - "A_LINENO": [1, 2, 1, 2], - "A_AGE": [34, 31, 45, 17], - "A_FNLWGT": [100, 100, 200, 200], - "WSAL_VAL": [60_000, 10_000, 20_000, 0], - "SEMP_VAL": [5_000, 0, -3_000, 0], - "I_ERNVAL": [0, 1, 0, 0], - "I_SEVAL": [0, 0, 0, 0], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", current_person_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - persons = ( - dataset.persons.to_pandas() - .sort_values(["household_id", "person_number"]) - .reset_index(drop=True) - ) - - assert persons["employment_income_last_year"].tolist() == [ - 60_000.0, - 10_000.0, - 20_000.0, - 0.0, - ] - assert persons["self_employment_income_last_year"].tolist() == [ - 5_000.0, - 0.0, - -3_000.0, - 0.0, - ] - assert persons["self_employment_income"].tolist() == [ - 5_000.0, - 0.0, - -3_000.0, - 0.0, - ] - assert persons["previous_year_income_available"].tolist() == [ - True, - True, - True, - False, - ] - - -def test_load_cps_asec_derives_survivor_and_dependent_social_security(tmp_path): - person_rows = pd.DataFrame( - { - "PH_SEQ": [1, 1, 1, 1], - "A_LINENO": [1, 2, 3, 4], - "A_AGE": [70, 40, 12, 10], - "A_FNLWGT": [100, 100, 100, 100], - "RESNSS1": [3, 5, 4, 6], - "RESNSS2": [0, 0, 0, 0], - "SS_VAL": [1000, 1100, 1200, 1300], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", 
person_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - persons = ( - dataset.persons.to_pandas().sort_values("person_number").reset_index(drop=True) - ) - - assert persons["social_security_survivors"].tolist() == [1000.0, 1100.0, 0.0, 0.0] - assert persons["social_security_dependents"].tolist() == [0.0, 0.0, 1200.0, 1300.0] - assert persons["social_security_retirement"].tolist() == [0.0, 0.0, 0.0, 0.0] - assert persons["social_security_disability"].tolist() == [0.0, 0.0, 0.0, 0.0] - - -def test_cps_source_provider_repeat_loads_are_deterministic_for_cached_processed_data( - tmp_path, -): - cached_persons = pl.DataFrame( - { - "household_id": [2, 1, 2, 3, 1], - "person_number": [1, 2, 2, 1, 1], - "person_id": ["2:1", "1:2", "2:2", "3:1", "1:1"], - "age": [52, 12, 49, 40, 34], - "weight": [200.0, 100.0, 200.0, 300.0, 100.0], - "state_fips": [36, 6, 36, 48, 6], - "county_fips": [61, 1, 61, 201, 1], - "cps_race": [1, 4, 1, 2, 4], - "is_hispanic": [False, True, False, False, True], - "is_disabled": [False, False, False, True, False], - "social_security_disability": [0.0] * 5, - "social_security_retirement": [0.0] * 5, - "social_security_survivors": [0.0] * 5, - "social_security_dependents": [0.0] * 5, - "has_esi": [True, False, True, False, False], - "has_marketplace_health_coverage": [False, True, False, False, True], - "receives_wic": [False] * 5, - "alimony_income": [0.0, 0.0, 0.0, 0.0, 0.0], - "child_support_received": [0.0, 0.0, 0.0, 0.0, 0.0], - "child_support_expense": [0.0, 0.0, 0.0, 0.0, 0.0], - "disability_benefits": [0.0, 0.0, 0.0, 0.0, 0.0], - "health_insurance_premiums_without_medicare_part_b": [0.0] * 5, - "other_medical_expenses": [0.0] * 5, - "over_the_counter_health_expenses": [0.0] * 5, - "medicare_part_b_premiums": [0.0] * 5, - "spm_unit_pre_subsidy_childcare_expenses": [0.0] * 5, - "year": [2023, 2023, 2023, 2023, 2023], - } - ) - for column in PERSON_CACHE_REQUIRED_COLUMNS: - if column not in 
cached_persons.columns: - cached_persons = cached_persons.with_columns(pl.lit(0).alias(column)) - cached_persons.write_parquet( - processed_cps_asec_cache_path(year=2023, cache_dir=tmp_path) - ) - - provider = CPSASECSourceProvider(year=2023, cache_dir=tmp_path, download=False) - query = SourceQuery(provider_filters={"sample_n": 2, "random_seed": 42}) - - first = provider.load_frame(query) - second = provider.load_frame(query) - - first_households = first.tables[EntityType.HOUSEHOLD] - second_households = second.tables[EntityType.HOUSEHOLD] - first_persons = first.tables[EntityType.PERSON] - second_persons = second.tables[EntityType.PERSON] - - assert ( - first_households["household_id"].tolist() - == second_households["household_id"].tolist() - ) - assert first_persons["person_id"].tolist() == second_persons["person_id"].tolist() - assert ( - first_households["household_weight"].tolist() - == second_households["household_weight"].tolist() - ) - assert first_persons["weight"].tolist() == second_persons["weight"].tolist() - - -def test_load_cps_asec_rebuilds_stale_processed_cache_without_pe_presim_inputs( - tmp_path, -): - stale_processed = pl.DataFrame( - { - "household_id": [1, 1, 2], - "person_number": [1, 2, 1], - "age": [34, 12, 52], - "weight": [1.0, 1.0, 2.0], - "state_fips": [6, 6, 36], - "year": [2023, 2023, 2023], - } - ) - stale_processed.write_parquet(tmp_path / "cps_asec_2023_processed.parquet") - - person_rows = pd.DataFrame( - { - "PH_SEQ": [1, 1, 2], - "GESTFIPS": [6, 6, 36], - "A_LINENO": [1, 2, 1], - "A_AGE": [34, 12, 52], - "A_FNLWGT": [100, 100, 200], - "PRDTRACE": [4, 4, 1], - "PRDTHSP": [0, 1, 0], - "PEHSPNON": [2, 1, 2], - "PEDISDRS": [0, 1, 0], - "PEDISEAR": [0, 0, 0], - "PEDISEYE": [0, 0, 0], - "PEDISOUT": [0, 0, 0], - "PEDISPHY": [0, 0, 0], - "PEDISREM": [0, 0, 0], - "NOW_MRK": [1, 0, 0], - "NOW_GRP": [0, 1, 0], - "OI_OFF": [20, 0, 0], - "OI_VAL": [1200, 0, 0], - "CSP_VAL": [300, 0, 0], - "CHSP_VAL": [700, 0, 0], - "DIS_VAL1": [500, 0, 0], 
- "DIS_SC1": [2, 0, 0], - "DIS_VAL2": [50, 0, 0], - "DIS_SC2": [3, 0, 0], - "PHIP_VAL": [900, 0, 0], - "POTC_VAL": [120, 0, 0], - "PMED_VAL": [450, 0, 0], - "PEMCPREM": [600, 0, 0], - } - ) - household_rows = pd.DataFrame( - { - "H_SEQ": [1, 2], - "GESTFIPS": [6, 36], - "GTCO": [1, 61], - "HSUP_WGT": [100, 200], - } - ) - with zipfile.ZipFile(tmp_path / "cps_asec_2023.zip", "w") as archive: - archive.writestr("pppub23.csv", person_rows.to_csv(index=False)) - archive.writestr("hhpub23.csv", household_rows.to_csv(index=False)) - - dataset = load_cps_asec(year=2023, cache_dir=tmp_path, download=False) - - assert dataset.source.endswith("cps_asec_2023.zip") - assert dataset.persons["county_fips"].to_list() == [1, 1, 61] - assert dataset.persons["cps_race"].to_list() == [4, 4, 1] - assert dataset.persons["is_hispanic"].to_list() == [False, True, False] - assert dataset.persons["is_disabled"].to_list() == [False, True, False] - assert dataset.persons["has_marketplace_health_coverage"].to_list() == [ - True, - False, - False, - ] - assert dataset.persons["has_esi"].to_list() == [False, True, False] - assert dataset.persons["alimony_income"].to_list() == [1200, 0, 0] - assert dataset.persons["child_support_received"].to_list() == [300, 0, 0] - assert dataset.persons["child_support_expense"].to_list() == [700, 0, 0] - assert dataset.persons["disability_benefits"].to_list() == [550, 0, 0] - assert dataset.persons[ - "health_insurance_premiums_without_medicare_part_b" - ].to_list() == [900, 0, 0] - assert dataset.persons["other_medical_expenses"].to_list() == [450, 0, 0] - assert dataset.persons["over_the_counter_health_expenses"].to_list() == [120, 0, 0] - assert dataset.persons["medicare_part_b_premiums"].to_list() == [600, 0, 0] - - -def test_cps_sampling_falls_back_to_uniform_when_weighted_sampling_is_infeasible( - monkeypatch, -): - households = pd.DataFrame( - { - "household_id": [1, 2, 3], - "year": [2023, 2023, 2023], - "household_weight": [10.0, 20.0, 30.0], - } - ) 
- persons = pd.DataFrame( - { - "household_id": [1, 2, 3], - "person_id": ["1:1", "2:1", "3:1"], - "person_number": [1, 1, 1], - "year": [2023, 2023, 2023], - } - ) - - original_sample = pd.DataFrame.sample - - def flaky_sample(self, *args, **kwargs): - if kwargs.get("weights") is not None: - raise ValueError("Weighted sampling cannot be achieved with replace=False.") - return original_sample(self, *args, **kwargs) - - monkeypatch.setattr(pd.DataFrame, "sample", flaky_sample) - - sampled_households, sampled_persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=2, - random_seed=42, - ) - - assert len(sampled_households) == 2 - assert len(sampled_persons) == 2 - assert set(sampled_persons["household_id"]) == set( - sampled_households["household_id"] - ) - - -def test_sample_households_and_persons_state_floor_preserves_state_coverage() -> None: - households = pd.DataFrame( - { - "household_id": [1, 2, 3, 4, 5, 6], - "state_fips": [6, 6, 36, 36, 48, 48], - "household_weight": [10.0, 9.0, 8.0, 7.0, 6.0, 5.0], - "year": [2024] * 6, - } - ) - persons = pd.DataFrame( - { - "person_id": [11, 21, 31, 41, 51, 61], - "household_id": [1, 2, 3, 4, 5, 6], - "person_number": [1, 1, 1, 1, 1, 1], - "year": [2024] * 6, - } - ) - - sampled_households, sampled_persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=3, - random_seed=7, - state_floor=1, - ) - - assert len(sampled_households) == 3 - assert sampled_households["state_fips"].nunique() == 3 - assert set(sampled_persons["household_id"]) == set( - sampled_households["household_id"] - ) - - -def test_sample_households_and_persons_state_age_floor_preserves_age_band_coverage() -> ( - None -): - households = pd.DataFrame( - { - "household_id": [1, 2, 3, 4, 5, 6], - "state_fips": [6, 6, 6, 36, 36, 36], - "household_weight": [10.0, 9.0, 8.0, 7.0, 6.0, 5.0], - "year": [2024] * 6, - } - ) - persons = pd.DataFrame( - { - "person_id": [11, 21, 31, 41, 51, 
61], - "household_id": [1, 2, 3, 4, 5, 6], - "person_number": [1, 1, 1, 1, 1, 1], - "age": [2, 7, 7, 4, 87, 87], - "year": [2024] * 6, - } - ) - - sampled_households, sampled_persons = _sample_households_and_persons( - households=households, - persons=persons, - sample_n=4, - random_seed=7, - state_age_floor=1, - ) - - observed_keys = { - (int(state), _cps_age_band_key(age)) - for state, age in persons.merge( - households[["household_id", "state_fips"]], - on="household_id", - how="left", - )[["state_fips", "age"]].itertuples(index=False, name=None) - } - sampled_keys = { - (int(state), _cps_age_band_key(age)) - for state, age in sampled_persons.merge( - sampled_households[["household_id", "state_fips"]], - on="household_id", - how="left", - )[["state_fips", "age"]].itertuples(index=False, name=None) - } - - assert len(sampled_households) == 4 - assert observed_keys.issubset(sampled_keys) - assert set(sampled_persons["household_id"]) == set( - sampled_households["household_id"] - ) diff --git a/tests/test_cps_synthetic.py b/tests/test_cps_synthetic.py deleted file mode 100644 index fbacb181..00000000 --- a/tests/test_cps_synthetic.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Focused tests for CPS summary-stat synthesis helpers.""" - -from __future__ import annotations - -import numpy as np -import pandas as pd - -from microplex_us.cps_synthetic import ( - CPSSummaryStats, - CPSSyntheticGenerator, - validate_synthetic, -) - - -def _sample_reference_data() -> pd.DataFrame: - rng = np.random.default_rng(123) - return pd.DataFrame( - { - "age": rng.integers(18, 85, size=512), - "sex": rng.integers(1, 3, size=512), - "income": np.where( - rng.random(512) < 0.2, - 0.0, - rng.lognormal(mean=10.5, sigma=0.6, size=512), - ), - "education": rng.integers(1, 5, size=512), - } - ) - - -def test_summary_stats_and_generator_round_trip() -> None: - reference = _sample_reference_data() - - stats = CPSSummaryStats.from_dataframe(reference) - synthetic = 
CPSSyntheticGenerator(stats).generate(n=256, seed=77) - - assert set(synthetic.columns) == set(reference.columns) - assert len(synthetic) == 256 - assert (synthetic["income"] >= 0).all() - - -def test_validate_synthetic_returns_aggregate_metrics() -> None: - reference = _sample_reference_data() - synthetic = reference.sample(n=256, replace=True, random_state=42).reset_index(drop=True) - - metrics = validate_synthetic(reference, synthetic) - - assert "ks_statistics" in metrics - assert "mean_ks" in metrics - assert "mean_corr_error" in metrics - assert metrics["mean_ks"] >= 0 diff --git a/tests/test_data.py b/tests/test_data.py deleted file mode 100644 index d19fb0f1..00000000 --- a/tests/test_data.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Tests for the US-specific CPS data helpers.""" - -from __future__ import annotations - -from pathlib import Path - -import pandas as pd -import pytest - -from microplex_us.data import ( - create_sample_data, - get_data_info, - load_cps_asec, - load_cps_for_synthesis, -) - - -def test_create_sample_data_returns_households_and_persons() -> None: - households, persons = create_sample_data(n_households=64, seed=123) - - assert len(households) == 64 - assert households["household_id"].is_unique - assert persons["person_id"].is_unique - assert set(["household_id", "n_persons", "hh_weight"]).issubset(households.columns) - assert set(["person_id", "household_id", "age", "income"]).issubset(persons.columns) - - -def test_load_cps_asec_reads_preprocessed_parquet(tmp_path: Path) -> None: - households, persons = create_sample_data(n_households=32, seed=7) - households.to_parquet(tmp_path / "cps_asec_households.parquet") - persons.to_parquet(tmp_path / "cps_asec_persons.parquet") - - loaded_households, loaded_persons = load_cps_asec(data_dir=tmp_path) - - pd.testing.assert_frame_equal(loaded_households, households) - pd.testing.assert_frame_equal(loaded_persons, persons) - - -def 
test_load_cps_for_synthesis_samples_households_consistently(tmp_path: Path) -> None: - households, persons = create_sample_data(n_households=200, seed=11) - households.to_parquet(tmp_path / "cps_asec_households.parquet") - persons.to_parquet(tmp_path / "cps_asec_persons.parquet") - - sampled_households, sampled_persons = load_cps_for_synthesis( - data_dir=tmp_path, - sample_fraction=0.25, - random_state=99, - ) - - assert 0 < len(sampled_households) < len(households) - assert set(sampled_persons["household_id"]) <= set(sampled_households["household_id"]) - - -def test_get_data_info_reports_missing_files(tmp_path: Path) -> None: - info = get_data_info(data_dir=tmp_path) - - assert info["households"] == {"exists": False} - assert info["persons"] == {"exists": False} - - -def test_load_cps_asec_raises_helpful_error_for_missing_files(tmp_path: Path) -> None: - with pytest.raises(FileNotFoundError, match="CPS ASEC data files not found"): - load_cps_asec(data_dir=tmp_path) diff --git a/tests/test_donor_survey_source_providers.py b/tests/test_donor_survey_source_providers.py deleted file mode 100644 index b0bfffdb..00000000 --- a/tests/test_donor_survey_source_providers.py +++ /dev/null @@ -1,720 +0,0 @@ -"""Tests for PE-style donor survey source providers.""" - -from __future__ import annotations - -import h5py -import pandas as pd -import pytest -from microplex.core import EntityType - -import microplex_us.data_sources.donor_surveys as donor_surveys -from microplex_us.data_sources.donor_surveys import ( - ACSSourceProvider, - DonorSurveyTables, - SCFSourceProvider, - SIPPSourceProvider, -) - - -def _acs_tables(**_kwargs) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - "year": [2022, 2022], - } - ) - persons = pd.DataFrame( - { - "person_id": [11, 12, 21], - "household_id": [1, 1, 2], - "age": [45, 12, 68], - "sex": [1, 2, 2], - "is_male": [1.0, 0.0, 
0.0], - "is_household_head": [1.0, 0.0, 1.0], - "tenure_type": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "rent": [1_200.0, 0.0, 950.0], - "real_estate_taxes": [3_000.0, 0.0, 0.0], - "income": [55_000.0, 0.0, 47_000.0], - "weight": [100.0, 100.0, 120.0], - "year": [2022, 2022, 2022], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _sipp_tips_tables(**_kwargs) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": ["100:1", "101:1"], - "household_weight": [80.0, 90.0], - "state_fips": [0, 0], - "tenure": [0, 0], - "year": [2023, 2023], - } - ) - persons = pd.DataFrame( - { - "person_id": ["100:1:1", "100:1:2", "101:1:1"], - "household_id": ["100:1", "100:1", "101:1"], - "age": [35, 8, 50], - "sex": [1, 2, 2], - "employment_income": [40_000.0, 0.0, 25_000.0], - "income": [40_000.0, 0.0, 25_000.0], - "tip_income": [900.0, 0.0, 250.0], - "count_under_18": [1.0, 1.0, 0.0], - "count_under_6": [0.0, 0.0, 0.0], - "weight": [80.0, 80.0, 90.0], - "year": [2023, 2023, 2023], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _sipp_assets_tables(**_kwargs) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": ["100", "101"], - "household_weight": [80.0, 90.0], - "state_fips": [0, 0], - "tenure": [0, 0], - "year": [2023, 2023], - } - ) - persons = pd.DataFrame( - { - "person_id": ["100:1", "101:1"], - "household_id": ["100", "101"], - "age": [35, 50], - "sex": [1, 2], - "is_female": [0.0, 1.0], - "is_married": [1.0, 0.0], - "employment_income": [40_000.0, 25_000.0], - "income": [40_000.0, 25_000.0], - "count_under_18": [1.0, 0.0], - "bank_account_assets": [2_500.0, 10_000.0], - "stock_assets": [0.0, 4_000.0], - "bond_assets": [0.0, 1_500.0], - "household_vehicles_owned": [2.0, 1.0], - "household_vehicles_value": 
[12_000.0, 6_000.0], - "weight": [80.0, 90.0], - "year": [2023, 2023], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - -def test_sample_households_and_persons_prefers_positive_weight_households() -> None: - households = pd.DataFrame( - { - "household_id": ["h1", "h2", "h3"], - "household_weight": [10.0, 0.0, 20.0], - } - ) - persons = pd.DataFrame( - { - "person_id": ["p1", "p2", "p3"], - "household_id": ["h1", "h2", "h3"], - } - ) - - sampled_households, sampled_persons = donor_surveys._sample_households_and_persons( - households=households, - persons=persons, - sample_n=2, - random_seed=0, - ) - - assert sampled_households["household_id"].tolist() == ["h1", "h3"] - assert sampled_persons["household_id"].tolist() == ["h1", "h3"] - - -def test_sample_households_and_persons_falls_back_when_weighted_sampling_errors( - monkeypatch, -) -> None: - households = pd.DataFrame( - { - "household_id": ["h1", "h2", "h3"], - "household_weight": [10.0, 5.0, 20.0], - } - ) - persons = pd.DataFrame( - { - "person_id": ["p1", "p2", "p3"], - "household_id": ["h1", "h2", "h3"], - } - ) - original_sample = pd.DataFrame.sample - - def _flaky_sample(self, *args, **kwargs): - if kwargs.get("weights") is not None: - raise ValueError("weighted sampling failed") - return original_sample(self, *args, **kwargs) - - monkeypatch.setattr(pd.DataFrame, "sample", _flaky_sample) - - sampled_households, sampled_persons = donor_surveys._sample_households_and_persons( - households=households, - persons=persons, - sample_n=2, - random_seed=0, - ) - - assert len(sampled_households) == 2 - assert set(sampled_persons["household_id"]) == set( - sampled_households["household_id"] - ) - - -def test_sample_households_and_persons_state_floor_preserves_states() -> None: - households = pd.DataFrame( - { - "household_id": ["h1", "h2", "h3", "h4"], - "household_weight": [10.0, 9.0, 8.0, 7.0], - "state_fips": [6, 6, 36, 48], - } - ) - persons = pd.DataFrame( - { - "person_id": ["p1", 
"p2", "p3", "p4"], - "household_id": ["h1", "h2", "h3", "h4"], - } - ) - - sampled_households, sampled_persons = donor_surveys._sample_households_and_persons( - households=households, - persons=persons, - sample_n=3, - random_seed=0, - state_floor=1, - ) - - assert len(sampled_households) == 3 - assert sampled_households["state_fips"].nunique() == 3 - assert set(sampled_persons["household_id"]) == set( - sampled_households["household_id"] - ) - - -def test_sample_households_and_persons_state_age_floor_preserves_age_band_coverage() -> ( - None -): - households = pd.DataFrame( - { - "household_id": ["h1", "h2", "h3", "h4"], - "household_weight": [10.0, 9.0, 8.0, 7.0], - "state_fips": [6, 6, 36, 36], - } - ) - persons = pd.DataFrame( - { - "person_id": ["p1", "p2", "p3", "p4"], - "household_id": ["h1", "h2", "h3", "h4"], - "age": [7, 42, 10, 67], - } - ) - - sampled_households, sampled_persons = donor_surveys._sample_households_and_persons( - households=households, - persons=persons, - sample_n=4, - random_seed=0, - state_age_floor=1, - ) - - sampled = sampled_persons.merge( - sampled_households[["household_id", "state_fips"]], - on="household_id", - how="left", - ) - sampled["age_band"] = sampled["age"].map(donor_surveys._donor_age_band_key) - - assert len(sampled_households) == 4 - assert { - (6, "5_10"), - (6, "40_45"), - (36, "10_15"), - (36, "65_70"), - }.issubset(set(zip(sampled["state_fips"], sampled["age_band"], strict=False))) - - -def _scf_tables(**_kwargs) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [10.0, 12.0], - "state_fips": [0, 0], - "tenure": [0, 0], - "year": [2022, 2022], - } - ) - persons = pd.DataFrame( - { - "person_id": [1, 2], - "household_id": [1, 2], - "age": [45, 68], - "sex": [1, 2], - "is_female": [0.0, 1.0], - "cps_race": [1, 2], - "is_married": [1.0, 0.0], - "own_children_in_household": [1.0, 0.0], - "employment_income": [75_000.0, 0.0], - "income": [75_000.0, 0.0], - 
"interest_dividend_income": [1_200.0, 400.0], - "social_security_pension_income": [0.0, 18_000.0], - "net_worth": [350_000.0, 180_000.0], - "auto_loan_balance": [8_000.0, 0.0], - "auto_loan_interest": [550.0, 0.0], - "weight": [10.0, 12.0], - "year": [2022, 2022], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - -def _write_uprating_factors( - repo_root, - rows: dict[str, tuple[float, float, float]], -) -> None: - storage_dir = repo_root / "policyengine_us_data" / "storage" - storage_dir.mkdir(parents=True) - pd.DataFrame( - [ - { - "Variable": variable, - "2022": values[0], - "2023": values[1], - "2024": values[2], - } - for variable, values in rows.items() - ] - ).to_csv(storage_dir / "uprating_factors.csv", index=False) - - -def test_acs_source_provider_builds_observation_frame_from_injected_loader() -> None: - provider = ACSSourceProvider(loader=_acs_tables) - - frame = provider.load_frame() - - assert frame.source.name == "acs_2022" - assert frame.source.observes("rent", EntityType.PERSON) - assert frame.source.allows_conditioning_on("state_fips") is True - assert list(frame.tables[EntityType.HOUSEHOLD]["household_id"]) == [1, 2] - - -def test_acs_source_provider_uses_manifest_backed_dataset_loader( - monkeypatch, -) -> None: - captured: dict[str, object] = {} - - def _fake_loader( - *, spec, year, sample_n, random_seed, **_kwargs - ) -> DonorSurveyTables: - captured["spec"] = spec - captured["year"] = year - captured["sample_n"] = sample_n - captured["random_seed"] = random_seed - return _acs_tables() - - monkeypatch.setattr( - donor_surveys, - "_run_policyengine_dataset_loader_from_spec", - _fake_loader, - ) - - provider = ACSSourceProvider() - frame = provider.load_frame() - - assert frame.source.name == "acs_2022" - assert captured["spec"].key == "acs" - assert captured["year"] == 2022 - assert captured["sample_n"] is None - assert captured["random_seed"] == 0 - - -def test_acs_source_provider_can_load_newer_storage_h5( - 
tmp_path, -) -> None: - storage_dir = tmp_path / "policyengine_us_data" / "storage" - storage_dir.mkdir(parents=True) - h5_path = storage_dir / "acs_2024.h5" - with h5py.File(h5_path, "w") as h5: - h5.create_dataset("household_id", data=[1, 2]) - h5.create_dataset("person_household_id", data=[1, 1, 2]) - h5.create_dataset("person_id", data=[11, 12, 21]) - h5.create_dataset("age", data=[45, 12, 68]) - h5.create_dataset("is_male", data=[True, False, False]) - h5.create_dataset("is_household_head", data=[True, False, True]) - h5.create_dataset("state_fips", data=[6, 36]) - h5.create_dataset( - "tenure_type", - data=[b"OWNED_WITH_MORTGAGE", b"RENTED"], - ) - h5.create_dataset("employment_income", data=[50_000.0, 0.0, 12_000.0]) - h5.create_dataset("self_employment_income", data=[5_000.0, 0.0, 0.0]) - h5.create_dataset("social_security", data=[0.0, 0.0, 20_000.0]) - h5.create_dataset( - "taxable_private_pension_income", - data=[0.0, 0.0, 15_000.0], - ) - h5.create_dataset("rent", data=[1_200.0, 0.0, 950.0]) - h5.create_dataset("real_estate_taxes", data=[3_000.0, 0.0, 0.0]) - h5.create_dataset("household_weight", data=[100.0, 120.0]) - - frame = ACSSourceProvider( - year=2024, policyengine_us_data_repo=tmp_path - ).load_frame() - - assert frame.source.name == "acs_2024" - households = frame.tables[EntityType.HOUSEHOLD] - persons = frame.tables[EntityType.PERSON] - assert households["household_weight"].tolist() == [100.0, 120.0] - assert persons["rent"].tolist() == [1_200.0, 0.0, 950.0] - assert persons["tenure"].tolist() == [1, 1, 2] - - -def test_acs_source_provider_forwards_state_age_floor_query_filter() -> None: - captured: dict[str, object] = {} - - def _loader(**kwargs) -> DonorSurveyTables: - captured.update(kwargs) - return _acs_tables() - - provider = ACSSourceProvider(loader=_loader) - provider.load_frame( - query=donor_surveys.SourceQuery( - provider_filters={ - "sample_n": 2, - "random_seed": 3, - "state_age_floor": 1, - } - ) - ) - - assert 
captured["sample_n"] == 2 - assert captured["random_seed"] == 3 - assert captured["state_age_floor"] == 1 - - -def test_acs_source_provider_deduplicates_households_from_dataset_loader( - monkeypatch, -) -> None: - def _fake_loader( - *, spec, year, sample_n, random_seed, **_kwargs - ) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": [1, 1, 2], - "household_weight": [100.0, 100.0, 120.0], - "state_fips": [6, 6, 36], - "tenure": [1, 1, 2], - "year": [2022, 2022, 2022], - } - ) - persons = pd.DataFrame( - { - "person_id": [11, 12, 21], - "household_id": [1, 1, 2], - "age": [45, 12, 68], - "sex": [1, 2, 2], - "is_male": [1.0, 0.0, 0.0], - "is_household_head": [1.0, 0.0, 1.0], - "tenure_type": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "rent": [1_200.0, 0.0, 950.0], - "real_estate_taxes": [3_000.0, 0.0, 0.0], - "income": [55_000.0, 0.0, 47_000.0], - "weight": [100.0, 100.0, 120.0], - "year": [2022, 2022, 2022], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - monkeypatch.setattr( - donor_surveys, - "_run_policyengine_dataset_loader_from_spec", - _fake_loader, - ) - - frame = ACSSourceProvider().load_frame() - - assert frame.tables[EntityType.HOUSEHOLD]["household_id"].tolist() == [1, 2] - assert frame.tables[EntityType.PERSON]["household_id"].tolist() == [1, 1, 2] - - -def test_acs_source_provider_makes_duplicate_person_ids_household_scoped( - monkeypatch, -) -> None: - def _fake_loader( - *, spec, year, sample_n, random_seed, **_kwargs - ) -> DonorSurveyTables: - households = pd.DataFrame( - { - "household_id": [1, 2], - "household_weight": [100.0, 120.0], - "state_fips": [6, 36], - "tenure": [1, 2], - "year": [2022, 2022], - } - ) - persons = pd.DataFrame( - { - "person_id": [1, 2, 1], - "household_id": [1, 1, 2], - "age": [45, 12, 68], - "sex": [1, 2, 2], 
- "is_male": [1.0, 0.0, 0.0], - "is_household_head": [1.0, 0.0, 1.0], - "tenure_type": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "rent": [1_200.0, 0.0, 950.0], - "real_estate_taxes": [3_000.0, 0.0, 0.0], - "income": [55_000.0, 0.0, 47_000.0], - "weight": [100.0, 100.0, 120.0], - "year": [2022, 2022, 2022], - } - ) - return DonorSurveyTables(households=households, persons=persons) - - monkeypatch.setattr( - donor_surveys, - "_run_policyengine_dataset_loader_from_spec", - _fake_loader, - ) - - frame = ACSSourceProvider().load_frame() - person_ids = frame.tables[EntityType.PERSON]["person_id"].tolist() - - assert person_ids == ["1:1", "1:2", "2:1"] - assert len(person_ids) == len(set(person_ids)) - - -def test_sipp_and_scf_provider_fillers_are_not_usable_as_conditions() -> None: - tips_provider = SIPPSourceProvider(block="tips", loader=_sipp_tips_tables) - assets_provider = SIPPSourceProvider(block="assets", loader=_sipp_assets_tables) - scf_provider = SCFSourceProvider(loader=_scf_tables) - - tips_frame = tips_provider.load_frame() - assets_frame = assets_provider.load_frame() - scf_frame = scf_provider.load_frame() - - assert tips_frame.source.name == "sipp_tips_2023" - assert assets_frame.source.name == "sipp_assets_2023" - assert scf_frame.source.name == "scf_2022" - assert tips_frame.source.allows_conditioning_on("state_fips") is False - assert assets_frame.source.is_authoritative_for("tenure") is False - assert scf_frame.source.allows_conditioning_on("state_fips") is False - assert scf_frame.source.observes("net_worth", EntityType.PERSON) - - -def test_sipp_provider_uprates_amounts_to_target_year(tmp_path) -> None: - _write_uprating_factors( - tmp_path, - { - "employment_income_before_lsr": (1.0, 1.0, 1.1), - "tip_income": (1.0, 1.0, 1.25), - }, - ) - - provider = SIPPSourceProvider( - block="tips", - 
loader=_sipp_tips_tables, - policyengine_us_data_repo=tmp_path, - target_year=2024, - ) - - frame = provider.load_frame() - households = frame.tables[EntityType.HOUSEHOLD] - persons = frame.tables[EntityType.PERSON] - - assert frame.source.name == "sipp_tips_2023" - assert households["year"].tolist() == [2024, 2024] - assert persons["year"].tolist() == [2024, 2024, 2024] - assert persons["employment_income"].tolist() == pytest.approx( - [44_000.0, 0.0, 27_500.0] - ) - assert persons["income"].tolist() == pytest.approx([44_000.0, 0.0, 27_500.0]) - assert persons["tip_income"].tolist() == [1_125.0, 0.0, 312.5] - assert persons["weight"].tolist() == [80.0, 80.0, 90.0] - - -def test_sipp_asset_provider_uprates_liquid_assets_to_target_year(tmp_path) -> None: - _write_uprating_factors( - tmp_path, - { - "employment_income_before_lsr": (1.0, 1.0, 1.1), - "bank_account_assets": (1.0, 1.0, 1.2), - "stock_assets": (1.0, 1.0, 1.3), - "bond_assets": (1.0, 1.0, 1.4), - }, - ) - - provider = SIPPSourceProvider( - block="assets", - loader=_sipp_assets_tables, - policyengine_us_data_repo=tmp_path, - target_year=2024, - ) - - persons = provider.load_frame().tables[EntityType.PERSON] - - assert persons["employment_income"].tolist() == pytest.approx([44_000.0, 27_500.0]) - assert persons["income"].tolist() == pytest.approx([44_000.0, 27_500.0]) - assert persons["bank_account_assets"].tolist() == [3_000.0, 12_000.0] - assert persons["stock_assets"].tolist() == [0.0, 5_200.0] - assert persons["bond_assets"].tolist() == [0.0, 2_100.0] - - -def test_scf_provider_uprates_amounts_to_target_year(tmp_path) -> None: - _write_uprating_factors( - tmp_path, - { - "employment_income_before_lsr": (1.0, 1.0, 1.1), - "taxable_interest_income": (1.0, 1.0, 2.0), - "social_security_retirement": (1.0, 1.0, 1.5), - "net_worth": (1.0, 1.0, 1.2), - "auto_loan_balance": (1.0, 1.0, 1.3), - "auto_loan_interest": (1.0, 1.0, 1.4), - }, - ) - - provider = SCFSourceProvider( - loader=_scf_tables, - 
policyengine_us_data_repo=tmp_path, - target_year=2024, - ) - - frame = provider.load_frame() - households = frame.tables[EntityType.HOUSEHOLD] - persons = frame.tables[EntityType.PERSON] - - assert frame.source.name == "scf_2022" - assert households["year"].tolist() == [2024, 2024] - assert persons["year"].tolist() == [2024, 2024] - assert persons["employment_income"].tolist() == [82_500.0, 0.0] - assert persons["income"].tolist() == [82_500.0, 0.0] - assert persons["interest_dividend_income"].tolist() == [2_400.0, 800.0] - assert persons["social_security_pension_income"].tolist() == [0.0, 27_000.0] - assert persons["net_worth"].tolist() == [420_000.0, 216_000.0] - assert persons["auto_loan_balance"].tolist() == [10_400.0, 0.0] - assert persons["auto_loan_interest"].tolist() == [770.0, 0.0] - assert persons["weight"].tolist() == [10.0, 12.0] - - -def test_scf_source_provider_uses_manifest_backed_dataset_loader( - monkeypatch, -) -> None: - captured: dict[str, object] = {} - - def _fake_loader( - *, spec, year, sample_n, random_seed, **_kwargs - ) -> DonorSurveyTables: - captured["spec"] = spec - captured["year"] = year - captured["sample_n"] = sample_n - captured["random_seed"] = random_seed - return _scf_tables() - - monkeypatch.setattr( - donor_surveys, - "_run_policyengine_dataset_loader_from_spec", - _fake_loader, - ) - - provider = SCFSourceProvider() - frame = provider.load_frame() - - assert frame.source.name == "scf_2022" - assert captured["spec"].key == "scf" - assert captured["year"] == 2022 - assert captured["sample_n"] is None - assert captured["random_seed"] == 0 - - -def test_sipp_tips_provider_uses_manifest_backed_raw_loader( - tmp_path, - monkeypatch, -) -> None: - path = tmp_path / "pu2023_slim.csv" - pd.DataFrame( - { - "SSUID": ["100", "100", "101"], - "MONTHCODE": [1, 1, 2], - "PNUM": [1, 2, 1], - "WPFINWGT": [80.0, 80.0, 90.0], - "TAGE": [35, 8, 50], - "ESEX": [1, 2, 2], - "TPTOTINC": [1000.0, 0.0, 500.0], - "TXAMT1": [10.0, 0.0, 5.0], - 
"TXAMT2": [2.0, 0.0, 0.0], - } - ).to_csv(path, index=False) - - monkeypatch.setattr( - donor_surveys, - "_download_policyengine_us_data_file", - lambda **_kwargs: path, - ) - - frame = SIPPSourceProvider(block="tips").load_frame() - persons = frame.tables[EntityType.PERSON] - - assert frame.source.name == "sipp_tips_2023" - assert persons["tip_income"].tolist() == [144.0, 0.0, 60.0] - assert persons["employment_income"].tolist() == [12000.0, 0.0, 6000.0] - assert persons["count_under_18"].tolist() == [1.0, 1.0, 0.0] - assert persons["count_under_6"].tolist() == [0.0, 0.0, 0.0] - - -def test_sipp_assets_provider_uses_manifest_backed_raw_loader( - tmp_path, - monkeypatch, -) -> None: - path = tmp_path / "pu2023.csv" - pd.DataFrame( - { - "SSUID": ["100", "100", "101"], - "PNUM": [1, 2, 1], - "MONTHCODE": [11, 12, 12], - "WPFINWGT": [80.0, 80.0, 90.0], - "TAGE": [10, 35, 50], - "ESEX": [1, 2, 2], - "EMS": [2, 1, 0], - "TPTOTINC": [100.0, 200.0, 300.0], - "TVAL_BANK": [1.0, 2.0, 3.0], - "TVAL_STMF": [4.0, 5.0, 6.0], - "TVAL_BOND": [7.0, 8.0, 9.0], - "TVEH_NUM": [0.0, 2.0, 1.0], - "THVAL_VEH": [0.0, 12_000.0, 6_000.0], - } - ).to_csv(path, index=False, sep="|") - - monkeypatch.setattr( - donor_surveys, - "_download_policyengine_us_data_file", - lambda **_kwargs: path, - ) - - frame = SIPPSourceProvider(block="assets").load_frame() - persons = frame.tables[EntityType.PERSON] - - assert frame.source.name == "sipp_assets_2023" - assert persons["person_id"].tolist() == ["100:2", "101:1"] - assert persons["employment_income"].tolist() == [2400.0, 3600.0] - assert persons["is_female"].tolist() == [1.0, 1.0] - assert persons["is_married"].tolist() == [1.0, 0.0] - assert persons["count_under_18"].tolist() == [0.0, 0.0] - assert persons["household_vehicles_owned"].tolist() == [2.0, 1.0] - assert persons["household_vehicles_value"].tolist() == [12_000.0, 6_000.0] diff --git a/tests/test_family_imputation_benchmark.py b/tests/test_family_imputation_benchmark.py deleted file mode 
100644 index 73a0636a..00000000 --- a/tests/test_family_imputation_benchmark.py +++ /dev/null @@ -1,393 +0,0 @@ -from __future__ import annotations - -import importlib.util - -import pandas as pd -import pytest - -from microplex_us.data_sources.family_imputation_benchmark import ( - DecomposableFamilyBenchmarkSpec, - _augment_sparse_shares_with_support_prior, - _mask_share_predictions_to_binary_support, - _mask_share_predictions_to_supported_components, - _sparsify_normalized_share_predictions, - benchmark_decomposable_family_imputers, - reconcile_component_predictions_to_total, -) - - -def _toy_family_frame() -> pd.DataFrame: - rows: list[dict[str, float | str]] = [] - for _ in range(20): - rows.append( - { - "age_bucket": "child", - "age": 12.0, - "is_male": 0.0, - "weight": 1.0, - "social_security": 100.0, - "social_security_retirement": 0.0, - "social_security_disability": 0.0, - "social_security_survivors": 0.0, - "social_security_dependents": 100.0, - } - ) - rows.append( - { - "age_bucket": "working", - "age": 45.0, - "is_male": 1.0, - "weight": 1.0, - "social_security": 100.0, - "social_security_retirement": 0.0, - "social_security_disability": 100.0, - "social_security_survivors": 0.0, - "social_security_dependents": 0.0, - } - ) - rows.append( - { - "age_bucket": "senior", - "age": 72.0, - "is_male": 0.0, - "weight": 1.0, - "social_security": 100.0, - "social_security_retirement": 100.0, - "social_security_disability": 0.0, - "social_security_survivors": 0.0, - "social_security_dependents": 0.0, - } - ) - return pd.DataFrame(rows) - - -def test_reconcile_component_predictions_to_total_respects_total_and_fallback(): - predicted = pd.DataFrame( - { - "ret": [1.0, 0.0], - "dis": [3.0, 0.0], - "surv": [0.0, 0.0], - "dep": [0.0, 0.0], - } - ) - total = pd.Series([20.0, 10.0], dtype=float) - reconciled = reconcile_component_predictions_to_total( - predicted, - family_total=total, - component_columns=("ret", "dis", "surv", "dep"), - fallback_shares={"ret": 0.25, 
"dis": 0.25, "surv": 0.25, "dep": 0.25}, - ) - - assert reconciled.sum(axis=1).tolist() == [20.0, 10.0] - assert reconciled.iloc[0]["ret"] == 5.0 - assert reconciled.iloc[0]["dis"] == 15.0 - assert reconciled.iloc[1]["dep"] == 2.5 - - -def test_support_gated_mask_keeps_top_supported_components(): - predicted_shares = pd.DataFrame( - { - "ret": [0.40, 0.40], - "dis": [0.35, 0.35], - "surv": [0.25, 0.25], - } - ) - support_probabilities = pd.DataFrame( - { - "ret": [0.90, 0.80], - "dis": [0.10, 0.70], - "surv": [0.20, 0.10], - } - ) - masked = _mask_share_predictions_to_supported_components( - predicted_shares, - support_probabilities, - predicted_active_counts=[1, 1], - component_columns=("ret", "dis", "surv"), - support_gate_probability_threshold=0.5, - ) - - assert masked.iloc[0].to_dict() == {"ret": 0.40, "dis": 0.0, "surv": 0.0} - assert masked.iloc[1].to_dict() == {"ret": 0.40, "dis": 0.35, "surv": 0.0} - - -def test_binary_support_mask_keeps_qrf_selected_components(): - predicted_shares = pd.DataFrame( - { - "ret": [0.40, 0.40], - "dis": [0.35, 0.35], - "surv": [0.25, 0.25], - } - ) - support_mask = pd.DataFrame( - { - "ret": [1.0, 0.0], - "dis": [0.0, 1.0], - "surv": [1.0, 0.0], - } - ) - masked = _mask_share_predictions_to_binary_support( - predicted_shares, - support_mask, - component_columns=("ret", "dis", "surv"), - ) - - assert masked.iloc[0].to_dict() == {"ret": 0.40, "dis": 0.0, "surv": 0.25} - assert masked.iloc[1].to_dict() == {"ret": 0.0, "dis": 0.35, "surv": 0.0} - - -def test_sparse_support_augmentation_adds_only_limited_supported_components(): - sparse_shares = pd.DataFrame( - { - "ret": [0.90, 0.80], - "dis": [0.10, 0.20], - "surv": [0.00, 0.00], - "dep": [0.00, 0.00], - } - ) - base_scores = pd.DataFrame( - { - "ret": [0.90, 0.80], - "dis": [0.10, 0.20], - "surv": [0.04, 0.03], - "dep": [0.02, 0.01], - } - ) - support_mask = pd.DataFrame( - { - "ret": [1.0, 1.0], - "dis": [0.0, 1.0], - "surv": [1.0, 1.0], - "dep": [1.0, 0.0], - } - ) - - 
augmented = _augment_sparse_shares_with_support_prior( - sparse_shares, - base_scores, - support_mask, - component_columns=("ret", "dis", "surv", "dep"), - max_extra_components=1, - ) - - assert augmented.iloc[0].to_dict() == { - "ret": 0.90, - "dis": 0.10, - "surv": 0.04, - "dep": 0.0, - } - assert augmented.iloc[1].to_dict() == { - "ret": 0.80, - "dis": 0.20, - "surv": 0.03, - "dep": 0.0, - } - - -def test_sparsify_normalized_shares_drops_tiny_components(): - normalized = pd.DataFrame( - { - "ret": [0.90, 0.04], - "dis": [0.07, 0.03], - "surv": [0.03, 0.93], - } - ) - sparsified = _sparsify_normalized_share_predictions( - normalized, - component_columns=("ret", "dis", "surv"), - min_component_share=0.05, - ) - - assert sparsified.iloc[0]["ret"] == pytest.approx(0.9278350515463918) - assert sparsified.iloc[0]["dis"] == pytest.approx(0.07216494845360825) - assert sparsified.iloc[0]["surv"] == pytest.approx(0.0) - assert sparsified.iloc[1]["ret"] == pytest.approx(0.0) - assert sparsified.iloc[1]["dis"] == pytest.approx(0.0) - assert sparsified.iloc[1]["surv"] == pytest.approx(1.0) - - -def test_grouped_share_benchmark_is_exact_on_group_determined_family(): - frame = _toy_family_frame() - report = benchmark_decomposable_family_imputers( - frame, - spec=DecomposableFamilyBenchmarkSpec( - total_column="social_security", - component_columns=( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - ), - grouped_feature_sets=(("age_bucket",),), - qrf_condition_vars=("age", "is_male", "social_security"), - implicit_component_column="social_security_dependents", - group_eval_columns=("age_bucket",), - reweight_feature_sets=(("age_bucket",),), - reweight_initial_weight_mode="uniform", - qrf_n_estimators=20, - ), - train_frac=0.75, - target_frac=0.1, - random_seed=42, - repeat_count=3, - ) - - grouped = report.methods["grouped_share"] - forest = report.methods["forest_share"] - sparse_forest = 
report.methods["sparse_forest_share"] - support_gated = report.methods["support_gated_forest_share"] - qrf_masked = report.methods["qrf_support_masked_forest_share"] - qrf_augmented_sparse = report.methods["qrf_augmented_sparse_forest_share"] - assert report.train_row_count + report.eval_row_count + report.target_row_count == report.row_count - assert report.repeat_count == 3 - assert report.split_seeds == (42, 43, 44) - assert len(report.repeat_summaries) == 3 - assert grouped.component_group_sum_mare["social_security_retirement"] == 0.0 - assert grouped.component_group_sum_mare["social_security_disability"] == 0.0 - assert grouped.component_group_sum_mare["social_security_dependents"] == 0.0 - assert grouped.repeat_metric_summary is not None - assert ( - grouped.repeat_metric_summary["mean_component_total_relative_error"]["median"] - == grouped.mean_component_total_relative_error - ) - assert grouped.pre_target_mean_component_total_relative_error is not None - assert grouped.pre_target_mean_component_total_relative_error > 0.0 - assert grouped.post_reweight_mean_component_total_relative_error == pytest.approx(0.0) - assert grouped.post_reweight_mean_component_group_sum_mare == pytest.approx(0.0) - assert grouped.post_reweight_mean_component_total_error_lift < 0.0 - assert grouped.oracle_pre_target_mean_component_total_relative_error is not None - assert grouped.oracle_post_reweight_mean_component_total_relative_error == pytest.approx(0.0) - assert grouped.oracle_post_reweight_mean_component_total_error_lift <= 0.0 - assert grouped.post_reweight_mean_component_total_error_excess_over_oracle == pytest.approx(0.0) - assert grouped.reweighting_summary is not None - assert grouped.reweighting_summary["initial_weight_mode"] == "uniform" - assert grouped.reweighting_summary["target_row_count"] == report.target_row_count - assert grouped.reweighting_summary["eval_row_count"] == report.eval_row_count - assert grouped.reweighting_summary["mean_abs_relative_weight_change"] 
>= 0.0 - assert grouped.reweighting_summary["share_rows_changed_gt_1pct"] >= 0.0 - assert forest.component_group_sum_mare["social_security_retirement"] < 1.0 - assert forest.component_group_sum_mare["social_security_disability"] < 1.0 - assert forest.component_group_sum_mare["social_security_dependents"] < 1.0 - assert forest.pre_target_mean_component_total_relative_error is not None - assert forest.post_reweight_mean_component_total_relative_error is not None - assert forest.oracle_pre_target_mean_component_total_relative_error is not None - assert forest.oracle_post_reweight_mean_component_total_relative_error is not None - assert forest.post_reweight_mean_component_total_error_lift is not None - assert forest.post_reweight_mean_component_total_error_excess_over_oracle is not None - assert sparse_forest.mean_component_total_relative_error >= 0.0 - assert sparse_forest.mean_component_support_relative_error >= 0.0 - assert sparse_forest.pre_target_mean_component_total_relative_error is not None - assert sparse_forest.post_reweight_mean_component_total_relative_error is not None - assert sparse_forest.oracle_pre_target_mean_component_total_relative_error is not None - assert sparse_forest.oracle_post_reweight_mean_component_total_relative_error is not None - assert sparse_forest.post_reweight_mean_component_total_error_excess_over_oracle is not None - assert support_gated.mean_component_total_relative_error >= 0.0 - assert support_gated.mean_component_support_relative_error >= 0.0 - assert support_gated.pre_target_mean_component_total_relative_error is not None - assert support_gated.post_reweight_mean_component_total_relative_error is not None - assert support_gated.oracle_pre_target_mean_component_total_relative_error is not None - assert support_gated.oracle_post_reweight_mean_component_total_relative_error is not None - assert support_gated.post_reweight_mean_component_total_error_excess_over_oracle is not None - assert 
qrf_masked.mean_component_total_relative_error >= 0.0 - assert qrf_masked.mean_component_support_relative_error >= 0.0 - assert qrf_masked.pre_target_mean_component_total_relative_error is not None - assert qrf_masked.post_reweight_mean_component_total_relative_error is not None - assert qrf_masked.oracle_pre_target_mean_component_total_relative_error is not None - assert qrf_masked.oracle_post_reweight_mean_component_total_relative_error is not None - assert qrf_masked.post_reweight_mean_component_total_error_excess_over_oracle is not None - assert qrf_augmented_sparse.mean_component_total_relative_error >= 0.0 - assert qrf_augmented_sparse.mean_component_support_relative_error >= 0.0 - assert qrf_augmented_sparse.pre_target_mean_component_total_relative_error is not None - assert qrf_augmented_sparse.post_reweight_mean_component_total_relative_error is not None - assert qrf_augmented_sparse.oracle_pre_target_mean_component_total_relative_error is not None - assert qrf_augmented_sparse.oracle_post_reweight_mean_component_total_relative_error is not None - assert qrf_augmented_sparse.post_reweight_mean_component_total_error_excess_over_oracle is not None - - -@pytest.mark.skipif( - importlib.util.find_spec("quantile_forest") is None, - reason="quantile_forest not installed", -) -def test_qrf_benchmark_returns_expected_metric_surface(): - frame = _toy_family_frame() - report = benchmark_decomposable_family_imputers( - frame, - spec=DecomposableFamilyBenchmarkSpec( - total_column="social_security", - component_columns=( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - ), - grouped_feature_sets=(("age_bucket",),), - qrf_condition_vars=("age", "is_male", "social_security"), - implicit_component_column="social_security_dependents", - group_eval_columns=("age_bucket",), - reweight_feature_sets=(("age_bucket",),), - qrf_n_estimators=20, - ), - train_frac=0.75, - target_frac=0.1, - 
random_seed=1, - repeat_count=2, - ) - - qrf = report.methods["qrf"] - forest = report.methods["forest_share"] - sparse_forest = report.methods["sparse_forest_share"] - support_gated = report.methods["support_gated_forest_share"] - qrf_masked = report.methods["qrf_support_masked_forest_share"] - assert report.repeat_count == 2 - assert len(report.repeat_summaries) == 2 - assert set(qrf.component_total_relative_error) == { - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - } - assert qrf.mean_component_total_relative_error >= 0.0 - assert qrf.pre_target_mean_component_total_relative_error is not None - assert qrf.post_reweight_mean_component_total_relative_error is not None - assert qrf.oracle_pre_target_mean_component_total_relative_error is not None - assert qrf.oracle_post_reweight_mean_component_total_relative_error is not None - assert qrf.repeat_metric_summary is not None - assert qrf.post_reweight_mean_component_total_error_lift is not None - assert set(forest.component_total_relative_error) == { - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - } - assert forest.mean_component_total_relative_error >= 0.0 - assert forest.pre_target_mean_component_total_relative_error is not None - assert forest.post_reweight_mean_component_total_relative_error is not None - assert forest.oracle_pre_target_mean_component_total_relative_error is not None - assert forest.oracle_post_reweight_mean_component_total_relative_error is not None - assert set(sparse_forest.component_total_relative_error) == { - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - } - assert sparse_forest.mean_component_total_relative_error >= 0.0 - assert sparse_forest.post_reweight_mean_component_total_relative_error is not None - assert 
sparse_forest.oracle_post_reweight_mean_component_total_relative_error is not None - assert set(support_gated.component_total_relative_error) == { - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - } - assert support_gated.mean_component_total_relative_error >= 0.0 - assert support_gated.post_reweight_mean_component_total_relative_error is not None - assert support_gated.oracle_post_reweight_mean_component_total_relative_error is not None - assert set(qrf_masked.component_total_relative_error) == { - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - "social_security_dependents", - } - assert qrf_masked.mean_component_total_relative_error >= 0.0 - assert qrf_masked.post_reweight_mean_component_total_relative_error is not None - assert qrf_masked.oracle_post_reweight_mean_component_total_relative_error is not None diff --git a/tests/test_forbes_fixed_spine.py b/tests/test_forbes_fixed_spine.py deleted file mode 100644 index e31d2414..00000000 --- a/tests/test_forbes_fixed_spine.py +++ /dev/null @@ -1,254 +0,0 @@ -"""Tests for the Forbes fixed-spine source contract.""" - -from __future__ import annotations - -import json - -import pandas as pd -import pytest -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetFilter, - TargetSet, - TargetSpec, -) - -from microplex_us.data_sources.forbes import ( - ForbesFixedSpineConfig, - append_forbes_fixed_spine_tables, - build_forbes_fixed_spine, - fixed_spine_contribution_diagnostics_json, - read_forbes_fixed_spine_records, - residualize_targets_for_fixed_spine, -) -from microplex_us.policyengine.us import ( - PolicyEngineUSEntityTableBundle, - build_policyengine_us_export_variable_maps, -) - - -def _records() -> pd.DataFrame: - return pd.DataFrame( - [ - { - "forbes_unit_id": "forbes-1", - "name": "Example Founder", - "rank": 1, - "state_fips": 6, - "age": 71, 
- "is_female": 0, - "net_worth": 10_000_000_000.0, - "employment_income_before_lsr": 2_000_000.0, - "taxable_interest_income": 40_000_000.0, - "qualified_dividend_income": 80_000_000.0, - "long_term_capital_gains_before_response": 500_000_000.0, - "weight": 1.0, - } - ] - ) - - -def test_build_forbes_fixed_spine_splits_weights_and_keeps_metadata_separate(): - spine = build_forbes_fixed_spine( - _records(), - config=ForbesFixedSpineConfig( - snapshot_id="forbes-test-2024", - replicates_per_unit=4, - ), - ) - - assert len(spine.tables.households) == 4 - assert len(spine.tables.persons) == 4 - assert spine.tables.households["household_weight"].tolist() == pytest.approx( - [0.25, 0.25, 0.25, 0.25] - ) - assert spine.tables.households["net_worth"].tolist() == pytest.approx( - [10_000_000_000.0] * 4 - ) - assert spine.tables.households["state_fips"].tolist() == [6] * 4 - - for table in ( - spine.tables.households, - spine.tables.persons, - spine.tables.tax_units, - spine.tables.spm_units, - spine.tables.families, - spine.tables.marital_units, - ): - assert table is not None - assert not any(column.startswith("forbes_") for column in table.columns) - - assert spine.record_metadata["forbes_name"].tolist() == ["Example Founder"] * 4 - assert spine.record_metadata["replicate_index"].tolist() == [0, 1, 2, 3] - assert spine.source_metadata["snapshot_id"] == "forbes-test-2024" - assert spine.source_metadata["record_count"] == 4 - - -def test_forbes_fixed_spine_export_maps_do_not_include_source_diagnostics(): - class FakeEntity: - def __init__(self, key): - self.key = key - - class FakeVariable: - def __init__(self, entity): - self.entity = FakeEntity(entity) - - class FakeSystem: - variables = { - "state_fips": FakeVariable("household"), - "net_worth": FakeVariable("household"), - "age": FakeVariable("person"), - "taxable_interest_income": FakeVariable("person"), - "qualified_dividend_income": FakeVariable("person"), - "long_term_capital_gains_before_response": 
FakeVariable("person"), - "forbes_rank": FakeVariable("household"), - "forbes_name": FakeVariable("person"), - } - - spine = build_forbes_fixed_spine(_records()) - - export_maps = build_policyengine_us_export_variable_maps( - spine.tables, - tax_benefit_system=FakeSystem(), - ) - exported_variables = { - variable - for entity_map in export_maps.values() - for variable in entity_map.values() - } - - assert "net_worth" in exported_variables - assert "long_term_capital_gains_before_response" in exported_variables - assert not any(variable.startswith("forbes_") for variable in exported_variables) - - -def test_read_forbes_fixed_spine_records_tracks_source_checksum(tmp_path): - path = tmp_path / "forbes.jsonl" - path.write_text( - "\n".join(json.dumps(record) for record in _records().to_dict("records")) - ) - - records = read_forbes_fixed_spine_records(path) - spine = build_forbes_fixed_spine( - records, - config=ForbesFixedSpineConfig(replicates_per_unit=2), - source_path=path, - ) - - assert len(records) == 1 - assert spine.source_metadata["source_path"] == str(path) - assert isinstance(spine.source_metadata["source_sha256"], str) - assert len(spine.source_metadata["source_sha256"]) == 64 - - -def test_append_forbes_fixed_spine_tables_keeps_fixed_weights_post_calibration(): - base = PolicyEngineUSEntityTableBundle( - households=pd.DataFrame( - { - "household_id": [1], - "household_weight": [99.0], - "state_fips": [6], - } - ), - persons=pd.DataFrame( - { - "person_id": [10], - "household_id": [1], - "weight": [99.0], - } - ), - ) - spine = build_forbes_fixed_spine( - _records(), - config=ForbesFixedSpineConfig(replicates_per_unit=2), - ) - - appended = append_forbes_fixed_spine_tables(base, spine) - - assert appended.households["household_weight"].sum() == pytest.approx(100.0) - assert appended.persons["weight"].sum() == pytest.approx(100.0) - assert not any( - column.startswith("forbes_") for column in appended.households.columns - ) - - -def 
test_residualize_targets_for_fixed_spine_subtracts_additive_contributions(): - spine = build_forbes_fixed_spine( - _records(), - config=ForbesFixedSpineConfig(replicates_per_unit=5), - ) - targets = TargetSet( - [ - TargetSpec( - name="national_net_worth", - entity=EntityType.HOUSEHOLD, - value=15_000_000_000.0, - period=2024, - measure="net_worth", - aggregation=TargetAggregation.SUM, - ), - TargetSpec( - name="ca_ltcg", - entity=EntityType.PERSON, - value=800_000_000.0, - period=2024, - measure="long_term_capital_gains_before_response", - aggregation=TargetAggregation.SUM, - filters=(TargetFilter("state_fips", "==", "06"),), - ), - TargetSpec( - name="ca_top_tail_person_count", - entity=EntityType.PERSON, - value=12.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=( - TargetFilter("state_fips", "==", "06"), - TargetFilter("long_term_capital_gains_before_response", ">", 0), - ), - ), - ] - ) - - result = residualize_targets_for_fixed_spine(targets, spine.tables) - residuals = {target.name: target.value for target in result.targets.targets} - - assert residuals["national_net_worth"] == pytest.approx(5_000_000_000.0) - assert residuals["ca_ltcg"] == pytest.approx(300_000_000.0) - assert residuals["ca_top_tail_person_count"] == pytest.approx(11.0) - assert [item.status for item in result.contributions] == [ - "supported", - "supported", - "supported", - ] - assert result.targets.targets[0].metadata["fixed_spine_residualization"] == { - "original_value": 15_000_000_000.0, - "fixed_spine_contribution": 10_000_000_000.0, - "residual_value": 5_000_000_000.0, - "clamped": False, - } - - -def test_residualize_targets_for_fixed_spine_reports_unsupported_mean_targets(): - spine = build_forbes_fixed_spine(_records()) - targets = TargetSet( - [ - TargetSpec( - name="mean_net_worth", - entity=EntityType.HOUSEHOLD, - value=100.0, - period=2024, - measure="net_worth", - aggregation=TargetAggregation.MEAN, - ) - ] - ) - - result = 
residualize_targets_for_fixed_spine(targets, spine.tables) - - assert result.targets.targets[0].value == 100.0 - assert result.contributions[0].status == "unsupported" - assert "not additive" in result.contributions[0].reason - diagnostics = json.loads(fixed_spine_contribution_diagnostics_json(result)) - assert diagnostics[0]["target_name"] == "mean_net_worth" diff --git a/tests/test_geography.py b/tests/test_geography.py deleted file mode 100644 index e11dba67..00000000 --- a/tests/test_geography.py +++ /dev/null @@ -1,330 +0,0 @@ -"""US-specific block-geography tests for microplex-us.""" - -from __future__ import annotations - -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest -from microplex.geography import GeographyProvider - -from microplex_us.geography import ( - BLOCK_GEOID_LEN, - BLOCK_LEN, - COUNTY_GEOID_LEN, - COUNTY_LEN, - DEFAULT_BLOCK_PROBABILITIES_PATH, - STATE_GEOID_LEN, - STATE_LEN, - TRACT_GEOID_LEN, - TRACT_LEN, - BlockGeography, - derive_geographies, - load_block_probabilities, -) - - -def _sample_block_table() -> pd.DataFrame: - return pd.DataFrame( - { - "geoid": ["060010001001001", "060010001001002", "360610001001001"], - "state_fips": ["06", "06", "36"], - "county": ["001", "001", "061"], - "tract": ["000100", "000100", "000100"], - "tract_geoid": ["06001000100", "06001000100", "36061000100"], - "cd_id": ["CA-13", "CA-13", "NY-12"], - "prob": [0.6, 0.4, 1.0], - "national_prob": [0.3, 0.2, 0.5], - } - ) - - -def test_core_block_geography_proxy_supports_isinstance() -> None: - from microplex.geography import BlockGeography as CoreBlockGeography - - geography = BlockGeography.from_data(_sample_block_table()) - - assert isinstance(geography, CoreBlockGeography) - - -class TestGEOIDConstants: - def test_state_len(self) -> None: - assert STATE_LEN == 2 - - def test_county_len(self) -> None: - assert COUNTY_LEN == 3 - - def test_tract_len(self) -> None: - assert TRACT_LEN == 6 - - def test_block_len(self) -> None: - 
assert BLOCK_LEN == 4 - - def test_cumulative_lengths(self) -> None: - assert STATE_GEOID_LEN == 2 - assert COUNTY_GEOID_LEN == 5 - assert TRACT_GEOID_LEN == 11 - assert BLOCK_GEOID_LEN == 15 - - -class TestStaticGeographyExtraction: - SAMPLE_BLOCK = "060372073021001" - - def test_get_state_from_block_geoid(self) -> None: - assert BlockGeography.get_state(self.SAMPLE_BLOCK) == "06" - - def test_get_county_from_block_geoid(self) -> None: - assert BlockGeography.get_county(self.SAMPLE_BLOCK) == "06037" - - def test_get_tract_from_block_geoid(self) -> None: - assert BlockGeography.get_tract(self.SAMPLE_BLOCK) == "06037207302" - - def test_geoid_length_validation(self) -> None: - assert len(BlockGeography.get_state(self.SAMPLE_BLOCK)) == STATE_GEOID_LEN - assert len(BlockGeography.get_county(self.SAMPLE_BLOCK)) == COUNTY_GEOID_LEN - assert len(BlockGeography.get_tract(self.SAMPLE_BLOCK)) == TRACT_GEOID_LEN - - def test_multiple_blocks_different_states(self) -> None: - blocks = { - "010010201001000": ("01", "01001", "01001020100"), - "060372073021001": ("06", "06037", "06037207302"), - "481131234001234": ("48", "48113", "48113123400"), - } - for block, (state, county, tract) in blocks.items(): - assert BlockGeography.get_state(block) == state - assert BlockGeography.get_county(block) == county - assert BlockGeography.get_tract(block) == tract - - -class TestLoadBlockProbabilities: - @pytest.fixture - def data_path(self) -> Path: - return DEFAULT_BLOCK_PROBABILITIES_PATH - - def test_load_block_probabilities_reads_parquet(self, tmp_path: Path) -> None: - sample = _sample_block_table() - path = tmp_path / "block_probabilities.parquet" - sample.to_parquet(path) - - loaded = load_block_probabilities(path) - - pd.testing.assert_frame_equal(loaded, sample) - - def test_load_default_path(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - df = load_block_probabilities() - assert isinstance(df, pd.DataFrame) - 
assert len(df) > 0 - - def test_load_explicit_path(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - df = load_block_probabilities(data_path) - assert isinstance(df, pd.DataFrame) - - def test_required_columns_present(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - df = load_block_probabilities(data_path) - required_cols = ["geoid", "state_fips", "population", "prob"] - for col in required_cols: - assert col in df.columns, f"Missing required column: {col}" - - def test_file_not_found_raises(self) -> None: - with pytest.raises(FileNotFoundError): - load_block_probabilities("/nonexistent/path/file.parquet") - - -class TestBlockGeographyProvider: - def test_block_geography_implements_provider_interface(self) -> None: - geo = BlockGeography.from_data( - pd.DataFrame( - { - "geoid": ["060010201001000", "360590101001000"], - "state_fips": ["06", "36"], - "county": ["001", "059"], - "tract": ["020100", "010100"], - "prob": [1.0, 1.0], - } - ) - ) - - assert isinstance(geo, GeographyProvider) - crosswalk = geo.load_crosswalk() - assigner = geo.load_assigner() - result = assigner.assign(pd.DataFrame({"state_fips": ["06", "36"]}), random_state=0) - - assert crosswalk.atomic_id_column == "block_geoid" - assert result["block_geoid"].tolist() == [ - "060010201001000", - "360590101001000", - ] - - def test_block_geography_materializes_and_samples_from_in_memory_data(self) -> None: - geography = BlockGeography.from_data(_sample_block_table()) - assigned = geography.assign(pd.DataFrame({"state_fips": ["06", "36"]}), random_state=1) - materialized = geography.materialize(assigned) - - assert "block_geoid" in assigned.columns - assert set(["state_fips", "county_fips", "tract_geoid", "cd_id"]).issubset( - materialized.columns - ) - assert set(materialized["state_fips"]) == {"06", "36"} - - def 
test_derive_geographies_uses_block_string_structure(self) -> None: - result = derive_geographies(["060010001001001", "360610001001001"]) - - assert list(result["state_fips"]) == ["06", "36"] - assert list(result["county_fips"]) == ["06001", "36061"] - - -class TestBlockGeography: - @pytest.fixture - def data_path(self) -> Path: - return DEFAULT_BLOCK_PROBABILITIES_PATH - - @pytest.fixture - def geo(self, data_path: Path) -> BlockGeography: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - return BlockGeography(data_path, lazy_load=False) - - def test_lazy_load_default(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - geo = BlockGeography(data_path) - assert geo._data is None - - def test_eager_load(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - geo = BlockGeography(data_path, lazy_load=False) - assert geo._data is not None - - def test_data_property_loads(self, data_path: Path) -> None: - if not data_path.exists(): - pytest.skip("Block probabilities data not available") - geo = BlockGeography(data_path) - assert geo._data is None - _ = geo.data - assert geo._data is not None - - def test_get_cd_requires_lookup(self, geo: BlockGeography) -> None: - sample_block = geo.data["geoid"].iloc[0] - expected_cd = geo.data[geo.data["geoid"] == sample_block]["cd_id"].iloc[0] - assert geo.get_cd(sample_block) == expected_cd - - def test_get_cd_unknown_block(self, geo: BlockGeography) -> None: - assert geo.get_cd("000000000000000") is None - - def test_get_all_geographies(self, geo: BlockGeography) -> None: - sample_block = geo.data["geoid"].iloc[0] - result = geo.get_all_geographies(sample_block) - - assert isinstance(result, dict) - assert "state_fips" in result - assert "county_fips" in result - assert "tract_geoid" in result - assert "cd_id" in result - - def test_states_property(self, geo: 
BlockGeography) -> None: - states = geo.states - assert isinstance(states, list) - assert len(states) > 0 - assert states == sorted(states) - - def test_n_blocks_property(self, geo: BlockGeography) -> None: - assert isinstance(geo.n_blocks, int) - assert geo.n_blocks > 0 - assert geo.n_blocks == len(geo.data) - - def test_from_data_supports_in_memory_crosswalks(self) -> None: - data = pd.DataFrame( - { - "geoid": ["060010201001000", "060010201001001"], - "state_fips": ["06", "06"], - "county": ["001", "001"], - "tract_geoid": ["06001020100", "06001020100"], - "cd_id": ["CA-01", "CA-01"], - "prob": [0.4, 0.6], - } - ) - - geo = BlockGeography.from_data(data) - - assert geo.n_blocks == 2 - assert geo.get_cd("060010201001000") == "CA-01" - - def test_assign_and_materialize_round_trip(self) -> None: - data = pd.DataFrame( - { - "geoid": [ - "060010201001000", - "060010201001001", - "360590101001000", - ], - "state_fips": ["06", "06", "36"], - "county": ["001", "001", "059"], - "tract_geoid": ["06001020100", "06001020100", "36059010100"], - "cd_id": ["CA-01", "CA-02", "NY-01"], - "prob": [0.25, 0.75, 1.0], - } - ) - geo = BlockGeography.from_data(data) - households = pd.DataFrame({"state_fips": [6.0, 47.9]}) - - assigned = geo.assign(households, random_state=42) - materialized = geo.materialize(assigned, columns=("tract_geoid", "cd_id")) - - assert "block_geoid" in assigned.columns - assert assigned["block_geoid"].iloc[0].startswith("06") - assert assigned["block_geoid"].iloc[1].startswith("36") - assert materialized["cd_id"].tolist() == ["CA-02", "NY-01"] - assert materialized["tract_geoid"].tolist() == [ - "06001020100", - "36059010100", - ] - - -class TestBlockSampling: - @pytest.fixture - def geo(self) -> BlockGeography: - if not DEFAULT_BLOCK_PROBABILITIES_PATH.exists(): - pytest.skip("Block probabilities data not available") - return BlockGeography(DEFAULT_BLOCK_PROBABILITIES_PATH, lazy_load=False) - - def test_sample_blocks_returns_array(self, geo: 
BlockGeography) -> None: - blocks = geo.sample_blocks("06", n=10, random_state=42) - assert isinstance(blocks, np.ndarray) - assert len(blocks) == 10 - - def test_sample_blocks_from_correct_state(self, geo: BlockGeography) -> None: - blocks = geo.sample_blocks("06", n=100, random_state=42) - for block in blocks: - assert BlockGeography.get_state(block) == "06" - - def test_sample_blocks_reproducible(self, geo: BlockGeography) -> None: - blocks1 = geo.sample_blocks("06", n=10, random_state=42) - blocks2 = geo.sample_blocks("06", n=10, random_state=42) - np.testing.assert_array_equal(blocks1, blocks2) - - def test_sample_blocks_invalid_state(self, geo: BlockGeography) -> None: - with pytest.raises(ValueError, match="not found"): - geo.sample_blocks("99", n=10) - - def test_sample_blocks_national(self, geo: BlockGeography) -> None: - blocks = geo.sample_blocks_national(n=100, random_state=42) - assert len(blocks) == 100 - states = set(BlockGeography.get_state(block) for block in blocks) - assert len(states) > 1 - - def test_sample_blocks_weighted(self, geo: BlockGeography) -> None: - blocks = geo.sample_blocks("06", n=10000, random_state=42) - counts = pd.Series(blocks).value_counts(normalize=True) - state_df = geo.get_blocks_in_state("06") - expected = state_df.set_index("geoid")["prob"] / state_df["prob"].sum() - - for geoid, expected_prob in expected.items(): - assert abs(counts.get(geoid, 0.0) - expected_prob) < 0.05 diff --git a/tests/test_hierarchical.py b/tests/test_hierarchical.py deleted file mode 100644 index 3adab0ab..00000000 --- a/tests/test_hierarchical.py +++ /dev/null @@ -1,27 +0,0 @@ -"""US-specific hierarchical preprocessing helpers.""" - -from __future__ import annotations - -import pandas as pd - -from microplex_us.hierarchical import prepare_cps_for_hierarchical - - -def test_prepare_cps_for_hierarchical_builds_household_summary() -> None: - cps_data = pd.DataFrame( - { - "household_id": [1, 1, 1, 2, 2, 3], - "age": [45, 42, 12, 67, 65, 35], - 
"state_fips": [6, 6, 6, 36, 36, 48], - "tenure": [1, 1, 1, 2, 2, 1], - "hh_weight": [1000, 1000, 1000, 800, 800, 1200], - } - ) - - households, persons = prepare_cps_for_hierarchical(cps_data) - - assert len(households) == 3 - assert households.loc[households["household_id"] == 1, "n_persons"].iloc[0] == 3 - assert households.loc[households["household_id"] == 1, "n_adults"].iloc[0] == 2 - assert households.loc[households["household_id"] == 1, "n_children"].iloc[0] == 1 - assert len(persons) == 6 diff --git a/tests/test_hierarchical_block_assignment.py b/tests/test_hierarchical_block_assignment.py deleted file mode 100644 index f0270546..00000000 --- a/tests/test_hierarchical_block_assignment.py +++ /dev/null @@ -1,200 +0,0 @@ -"""US-specific block-assignment tests for the core hierarchical synthesizer.""" - -from __future__ import annotations - -import pandas as pd -import pytest -from microplex.geography import ( - AtomicGeographyCrosswalk, - GeographyAssignmentPlan, - StaticGeographyProvider, -) -from microplex.hierarchical import HierarchicalSynthesizer - -from microplex_us.geography import derive_geographies - - -class TestBlockAssignment: - """Tests for US-style block-level geographic assignment.""" - - @pytest.fixture - def sample_block_probs(self) -> pd.DataFrame: - return pd.DataFrame( - { - "geoid": [ - "060010201001000", - "060010201001001", - "060010201001002", - "360590101001000", - "360590101001001", - "480010101001000", - "480010101001001", - "480010101001002", - ], - "state_fips": ["06", "06", "06", "36", "36", "48", "48", "48"], - "county": ["001", "001", "001", "059", "059", "001", "001", "001"], - "tract": ["020100", "020100", "020100", "010100", "010100", "010100", "010100", "010100"], - "block": ["1000", "1001", "1002", "1000", "1001", "1000", "1001", "1002"], - "population": [100, 200, 100, 300, 200, 150, 250, 100], - "tract_geoid": [ - "06001020100", - "06001020100", - "06001020100", - "36059010100", - "36059010100", - "48001010100", - 
"48001010100", - "48001010100", - ], - "cd_id": ["CA-01", "CA-01", "CA-01", "NY-01", "NY-01", "TX-01", "TX-01", "TX-01"], - "prob": [0.25, 0.50, 0.25, 0.6, 0.4, 0.3, 0.5, 0.2], - } - ) - - @pytest.fixture - def sample_cd_probs(self) -> pd.DataFrame: - return pd.DataFrame( - { - "state_fips": [6, 6, 36, 36, 48, 48], - "cd_id": ["CA-01", "CA-02", "NY-01", "NY-02", "TX-01", "TX-02"], - "prob": [0.6, 0.4, 0.5, 0.5, 0.7, 0.3], - } - ) - - def test_init_with_block_probabilities(self, sample_block_probs: pd.DataFrame) -> None: - synthesizer = HierarchicalSynthesizer(block_probabilities=sample_block_probs) - - assert synthesizer.geography_assignment is not None - assert synthesizer.geography_assignment.atomic_id_column == "block_geoid" - assert synthesizer._geography_assigner is not None - - def test_init_with_cd_probabilities_backward_compat( - self, - sample_cd_probs: pd.DataFrame, - ) -> None: - synthesizer = HierarchicalSynthesizer(cd_probabilities=sample_cd_probs) - - assert synthesizer.geography_assignment is not None - assert synthesizer.geography_assignment.atomic_id_column == "cd_id" - assert synthesizer._geography_assigner is not None - - def test_cd_probabilities_allow_state_local_district_ids(self) -> None: - cd_probs = pd.DataFrame( - { - "state_fips": [6, 6, 36, 36], - "cd_id": [1, 2, 1, 2], - "prob": [0.6, 0.4, 0.5, 0.5], - } - ) - households = pd.DataFrame({"state_fips": [6, 36]}) - synthesizer = HierarchicalSynthesizer( - cd_probabilities=cd_probs, - random_state=123, - ) - - result = synthesizer._apply_geography_assignment(households) - - assert "_microplex_cd_atomic_id" not in result.columns - assert result["state_fips"].tolist() == [6, 36] - assert result["cd_id"].isin([1, 2]).all() - - def test_block_probabilities_take_precedence( - self, - sample_block_probs: pd.DataFrame, - sample_cd_probs: pd.DataFrame, - ) -> None: - synthesizer = HierarchicalSynthesizer( - cd_probabilities=sample_cd_probs, - block_probabilities=sample_block_probs, - ) - - assert 
synthesizer.geography_assignment is not None - assert synthesizer.geography_assignment.atomic_id_column == "block_geoid" - - def test_init_with_geography_provider(self, sample_block_probs: pd.DataFrame) -> None: - crosswalk = AtomicGeographyCrosswalk( - data=sample_block_probs.rename(columns={"geoid": "block_geoid"}), - atomic_id_column="block_geoid", - geography_columns=tuple( - column - for column in ("state_fips", "cd_id", "tract_geoid") - if column in sample_block_probs.columns - ), - probability_column="prob", - ) - provider = StaticGeographyProvider( - crosswalk=crosswalk, - default_partition_columns=("state_fips",), - ) - plan = GeographyAssignmentPlan( - partition_columns=("state_fips",), - atomic_id_column="block_geoid", - ) - - synthesizer = HierarchicalSynthesizer( - geography_provider=provider, - geography_assignment=plan, - ) - - assert synthesizer.geography_assignment == plan - assert synthesizer._geography_assigner is not None - - def test_assign_blocks_adds_block_geoid_only(self, sample_block_probs: pd.DataFrame) -> None: - synthesizer = HierarchicalSynthesizer( - block_probabilities=sample_block_probs, - random_state=42, - ) - households = pd.DataFrame({"state_fips": [6, 36, 48], "n_persons": [3, 2, 4]}) - - result = synthesizer._apply_geography_assignment(households) - - assert "block_geoid" in result.columns - assert "tract_geoid" not in result.columns - assert "county_fips" not in result.columns - assert "cd_id" not in result.columns - - def test_block_geoid_structure(self, sample_block_probs: pd.DataFrame) -> None: - synthesizer = HierarchicalSynthesizer( - block_probabilities=sample_block_probs, - random_state=42, - ) - households = pd.DataFrame({"state_fips": [6, 36, 48], "n_persons": [3, 2, 4]}) - - result = synthesizer._apply_geography_assignment(households) - - for _, row in result.iterrows(): - block_geoid = row["block_geoid"] - assert len(block_geoid) == 15 - assert len(block_geoid[:11]) == 11 - assert len(block_geoid[:5]) == 5 - - def 
test_derive_geographies_from_block(self, sample_block_probs: pd.DataFrame) -> None: - synthesizer = HierarchicalSynthesizer( - block_probabilities=sample_block_probs, - random_state=42, - ) - households = pd.DataFrame({"state_fips": [6, 36, 48], "n_persons": [3, 2, 4]}) - - result = synthesizer._apply_geography_assignment(households) - geographies = derive_geographies( - result["block_geoid"], - include_cd=True, - block_data=sample_block_probs, - ) - - california_mask = geographies["state_fips"] == "06" - assert all(cd.startswith("CA-") for cd in geographies.loc[california_mask, "cd_id"]) - - new_york_mask = geographies["state_fips"] == "36" - assert all(cd.startswith("NY-") for cd in geographies.loc[new_york_mask, "cd_id"]) - - def test_state_fips_fixed_to_valid(self, sample_block_probs: pd.DataFrame) -> None: - synthesizer = HierarchicalSynthesizer( - block_probabilities=sample_block_probs, - random_state=42, - ) - households = pd.DataFrame({"state_fips": [6.3, 36.7, 47.9], "n_persons": [3, 2, 4]}) - - result = synthesizer._apply_geography_assignment(households) - - assert result["state_fips"].tolist() == ["06", "36", "48"] diff --git a/tests/test_install_script.py b/tests/test_install_script.py deleted file mode 100644 index 5735059d..00000000 --- a/tests/test_install_script.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -import os -import subprocess -from pathlib import Path - -REPO_ROOT = Path(__file__).resolve().parents[1] -INSTALL_SCRIPT = REPO_ROOT / "scripts/install.sh" - - -def _run_install( - *args: str, - system: str = "Darwin", - machine: str = "arm64", -) -> subprocess.CompletedProcess[str]: - env = os.environ.copy() - env.update( - { - "CONDA_EXE": "conda", - "MICROPLEX_US_INSTALL_UNAME_S": system, - "MICROPLEX_US_INSTALL_UNAME_M": machine, - } - ) - return subprocess.run( - ["bash", str(INSTALL_SCRIPT), *args], - cwd=REPO_ROOT, - env=env, - text=True, - capture_output=True, - check=False, - ) - - -def 
test_help_lists_install_modes() -> None: - result = _run_install("--help") - - assert result.returncode == 0 - assert "--prod" in result.stdout - assert "--dev" in result.stdout - assert "--dev-intel-mac" in result.stdout - assert "--dry-run" in result.stdout - - -def test_prod_install_rejects_intel_macos() -> None: - result = _run_install("--prod", "--dry-run", machine="x86_64") - - assert result.returncode == 2 - assert "Production installs on macOS require Apple Silicon" in result.stderr - assert "./scripts/install.sh --dev-intel-mac" in result.stderr - - -def test_dev_install_rejects_intel_macos() -> None: - result = _run_install("--dev", "--dry-run", machine="x86_64") - - assert result.returncode == 2 - assert "Production installs on macOS require Apple Silicon" in result.stderr - assert "./scripts/install.sh --dev-intel-mac" in result.stderr - - -def test_prod_install_uses_python_314_on_arm_macos() -> None: - result = _run_install("--prod", "--dry-run") - - assert result.returncode == 0 - assert "uv sync --python 3.14 --extra policyengine" in result.stdout - - -def test_dev_install_uses_python_314_on_arm_macos() -> None: - result = _run_install("--dev", "--dry-run") - - assert result.returncode == 0 - assert ( - "uv sync --python 3.14 --extra dev --extra policyengine" - in result.stdout - ) - - -def test_intel_macos_dev_install_uses_conda_forge_environment() -> None: - result = _run_install("--dev-intel-mac", "--dry-run", machine="x86_64") - - assert result.returncode == 0 - assert "conda env update --file" in result.stdout - assert "envs/macos-intel-conda-forge.yml --prune" in result.stdout - assert "--solver" not in result.stdout - assert "/envs/microplex-us-intel/bin/python -m pip install" in ( - result.stdout - ) - assert "--upgrade-strategy only-if-needed -e" in result.stdout - assert "dev" in result.stdout - assert "policyengine" in result.stdout - assert "/envs/microplex-us-intel/bin/python -c" in result.stdout - assert "torch" in result.stdout - - 
-def test_intel_macos_dev_install_rejects_non_intel_platforms() -> None: - result = _run_install("--dev-intel-mac", "--dry-run") - - assert result.returncode == 2 - assert "--dev-intel-mac is only for Intel macOS" in result.stderr diff --git a/tests/test_microdata_roles.py b/tests/test_microdata_roles.py deleted file mode 100644 index 64fda780..00000000 --- a/tests/test_microdata_roles.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Tests for source-specific microdata variable roles.""" - -from microplex_us.microdata_roles import ( - MicrodataVariableRole, - PolicyEngineUSVariableRole, - blocked_policyengine_us_direct_export_variables, - is_model_input_microdata_variable, - is_policyengine_us_direct_export_blocked, - microdata_variable_role, - non_model_input_microdata_variables, - policyengine_us_variable_role, -) - - -def test_puf_tax_credit_lines_are_reported_outputs_not_model_inputs(): - for variable in ( - "foreign_tax_credit", - "savers_credit", - "state_and_local_sales_or_income_tax", - "state_income_tax_paid", - "taxable_social_security", - "taxable_unemployment_compensation", - ): - assert ( - microdata_variable_role("irs_soi_puf_2024", variable) - is MicrodataVariableRole.CALCULATED_TAX_OUTPUT - ) - assert not is_model_input_microdata_variable("irs_soi_puf_2024", variable) - assert is_model_input_microdata_variable( - "irs_soi_puf_2024", - "taxable_interest_income", - ) - - -def test_non_model_input_microdata_variables_is_source_specific(): - assert non_model_input_microdata_variables( - "irs_soi_puf_2024", - ["savers_credit", "taxable_interest_income", "taxable_social_security"], - ) == ("savers_credit", "taxable_social_security") - assert non_model_input_microdata_variables( - "cps_asec_2024", - ["savers_credit"], - ) == () - - -def test_policyengine_us_variable_roles_separate_inputs_from_outputs(): - assert ( - policyengine_us_variable_role("takes_up_snap_if_eligible") - is PolicyEngineUSVariableRole.TAKEUP_INPUT - ) - assert ( - 
policyengine_us_variable_role("takes_up_eitc") - is PolicyEngineUSVariableRole.TAKEUP_INPUT - ) - assert ( - policyengine_us_variable_role( - "would_file_if_eligible_for_refundable_credit" - ) - is PolicyEngineUSVariableRole.PRESERVED_INPUT - ) - assert ( - policyengine_us_variable_role("would_file_taxes_voluntarily") - is PolicyEngineUSVariableRole.TAKEUP_INPUT - ) - assert ( - policyengine_us_variable_role("snap") - is PolicyEngineUSVariableRole.CALCULATED_OUTPUT - ) - assert ( - policyengine_us_variable_role("state_income_tax") - is PolicyEngineUSVariableRole.CALCULATED_OUTPUT - ) - assert ( - policyengine_us_variable_role("filing_status") - is PolicyEngineUSVariableRole.CALCULATED_OUTPUT - ) - assert ( - policyengine_us_variable_role("snap_reported") - is PolicyEngineUSVariableRole.REPORTED_OUTPUT - ) - assert ( - policyengine_us_variable_role("taxable_interest_income") - is PolicyEngineUSVariableRole.PRESERVED_INPUT - ) - assert ( - policyengine_us_variable_role("non_sch_d_capital_gains") - is PolicyEngineUSVariableRole.PRESERVED_INPUT - ) - assert ( - policyengine_us_variable_role("long_term_capital_gains_before_response") - is PolicyEngineUSVariableRole.PRESERVED_INPUT - ) - assert ( - policyengine_us_variable_role("net_capital_gains") - is PolicyEngineUSVariableRole.CALCULATED_OUTPUT - ) - - -def test_policyengine_direct_export_guard_blocks_calculated_and_reported_outputs(): - blocked = blocked_policyengine_us_direct_export_variables( - [ - "takes_up_snap_if_eligible", - "would_file_taxes_voluntarily", - "net_capital_gains", - "non_sch_d_capital_gains", - "filing_status", - "rent", - "snap", - "snap_reported", - "state_income_tax", - "taxable_interest_income", - ] - ) - - assert blocked == ( - "filing_status", - "net_capital_gains", - "rent", - "snap", - "snap_reported", - "state_income_tax", - ) - assert is_policyengine_us_direct_export_blocked("filing_status") - assert is_policyengine_us_direct_export_blocked("rent") - assert 
is_policyengine_us_direct_export_blocked("snap") - assert not is_policyengine_us_direct_export_blocked("takes_up_snap_if_eligible") - assert not is_policyengine_us_direct_export_blocked( - "would_file_taxes_voluntarily" - ) - assert not is_policyengine_us_direct_export_blocked("non_sch_d_capital_gains") diff --git a/tests/test_no_policyengine_us_data_runtime_imports.py b/tests/test_no_policyengine_us_data_runtime_imports.py deleted file mode 100644 index a487f28c..00000000 --- a/tests/test_no_policyengine_us_data_runtime_imports.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Runtime dependency boundaries for the MP package.""" - -from __future__ import annotations - -import ast -from pathlib import Path - - -def test_microplex_package_has_no_policyengine_us_data_imports(): - repo_root = Path(__file__).resolve().parents[1] - package_root = repo_root / "src" / "microplex_us" - offenders: list[str] = [] - for path in sorted(package_root.rglob("*.py")): - tree = ast.parse(path.read_text()) - for node in ast.walk(tree): - if isinstance(node, ast.ImportFrom) and (node.module or "").startswith( - "policyengine_us_data" - ): - offenders.append(f"{path.relative_to(repo_root)}:{node.lineno}") - elif isinstance(node, ast.Import): - for alias in node.names: - if alias.name.startswith("policyengine_us_data"): - offenders.append(f"{path.relative_to(repo_root)}:{node.lineno}") - - assert offenders == [] diff --git a/tests/test_package_imports.py b/tests/test_package_imports.py deleted file mode 100644 index 7328cab1..00000000 --- a/tests/test_package_imports.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Package import contract tests.""" - -from __future__ import annotations - -import subprocess -import sys - - -def test_root_import_leaves_pipeline_exports_lazy() -> None: - result = subprocess.run( - [ - sys.executable, - "-c", - ("import microplex_us; print('build_us_microplex' in vars(microplex_us))"), - ], - check=True, - capture_output=True, - text=True, - ) - - assert result.stdout.strip() == 
"False" - - -def test_data_sources_import_leaves_family_benchmark_lazy() -> None: - result = subprocess.run( - [ - sys.executable, - "-c", - ( - "import sys; " - "import microplex_us.data_sources; " - "print('microplex_us.data_sources.family_imputation_benchmark' " - "in sys.modules)" - ), - ], - check=True, - capture_output=True, - text=True, - ) - - assert result.stdout.strip() == "False" diff --git a/tests/test_pe_source_impute_engine.py b/tests/test_pe_source_impute_engine.py deleted file mode 100644 index d7d767bb..00000000 --- a/tests/test_pe_source_impute_engine.py +++ /dev/null @@ -1,332 +0,0 @@ -"""Tests for the PE source-impute block engine.""" - -from __future__ import annotations - -import numpy as np -import pandas as pd -from microplex.core import EntityType - -from microplex_us.pe_source_impute_engine import ( - PE_SOURCE_IMPUTE_BLOCK_ENGINE, - PESourceImputeBlockRunRequest, - PESourceImputeConditionedBlockRunRequest, - PESourceImputePreparedBlockInputs, -) -from microplex_us.variables import DonorImputationBlockSpec - - -def test_prepare_condition_surface_and_predictors_for_acs_block() -> None: - donor_frame = pd.DataFrame( - { - "household_id": [1, 1, 2], - "age": [45, 12, 70], - "sex": [1, 2, 2], - "is_head": [1.0, 0.0, 1.0], - "tenure": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "state_fips": [6, 6, 36], - "rent": [1_200.0, 0.0, 950.0], - } - ) - current_frame = donor_frame.copy() - - surface = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_condition_surface( - donor_frame=donor_frame, - current_frame=current_frame, - donor_source_name="acs_2022", - donor_block=("rent",), - ) - - assert surface is not None - assert surface.spec.key == "acs" - assert surface.donor_frame["is_household_head"].tolist() == [1.0, 0.0, 1.0] - assert surface.donor_frame["pension_income"].tolist() == [0.0, 0.0, 
15_000.0] - assert surface.compatible_predictors( - compatibility_fn=lambda donor, current: donor.notna().all() and current.notna().all(), - ) == list(surface.spec.predictors) - - -def test_prepare_condition_surface_returns_none_for_unmapped_block() -> None: - frame = pd.DataFrame({"tip_income": [1.0], "employment_income": [10.0]}) - - surface = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_condition_surface( - donor_frame=frame, - current_frame=frame, - donor_source_name="unknown_source", - donor_block=("tip_income",), - ) - - assert surface is None - - -def test_run_prepared_block_executes_fit_generate_and_assignment() -> None: - donor_frame = pd.DataFrame( - { - "household_id": [1, 1, 2], - "age": [45, 12, 70], - "sex": [1, 2, 2], - "is_head": [1.0, 0.0, 1.0], - "tenure": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "state_fips": [6, 6, 36], - "rent": [1_200.0, 0.0, 950.0], - "hh_weight": [100.0, 100.0, 120.0], - } - ) - current_frame = donor_frame.drop(columns=["rent"]).copy() - surface = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_condition_surface( - donor_frame=donor_frame, - current_frame=current_frame, - donor_source_name="acs_2022", - donor_block=("rent",), - ) - - class _FakeImputer: - def fit(self, frame, *, weight_col, **kwargs): - self.fit_frame = frame - self.weight_col = weight_col - self.fit_kwargs = kwargs - - def generate(self, frame, *, seed): - self.generate_frame = frame - self.seed = seed - return pd.DataFrame({"rent": [400.0, 100.0, 300.0]}, index=frame.index) - - fake_imputer = _FakeImputer() - built: dict[str, object] = {} - - def _build_imputer(condition_vars, target_vars): - built["condition_vars"] = tuple(condition_vars) - built["target_vars"] = tuple(target_vars) - return fake_imputer - - def _rank_match(scores, *, donor_values, donor_weights, rng, strategy): - built["rank_scores"] = 
scores.tolist() - built["rank_donor_values"] = donor_values.tolist() - built["rank_strategy"] = strategy - return scores.astype(float) - - assert surface is not None - result = PE_SOURCE_IMPUTE_BLOCK_ENGINE.run_prepared_block( - surface=surface, - request=PESourceImputeBlockRunRequest( - donor_block_spec=DonorImputationBlockSpec( - model_variables=("rent",), - restored_variables=("rent",), - ), - donor_fit_source=donor_frame, - current_generation_source=current_frame, - current_frame=current_frame, - entity_key=None, - ), - build_imputer=_build_imputer, - rank_match=_rank_match, - compatibility_fn=lambda donor, current: donor.notna().all() and current.notna().all(), - fit_kwargs={"epochs": 5, "batch_size": 32, "learning_rate": 0.01, "verbose": False}, - seed=17, - rng=np.random.default_rng(0), - ) - - assert result is not None - assert built["target_vars"] == ("rent",) - assert built["condition_vars"] == tuple(surface.spec.predictors) - assert fake_imputer.weight_col == "weight" - assert fake_imputer.seed == 17 - assert result.updated_frame["rent"].tolist() == [400.0, 100.0, 300.0] - assert result.integrated_variables == ("rent",) - - -def test_run_conditioned_block_executes_generic_donor_path() -> None: - donor_frame = pd.DataFrame( - { - "age": [45, 12, 70], - "state_fips": [6, 6, 36], - "rent": [1_200.0, 0.0, 950.0], - "hh_weight": [100.0, 100.0, 120.0], - } - ) - current_frame = donor_frame.drop(columns=["rent"]).copy() - - class _FakeImputer: - def fit(self, frame, *, weight_col, **kwargs): - self.fit_frame = frame - self.weight_col = weight_col - self.fit_kwargs = kwargs - - def generate(self, frame, *, seed): - self.generate_frame = frame - self.seed = seed - return pd.DataFrame({"rent": [500.0, 200.0, 350.0]}, index=frame.index) - - fake_imputer = _FakeImputer() - built: dict[str, object] = {} - - def _build_imputer(condition_vars, target_vars): - built["condition_vars"] = tuple(condition_vars) - built["target_vars"] = tuple(target_vars) - return 
fake_imputer - - def _rank_match(scores, *, donor_values, donor_weights, rng, strategy): - built["rank_scores"] = scores.tolist() - built["rank_donor_values"] = donor_values.tolist() - built["rank_strategy"] = strategy - return scores.astype(float) - - result = PE_SOURCE_IMPUTE_BLOCK_ENGINE.run_conditioned_block( - request=PESourceImputeConditionedBlockRunRequest( - block_request=PESourceImputeBlockRunRequest( - donor_block_spec=DonorImputationBlockSpec( - model_variables=("rent",), - restored_variables=("rent",), - ), - donor_fit_source=donor_frame, - current_generation_source=current_frame, - current_frame=current_frame, - entity_key=None, - ), - donor_condition_source=donor_frame, - current_condition_source=current_frame, - condition_vars=("age", "state_fips"), - ), - build_imputer=_build_imputer, - rank_match=_rank_match, - fit_kwargs={"epochs": 5, "batch_size": 32, "learning_rate": 0.01, "verbose": False}, - seed=23, - rng=np.random.default_rng(0), - ) - - assert result is not None - assert built["target_vars"] == ("rent",) - assert built["condition_vars"] == ("age", "state_fips") - assert fake_imputer.weight_col == "weight" - assert fake_imputer.seed == 23 - assert result.updated_frame["rent"].tolist() == [500.0, 200.0, 350.0] - assert result.integrated_variables == ("rent",) - - -def test_prepare_block_inputs_projects_entity_and_preserves_compatible_shared_vars() -> None: - donor_seed = pd.DataFrame( - { - "person_id": [10, 11, 20], - "household_id": [1, 1, 2], - "age": [45, 12, 70], - "state_fips": [6, 6, 36], - "tenure": [1, 1, 2], - "rent": [1200.0, 0.0, 950.0], - "hh_weight": [100.0, 100.0, 120.0], - } - ) - current_frame = donor_seed.drop(columns=["rent"]).copy() - - def _can_project(current, donor, entity): - return entity is EntityType.HOUSEHOLD - - def _project(frame, *, entity, variables): - assert entity is EntityType.HOUSEHOLD - columns = [ - "household_id", - *sorted(variable for variable in variables if variable != "household_id"), - ] - 
keep_columns = list(columns) - if "person_id" in frame.columns and "person_id" not in columns: - columns.append("person_id") - projected = frame[columns].copy().sort_values( - ["household_id", "person_id"], - kind="mergesort", - ) - aggregations = { - column: ("first" if column in {"state_fips", "tenure", "hh_weight", "person_id"} else "max") - for column in projected.columns - if column != "household_id" - } - projected = projected.groupby("household_id", as_index=False).agg(aggregations) - return projected[keep_columns] - - prepared = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_block_inputs( - donor_seed=donor_seed, - current_frame=current_frame, - shared_vars=["age", "state_fips", "tenure"], - donor_block_spec=DonorImputationBlockSpec( - model_variables=("rent",), - restored_variables=("rent",), - native_entity=EntityType.HOUSEHOLD, - condition_entities=(EntityType.HOUSEHOLD,), - ), - donor_source_name=None, - prepare_pe_surface=False, - can_project_to_entity=_can_project, - project_frame_to_entity=_project, - entity_key_fn=lambda entity: "household_id" if entity is EntityType.HOUSEHOLD else None, - ) - - assert isinstance(prepared, PESourceImputePreparedBlockInputs) - assert prepared.entity_key == "household_id" - assert prepared.raw_shared_vars == ("age", "state_fips", "tenure") - assert prepared.shared_vars_after_model_exclusion == ( - "age", - "state_fips", - "tenure", - ) - assert prepared.shared_vars_for_block == ("age", "state_fips", "tenure") - assert prepared.entity_compatible_shared_vars == ("age", "state_fips", "tenure") - assert prepared.projection_applied is True - assert prepared.condition_surface is None - assert prepared.donor_fit_source["household_id"].tolist() == [1, 2] - assert prepared.current_generation_source.columns.tolist() == [ - "household_id", - "age", - "state_fips", - "tenure", - ] - - -def test_prepare_block_inputs_builds_condition_surface_when_requested() -> None: - donor_seed = pd.DataFrame( - { - "household_id": [1, 1, 2], - "age": 
[45, 12, 70], - "sex": [1, 2, 2], - "is_head": [1.0, 0.0, 1.0], - "tenure": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "state_fips": [6, 6, 36], - "rent": [1_200.0, 0.0, 950.0], - } - ) - current_frame = donor_seed.copy() - - prepared = PE_SOURCE_IMPUTE_BLOCK_ENGINE.prepare_block_inputs( - donor_seed=donor_seed, - current_frame=current_frame, - shared_vars=["age", "sex", "state_fips"], - donor_block_spec=DonorImputationBlockSpec( - model_variables=("rent",), - restored_variables=("rent",), - ), - donor_source_name="acs_2022", - prepare_pe_surface=True, - can_project_to_entity=lambda current, donor, entity: False, - project_frame_to_entity=lambda frame, *, entity, variables: frame, - entity_key_fn=lambda entity: None, - ) - - assert prepared.condition_surface is not None - assert prepared.raw_shared_vars == ("age", "sex", "state_fips") - assert prepared.shared_vars_after_model_exclusion == ( - "age", - "sex", - "state_fips", - ) - assert prepared.condition_surface.spec.key == "acs" - assert prepared.shared_vars_for_block == ("age", "sex", "state_fips") - assert prepared.entity_compatible_shared_vars == () - assert prepared.projection_applied is False diff --git a/tests/test_pe_source_impute_specs.py b/tests/test_pe_source_impute_specs.py deleted file mode 100644 index 8157ad11..00000000 --- a/tests/test_pe_source_impute_specs.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Tests for shared PE source-impute donor block specs.""" - -from __future__ import annotations - -import pandas as pd -from microplex.core import SourceArchetype - -from microplex_us.pe_source_impute_specs import ( - apply_pe_source_impute_loader_postprocess, - get_pe_source_impute_block_spec, - load_pe_source_impute_block_specs, - prepare_pe_source_impute_condition_frame, - resolve_pe_source_impute_block_key, - 
resolve_sipp_source_impute_block_spec, -) - - -def test_load_pe_source_impute_block_specs_reads_manifest() -> None: - specs = load_pe_source_impute_block_specs() - - assert set(specs) == {"acs", "sipp_tips", "sipp_assets", "scf"} - assert specs["acs"].archetype is SourceArchetype.HOUSEHOLD_INCOME - assert specs["scf"].archetype is SourceArchetype.WEALTH - assert specs["sipp_assets"].target_variables == ( - "bank_account_assets", - "stock_assets", - "bond_assets", - "household_vehicles_owned", - "household_vehicles_value", - ) - assert specs["sipp_tips"].raw_loader is not None - assert specs["sipp_tips"].raw_loader.filename == "pu2023_slim.csv" - assert specs["sipp_assets"].raw_loader is not None - assert specs["sipp_assets"].raw_loader.usecols[0] == "SSUID" - assert specs["sipp_tips"].annualized_variables == ( - "tip_income", - "employment_income", - ) - assert specs["sipp_assets"].required_monthcode == 12 - assert specs["acs"].dataset_loader is not None - assert specs["acs"].dataset_loader.module == "policyengine_us_data.datasets.acs.acs" - assert specs["scf"].dataset_loader is not None - assert specs["scf"].dataset_loader.builder_kind == "single_person_households" - - -def test_resolve_pe_source_impute_block_key_uses_source_name_and_targets() -> None: - assert ( - resolve_pe_source_impute_block_key( - donor_source_name="acs_2022", - donor_block=("rent",), - ) - == "acs" - ) - assert ( - resolve_pe_source_impute_block_key( - donor_source_name="sipp_assets_2023", - donor_block=("stock_assets", "bond_assets"), - ) - == "sipp_assets" - ) - assert ( - resolve_pe_source_impute_block_key( - donor_source_name="scf_2022", - donor_block=("tip_income",), - ) - is None - ) - assert ( - resolve_pe_source_impute_block_key( - donor_source_name="sipp_2023", - donor_block=("tip_income",), - ) - is None - ) - - -def test_resolve_sipp_source_impute_block_spec_and_named_lookup() -> None: - tips = resolve_sipp_source_impute_block_spec("tips") - scf = 
get_pe_source_impute_block_spec("scf") - - assert tips.key == "sipp_tips" - assert tips.descriptor_name == "sipp_tips" - assert scf.descriptor_name == "scf" - assert tips.matches_source_name("sipp_tips_2023") is True - assert tips.matches_source_name("sipp_2023") is False - - -def test_prepare_pe_source_impute_condition_frame_derives_manifest_backed_predictors() -> ( - None -): - spec = get_pe_source_impute_block_spec("acs") - frame = pd.DataFrame( - { - "household_id": [1, 1, 2], - "age": [45, 12, 70], - "sex": [1, 2, 2], - "is_head": [1.0, 0.0, 1.0], - "tenure": [1, 1, 2], - "employment_income": [50_000.0, 0.0, 12_000.0], - "self_employment_income": [5_000.0, 0.0, 0.0], - "gross_social_security": [0.0, 0.0, 20_000.0], - "taxable_pension_income": [0.0, 0.0, 15_000.0], - "state_fips": [6, 6, 36], - } - ) - - prepared = prepare_pe_source_impute_condition_frame(frame, spec) - - assert prepared["is_male"].tolist() == [1.0, 0.0, 0.0] - assert prepared["is_household_head"].tolist() == [1.0, 0.0, 1.0] - assert prepared["tenure_type"].tolist() == [1.0, 1.0, 2.0] - assert prepared["social_security"].tolist() == [0.0, 0.0, 20_000.0] - assert prepared["pension_income"].tolist() == [0.0, 0.0, 15_000.0] - assert prepared["household_size"].tolist() == [2.0, 2.0, 1.0] - - -def test_apply_pe_source_impute_loader_postprocess_uses_manifest_rules() -> None: - spec = get_pe_source_impute_block_spec("sipp_assets") - frame = pd.DataFrame( - { - "MONTHCODE": [11, 12, 12], - "household_id": ["100", "100", "101"], - "age": [10, 35, 5], - "employment_income": [100.0, 200.0, 300.0], - } - ) - - postprocessed = apply_pe_source_impute_loader_postprocess(frame, spec) - - assert postprocessed["household_id"].tolist() == ["100", "101"] - assert postprocessed["employment_income"].tolist() == [2_400.0, 3_600.0] - assert postprocessed["count_under_18"].tolist() == [0.0, 1.0] diff --git a/tests/test_project_metadata.py b/tests/test_project_metadata.py deleted file mode 100644 index 
ff44022c..00000000 --- a/tests/test_project_metadata.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import tomllib - -REPO_ROOT = Path(__file__).resolve().parents[1] - - -def test_project_metadata_does_not_omit_torch_on_macos_x86_64() -> None: - pyproject = tomllib.loads((REPO_ROOT / "pyproject.toml").read_text()) - lock_text = (REPO_ROOT / "uv.lock").read_text() - - uv_config = pyproject.get("tool", {}).get("uv", {}) - - assert "required-environments" not in uv_config - assert "override-dependencies" not in uv_config - assert "platform_machine != 'x86_64' or sys_platform != 'darwin'" not in ( - lock_text - ) - - -def test_intel_macos_conda_forge_environment_is_declared() -> None: - env_text = (REPO_ROOT / "envs/macos-intel-conda-forge.yml").read_text() - - assert "name: microplex-us-intel" in env_text - assert " - conda-forge" in env_text - assert " - nodefaults" in env_text - assert " - python=3.13" in env_text - assert " - pytorch=2.11.*" in env_text - assert " - pip" in env_text diff --git a/tests/test_psid_data_source.py b/tests/test_psid_data_source.py deleted file mode 100644 index 408a5421..00000000 --- a/tests/test_psid_data_source.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Tests for PSID data source integration.""" - -import numpy as np -import pandas as pd -import pytest - -from microplex_us.data_sources.psid import ( - PSID_TO_MICROPLEX_VARS, - PSIDDataset, - calibrate_divorce_rates, - calibrate_marriage_rates, - extract_transition_rates, - get_age_specific_rates, - load_psid_panel, -) - - -class TestPSIDDataset: - """Test PSIDDataset container.""" - - def test_dataset_creation(self): - """Test creating a PSIDDataset.""" - persons = pd.DataFrame({ - "person_id": [1, 1, 2, 2], - "year": [2019, 2021, 2019, 2021], - "age": [30, 32, 45, 47], - "is_male": [True, True, False, False], - "marital_status": [1, 1, 2, 1], # married, married, single, married - }) - - ds = PSIDDataset(persons=persons, source="mock") 
- - assert ds.n_persons == 2 - assert ds.n_observations == 4 - assert ds.years == [2019, 2021] - - def test_dataset_summary(self): - """Test dataset summary method.""" - persons = pd.DataFrame({ - "person_id": [1, 1, 2, 2], - "year": [2019, 2021, 2019, 2021], - "age": [30, 32, 45, 47], - }) - - ds = PSIDDataset(persons=persons, source="mock") - summary = ds.summary() - - assert summary["n_persons"] == 2 - assert summary["n_observations"] == 4 - assert summary["years"] == [2019, 2021] - - -class TestLoadPSID: - """Test PSID loading functionality.""" - - def test_load_requires_data_dir(self): - """Test that loading requires a data directory.""" - with pytest.raises((FileNotFoundError, ValueError)): - load_psid_panel(data_dir="/nonexistent/path") - - def test_variable_mapping(self): - """Test that PSID variables map to microplex conventions.""" - # These should map to standard microplex names - assert "age" in PSID_TO_MICROPLEX_VARS.values() - assert "is_male" in PSID_TO_MICROPLEX_VARS.values() - assert "total_income" in PSID_TO_MICROPLEX_VARS.values() - - -class TestTransitionRates: - """Test transition rate extraction from PSID data.""" - - @pytest.fixture - def mock_transitions_df(self): - """Create mock transition data from PSID.""" - # Simulates output from psid.get_household_transitions() - return pd.DataFrame({ - "person_id": range(100), - "year_from": [2019] * 100, - "year_to": [2021] * 100, - "type": ["marriage"] * 20 + ["divorce"] * 10 + ["same_household"] * 70, - "age_from": np.random.randint(20, 60, 100), - "marital_from": [2] * 20 + [1] * 10 + [1] * 35 + [2] * 35, # Single/married - "marital_to": [1] * 20 + [4] * 10 + [1] * 35 + [2] * 35, # Married/divorced - }) - - def test_extract_transition_rates(self, mock_transitions_df): - """Test extracting overall transition rates.""" - rates = extract_transition_rates(mock_transitions_df) - - assert "marriage" in rates - assert "divorce" in rates - assert rates["marriage"] == pytest.approx(0.20, abs=0.01) - 
assert rates["divorce"] == pytest.approx(0.10, abs=0.01) - - def test_get_age_specific_rates(self, mock_transitions_df): - """Test extracting age-specific transition rates.""" - age_rates = get_age_specific_rates( - mock_transitions_df, - transition_type="marriage", - age_bins=[(20, 29), (30, 39), (40, 49), (50, 59)], - ) - - assert isinstance(age_rates, dict) - assert (20, 29) in age_rates or len(age_rates) >= 0 # May have empty bins - - def test_rates_are_probabilities(self, mock_transitions_df): - """Test that extracted rates are valid probabilities.""" - rates = extract_transition_rates(mock_transitions_df) - - for rate in rates.values(): - assert 0.0 <= rate <= 1.0 - - -class TestCalibration: - """Test calibration of microplex models from PSID rates.""" - - @pytest.fixture - def psid_rates(self): - """Mock PSID-derived transition rates.""" - return { - "marriage": { - (18, 24): 0.05, - (25, 29): 0.08, - (30, 34): 0.06, - (35, 44): 0.04, - (45, 54): 0.02, - (55, 99): 0.01, - }, - "divorce": { - (18, 24): 0.06, - (25, 29): 0.04, - (30, 34): 0.03, - (35, 44): 0.025, - (45, 54): 0.02, - (55, 99): 0.015, - }, - } - - def test_calibrate_marriage_rates(self, psid_rates): - """Test calibrating marriage rates from PSID.""" - calibrated = calibrate_marriage_rates(psid_rates["marriage"]) - - # Should return dict compatible with MarriageTransition - assert isinstance(calibrated, dict) - for age_range, rate in calibrated.items(): - assert isinstance(age_range, tuple) - assert len(age_range) == 2 - assert 0.0 <= rate <= 1.0 - - def test_calibrate_divorce_rates(self, psid_rates): - """Test calibrating divorce rates from PSID.""" - calibrated = calibrate_divorce_rates(psid_rates["divorce"]) - - assert isinstance(calibrated, dict) - for key, rate in calibrated.items(): - assert 0.0 <= rate <= 1.0 - - def test_calibrated_model_uses_psid_rates(self, psid_rates): - """Test that calibrated model actually uses PSID rates.""" - from microplex.transitions import MarriageTransition - 
- calibrate_marriage_rates(psid_rates["marriage"]) - - # Create model with calibrated rates - model = MarriageTransition(base_rates={"male": 0.05, "female": 0.06}) - - # Model should use provided rates - assert model.base_rates is not None - - -class TestMultiSourceIntegration: - """Test PSID integration with MultiSourceFusion.""" - - @pytest.fixture - def mock_psid_data(self): - """Create mock PSID panel data.""" - np.random.seed(42) - n = 100 - return pd.DataFrame({ - "person_id": np.repeat(range(n // 2), 2), - "period": np.tile([0, 1], n // 2), - "age": np.repeat(np.random.randint(20, 60, n // 2), 2), - "total_income": np.abs(np.random.randn(n) * 50000 + 40000), - "is_male": np.repeat(np.random.choice([True, False], n // 2), 2), - }) - - def test_psid_as_fusion_source(self, mock_psid_data): - """Test adding PSID as a source to MultiSourceFusion.""" - from microplex.fusion import MultiSourceFusion - - fusion = MultiSourceFusion( - shared_vars=["age", "total_income"], - all_vars=["age", "total_income"], - n_periods=2, - ) - - # Should be able to add PSID as a source - fusion.add_source( - "psid", - mock_psid_data, - source_vars=["age", "total_income"], - n_periods=2, - person_id_col="person_id", - period_col="period", - ) - - assert "psid" in fusion.sources - assert fusion.sources["psid"].source_vars == ["age", "total_income"] - - def test_coverage_evaluation_with_psid(self, mock_psid_data): - """Test evaluating coverage on PSID holdout data.""" - from microplex.fusion import MultiSourceFusion - - # Need at least 2 sources for fusion - mock_cps_data = mock_psid_data.copy() - mock_cps_data["person_id"] = mock_cps_data["person_id"] + 1000 - - fusion = MultiSourceFusion( - shared_vars=["age", "total_income"], - all_vars=["age", "total_income"], - n_periods=2, - ) - - fusion.add_source("psid", mock_psid_data, source_vars=["age", "total_income"]) - fusion.add_source("cps", mock_cps_data, source_vars=["age", "total_income"]) - - # Should be able to add sources - assert 
len(fusion.sources) == 2 diff --git a/tests/test_psid_source_provider.py b/tests/test_psid_source_provider.py deleted file mode 100644 index 8af38479..00000000 --- a/tests/test_psid_source_provider.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Tests for PSID source-provider implementation.""" - -from __future__ import annotations - -import pandas as pd -from microplex.core import EntityType, SourceProvider, SourceQuery - -from microplex_us.data_sources import PSIDDataset, PSIDSourceProvider - - -def test_psid_source_provider_projects_single_year_frame(tmp_path): - def loader(**_: object) -> PSIDDataset: - persons = pd.DataFrame( - { - "person_id": ["a", "b", "a", "b"], - "household_id": ["h1", "h1", "h1", "h1"], - "year": [2019, 2019, 2021, 2021], - "age": [30, 28, 32, 30], - "is_male": [True, False, True, False], - "education": [4, 4, 4, 4], - "total_income": [50_000.0, 10_000.0, 55_000.0, 12_000.0], - } - ) - return PSIDDataset(persons=persons, source="mock") - - provider = PSIDSourceProvider( - data_dir=tmp_path, - survey_year=2021, - loader=loader, - ) - frame = provider.load_frame( - SourceQuery(period=2021, provider_filters={"sample_n": 1, "random_seed": 0}) - ) - - assert isinstance(provider, SourceProvider) - assert set(frame.tables) == {EntityType.HOUSEHOLD, EntityType.PERSON} - assert frame.tables[EntityType.HOUSEHOLD]["year"].tolist() == [2021] - assert frame.tables[EntityType.PERSON]["year"].nunique() == 1 - assert frame.tables[EntityType.PERSON]["person_id"].str.startswith("2021:").all() - assert "income" in frame.tables[EntityType.PERSON].columns - assert provider.descriptor.name.startswith("psid_") diff --git a/tests/test_puf_source_provider.py b/tests/test_puf_source_provider.py deleted file mode 100644 index 70eaed0f..00000000 --- a/tests/test_puf_source_provider.py +++ /dev/null @@ -1,1501 +0,0 @@ -"""Tests for PUF source-provider implementation.""" - -from __future__ import annotations - -import sys -import types - -import numpy as np -import pandas as 
pd -import pytest -from microplex.core import EntityType, SourceArchetype, SourceProvider, SourceQuery - -import microplex_us.data_sources.puf as puf_module -from microplex_us.data_sources import PUFSourceProvider, expand_to_persons -from microplex_us.data_sources.puf import ( - PUF_UPRATING_MODE_PE_SOI, - PEStyleQRFShareModel, - _fit_pe_style_puf_social_security_qrf_model_from_reference, - _impute_missing_puf_demographics, - _impute_puf_social_security_components, - _sample_tax_units, - map_puf_variables, - uprate_mapped_puf_with_pe_factors, - uprate_raw_puf_pe_style, -) -from microplex_us.data_sources.share_imputation import fit_grouped_share_model - - -def _mock_social_security_share_model_loader(*_args): - reference = pd.DataFrame( - { - "age_bucket": [ - "under_18", - "18_to_29", - "30_to_44", - "45_to_61", - "62_to_74", - "75_plus", - ], - "weight": [1.0] * 6, - "social_security_retirement": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0], - "social_security_disability": [0.0, 1.0, 1.0, 1.0, 0.0, 0.0], - "social_security_survivors": [0.0] * 6, - "social_security_dependents": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0], - } - ) - return fit_grouped_share_model( - reference, - explicit_component_columns=( - "social_security_retirement", - "social_security_disability", - "social_security_survivors", - ), - implicit_component_column="social_security_dependents", - feature_sets=(("age_bucket",),), - weight_column="weight", - ) - - -def _install_fake_qrf(monkeypatch, prediction_frame: pd.DataFrame): - calls: dict[str, object] = {} - - class FakeFittedModel: - def predict(self, X_test): - calls["X_test"] = X_test.copy() - return prediction_frame.copy() - - class FakeQRF: - def __init__(self, **kwargs): - calls["init_kwargs"] = dict(kwargs) - - def fit(self, *, X_train, predictors, imputed_variables, n_jobs): - calls["X_train"] = X_train.copy() - calls["predictors"] = tuple(predictors) - calls["imputed_variables"] = tuple(imputed_variables) - calls["n_jobs"] = n_jobs - return FakeFittedModel() - 
- microimpute_module = types.ModuleType("microimpute") - models_module = types.ModuleType("microimpute.models") - qrf_module = types.ModuleType("microimpute.models.qrf") - qrf_module.QRF = FakeQRF - microimpute_module.models = models_module - models_module.qrf = qrf_module - monkeypatch.setitem(sys.modules, "microimpute", microimpute_module) - monkeypatch.setitem(sys.modules, "microimpute.models", models_module) - monkeypatch.setitem(sys.modules, "microimpute.models.qrf", qrf_module) - return calls - - -def test_load_puf_raw_disaggregates_aggregate_records(tmp_path): - regular_rows = [ - { - "RECID": recid, - "MARS": 2 if recid % 2 == 0 else 1, - "XTOT": 2 if recid % 2 == 0 else 1, - "DSI": 0, - "EIC": 0, - "S006": 100, - "E00100": 100_000_000 + recid * 1_000_000, - "E00200": 2_000_000 + recid * 10_000, - "P23250": 60_000_000 + recid * 100_000, - } - for recid in range(1, 25) - ] - aggregate = { - "RECID": 999999, - "MARS": 0, - "XTOT": 0, - "DSI": 0, - "EIC": 0, - "S006": 50_000, - "E00100": 300_000_000, - "E00200": 10_000_000, - "P23250": 250_000_000, - } - puf_path = tmp_path / "puf.csv" - pd.DataFrame([*regular_rows, aggregate]).to_csv(puf_path, index=False) - - result = puf_module.load_puf_raw(puf_path) - synthetic = result[result["RECID"] >= puf_module.PUF_SYNTHETIC_RECID_START] - synthetic_weights = synthetic["S006"] / 100 - aggregate_weight = aggregate["S006"] / 100 - - assert 999999 not in set(result["RECID"]) - assert not result["MARS"].eq(0).any() - assert len(synthetic) == 40 - assert synthetic_weights.sum() == pytest.approx(aggregate_weight) - for column in ("E00100", "E00200", "P23250"): - assert (synthetic[column] * synthetic_weights).sum() == pytest.approx( - aggregate[column] * aggregate_weight - ) - - -def test_load_puf_raw_disaggregates_small_aggregate_records_with_positive_weights( - tmp_path, -): - regular_rows = [ - { - "RECID": recid, - "MARS": 2 if recid % 2 == 0 else 1, - "XTOT": 2 if recid % 2 == 0 else 1, - "DSI": 0, - "EIC": 0, - "S006": 
100, - "E00100": 100_000_000 + recid * 1_000_000, - "E00200": 2_000_000 + recid * 10_000, - "P23250": 60_000_000 + recid * 100_000, - } - for recid in range(1, 25) - ] - aggregate = { - "RECID": 999999, - "MARS": 0, - "XTOT": 0, - "DSI": 0, - "EIC": 0, - "S006": 3_000, - "E00100": 300_000_000, - "E00200": 10_000_000, - "P23250": 250_000_000, - } - puf_path = tmp_path / "puf.csv" - pd.DataFrame([*regular_rows, aggregate]).to_csv(puf_path, index=False) - - result = puf_module.load_puf_raw(puf_path) - synthetic = result[result["RECID"] >= puf_module.PUF_SYNTHETIC_RECID_START] - synthetic_weights = synthetic["S006"] / 100 - aggregate_weight = aggregate["S006"] / 100 - - assert len(synthetic) == 20 - assert synthetic_weights.min() > 0 - assert synthetic_weights.sum() == pytest.approx(aggregate_weight) - for column in ("E00100", "E00200", "P23250"): - assert (synthetic[column] * synthetic_weights).sum() == pytest.approx( - aggregate[column] * aggregate_weight - ) - - -def test_load_puf_raw_disaggregates_all_aggregate_records_preserving_top_tail_totals( - tmp_path, -): - columns_to_preserve = ( - "E00100", - "P23250", - "P22250", - "E00600", - "E00650", - "E00300", - "E00400", - ) - bucket_agi = { - 999996: -500_000, - 999997: 2_000_000, - 999998: 25_000_000, - 999999: 250_000_000, - } - regular_rows = [] - recid = 1 - for bucket_recid, agi in bucket_agi.items(): - for offset in range(30): - regular_rows.append( - { - "RECID": recid, - "MARS": 2 if offset % 3 == 0 else 1, - "XTOT": 2 if offset % 3 == 0 else 1, - "DSI": 0, - "EIC": 0, - "S006": 100, - "E00100": agi + offset * max(abs(agi) * 0.01, 1_000), - "P23250": abs(agi) * 0.30 + offset * 10_000, - "P22250": abs(agi) * 0.03 + offset * 1_000, - "E00600": abs(agi) * 0.05 + offset * 500, - "E00650": abs(agi) * 0.03 + offset * 300, - "E00300": abs(agi) * 0.01 + offset * 100, - "E00400": abs(agi) * 0.005 + offset * 50, - } - ) - recid += 1 - - aggregate_rows = [] - for index, (bucket_recid, agi) in 
enumerate(bucket_agi.items(), start=1): - aggregate_rows.append( - { - "RECID": bucket_recid, - "MARS": 0, - "XTOT": 0, - "DSI": 0, - "EIC": 0, - "S006": 20_000 + index * 100, - "E00100": agi, - "P23250": abs(agi) * 0.40, - "P22250": abs(agi) * 0.04, - "E00600": abs(agi) * 0.08, - "E00650": abs(agi) * 0.05, - "E00300": abs(agi) * 0.015, - "E00400": abs(agi) * 0.008, - } - ) - - puf_path = tmp_path / "puf.csv" - source = pd.DataFrame([*regular_rows, *aggregate_rows]) - source.to_csv(puf_path, index=False) - - result = puf_module.load_puf_raw(puf_path) - synthetic = result[result["RECID"] >= puf_module.PUF_SYNTHETIC_RECID_START] - synthetic_weights = synthetic["S006"] / 100 - aggregate = pd.DataFrame(aggregate_rows) - aggregate_weights = aggregate["S006"] / 100 - - assert not set(puf_module.PUF_AGGREGATE_RECIDS) & set(result["RECID"]) - assert len(synthetic) >= 80 - assert synthetic_weights.sum() == pytest.approx(aggregate_weights.sum()) - for column in columns_to_preserve: - expected = (aggregate[column] * aggregate_weights).sum() - observed = (synthetic[column] * synthetic_weights).sum() - assert observed == pytest.approx(expected) - - -def _write_minimal_soi_csv(path): - def row(variable, year, is_count, value): - return { - "Variable": variable, - "Year": year, - "Filing status": "All", - "AGI lower bound": float("-inf"), - "AGI upper bound": float("inf"), - "Count": bool(is_count), - "Taxable only": False, - "Value": float(value), - } - - rows = [ - row("count", 2015, True, 100), - row("count", 2021, True, 110), - row("count", 2024, True, 110), - row("employment_income", 2015, False, 200), - row("employment_income", 2021, False, 330), - row("employment_income", 2024, False, 330), - row("capital_gains_distributions", 2015, False, 50), - row("capital_gains_distributions", 2021, False, 110), - row("capital_gains_distributions", 2024, False, 110), - row("business_net_profits", 2015, False, 40), - row("business_net_profits", 2021, False, 88), - 
row("business_net_profits", 2024, False, 88), - row("business_net_losses", 2015, False, 20), - row("business_net_losses", 2021, False, 11), - row("business_net_losses", 2024, False, 11), - row("adjusted_gross_income", 2015, False, 1000), - row("adjusted_gross_income", 2021, False, 1320), - row("adjusted_gross_income", 2024, False, 1320), - ] - pd.DataFrame(rows).to_csv(path, index=False) - - -def _write_minimal_uprating_factors_csv(path): - pd.DataFrame( - [ - {"Variable": "household_weight", "2021": 1.0, "2024": 1.1}, - {"Variable": "employment_income", "2021": 1.0, "2024": 1.2}, - {"Variable": "non_sch_d_capital_gains", "2021": 1.0, "2024": 1.3}, - {"Variable": "social_security", "2021": 1.0, "2024": 1.4}, - {"Variable": "qualified_dividend_income", "2021": 1.0, "2024": 1.5}, - {"Variable": "non_qualified_dividend_income", "2021": 1.0, "2024": 1.6}, - {"Variable": "taxable_pension_income", "2021": 1.0, "2024": 1.7}, - {"Variable": "tax_exempt_pension_income", "2021": 1.0, "2024": 1.8}, - ] - ).to_csv(path, index=False) - - -def test_expand_to_persons_preserves_joint_tax_unit_monetary_totals(): - tax_units = pd.DataFrame( - { - "filing_status": ["JOINT"], - "employment_income": [100.0], - "self_employment_income": [50.0], - "taxable_interest_income": [20.0], - "ordinary_dividend_income": [30.0], - "qualified_dividend_income": [10.0], - "gross_social_security": [40.0], - "taxable_pension_income": [60.0], - "unemployment_compensation": [80.0], - "rental_income": [90.0], - "weight": [1.0], - "household_id": ["joint-household"], - "year": [2024], - } - ) - - persons = expand_to_persons(tax_units) - head = persons.loc[persons["is_head"] == 1].iloc[0] - spouse = persons.loc[persons["is_spouse"] == 1].iloc[0] - - assert head["employment_income"] == 60.0 - assert spouse["employment_income"] == 40.0 - assert head["self_employment_income"] == 30.0 - assert spouse["self_employment_income"] == 20.0 - assert head["taxable_interest_income"] == 10.0 - assert 
spouse["taxable_interest_income"] == 10.0 - assert head["ordinary_dividend_income"] == 15.0 - assert spouse["ordinary_dividend_income"] == 15.0 - assert head["non_qualified_dividend_income"] == 10.0 - assert spouse["non_qualified_dividend_income"] == 10.0 - assert persons["taxable_interest_income"].sum() == 20.0 - assert persons["ordinary_dividend_income"].sum() == 30.0 - assert persons["qualified_dividend_income"].sum() == 10.0 - assert persons["non_qualified_dividend_income"].sum() == 20.0 - assert persons["dividend_income"].sum() == 30.0 - assert persons["social_security"].sum() == 40.0 - assert persons["social_security_retirement"].sum() == 0.0 - assert persons["pension_income"].sum() == 60.0 - assert persons["income"].sum() == 470.0 - - -def test_expand_to_persons_derives_retirement_social_security_for_older_records(): - tax_units = pd.DataFrame( - { - "filing_status": ["SINGLE", "SINGLE"], - "gross_social_security": [40.0, 25.0], - "age": [68, 45], - "weight": [1.0, 1.0], - "household_id": ["older-household", "younger-household"], - "year": [2024, 2024], - } - ) - - persons = ( - expand_to_persons(tax_units).sort_values("household_id").reset_index(drop=True) - ) - - assert persons["social_security"].tolist() == [40.0, 25.0] - assert persons["social_security_retirement"].tolist() == [40.0, 0.0] - - -def test_expand_to_persons_preserves_qbi_boolean_flags_for_joint_units(): - tax_units = pd.DataFrame( - { - "filing_status": ["JOINT"], - "business_is_sstb": [True], - "self_employment_income_would_be_qualified": [True], - "sstb_self_employment_income_would_be_qualified": [True], - "weight": [1.0], - "household_id": ["joint-household"], - "year": [2024], - } - ) - - persons = expand_to_persons(tax_units) - - assert persons["business_is_sstb"].tolist() == [True, True] - assert persons["self_employment_income_would_be_qualified"].tolist() == [ - True, - True, - ] - assert persons["sstb_self_employment_income_would_be_qualified"].tolist() == [ - True, - True, - ] - - 
-def test_impute_puf_social_security_components_uses_grouped_cps_shares(): - persons = pd.DataFrame( - { - "age": [12, 40, 70], - "social_security": [100.0, 200.0, 300.0], - } - ) - - result = _impute_puf_social_security_components( - persons, - share_model=_mock_social_security_share_model_loader(), - ) - - assert result["social_security_dependents"].tolist() == [100.0, 0.0, 0.0] - assert result["social_security_disability"].tolist() == [0.0, 200.0, 0.0] - assert result["social_security_retirement"].tolist() == [0.0, 0.0, 300.0] - assert result["social_security_survivors"].tolist() == [0.0, 0.0, 0.0] - - -def test_fit_pe_style_puf_social_security_qrf_model_uses_pe_predictors(monkeypatch): - predictions = pd.DataFrame( - { - "social_security_retirement_share": [0.6, 0.1], - "social_security_disability_share": [0.3, 0.2], - "social_security_survivors_share": [0.05, 0.4], - "social_security_dependents_share": [0.05, 0.3], - } - ) - calls = _install_fake_qrf(monkeypatch, predictions) - reference = pd.DataFrame( - { - "age": [70, 45, 12, 67] * 30, - "sex": [1, 2, 2, 1] * 30, - "filing_status": ["JOINT", "SINGLE", "SINGLE", "JOINT"] * 30, - "is_head": [1, 1, 0, 1] * 30, - "is_dependent": [0, 0, 1, 0] * 30, - "social_security": [100.0, 100.0, 100.0, 100.0] * 30, - "social_security_retirement": [100.0, 0.0, 0.0, 100.0] * 30, - "social_security_disability": [0.0, 100.0, 0.0, 0.0] * 30, - "social_security_survivors": [0.0, 0.0, 0.0, 0.0] * 30, - "social_security_dependents": [0.0, 0.0, 100.0, 0.0] * 30, - } - ) - - model = _fit_pe_style_puf_social_security_qrf_model_from_reference( - reference, - min_training_records=1, - ) - - assert isinstance(model, PEStyleQRFShareModel) - assert model.predictors == ( - "age", - "is_male", - "tax_unit_is_joint", - "is_tax_unit_head", - "is_tax_unit_dependent", - ) - assert calls["predictors"] == model.predictors - assert calls["n_jobs"] == 1 - - persons = pd.DataFrame( - { - "age": [70, 45], - "sex": [1, 2], - "filing_status": ["JOINT", 
"SINGLE"], - "is_head": [1, 1], - "is_dependent": [0, 0], - "social_security": [300.0, 200.0], - } - ) - result = _impute_puf_social_security_components(persons, share_model=model) - - assert result["social_security_retirement"].tolist() == [180.0, 20.0] - assert result["social_security_disability"].tolist() == [90.0, 40.0] - assert result["social_security_survivors"].tolist() == [15.0, 80.0] - assert result["social_security_dependents"].tolist() == [15.0, 60.0] - assert list(calls["X_test"].columns) == list(model.predictors) - - -def test_puf_source_provider_selects_pe_qrf_social_security_strategy( - tmp_path, - monkeypatch, -): - puf = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101]}).to_csv(demographics_path, index=False) - - pe_qrf_called: list[tuple[int, object]] = [] - - def fake_pe_qrf_loader(*, cps_reference_year, cache_dir): - pe_qrf_called.append((cps_reference_year, cache_dir)) - return _mock_social_security_share_model_loader() - - def fail_grouped_loader(**_kwargs): - raise AssertionError("grouped-share loader should not be used in pe_qrf mode") - - monkeypatch.setattr( - puf_module, - "_default_pe_style_puf_social_security_share_model", - fake_pe_qrf_loader, - ) - monkeypatch.setattr( - puf_module, - "_default_puf_social_security_share_model", - fail_grouped_loader, - ) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - social_security_split_strategy="pe_qrf", - ) - frame = provider.load_frame(SourceQuery(period=2024)) - - assert pe_qrf_called - assert "social_security_retirement" in frame.tables[EntityType.PERSON].columns - - -def test_uprate_raw_puf_pe_style_matches_pe_soi_contract(tmp_path): - soi_path = tmp_path / "soi.csv" - 
_write_minimal_soi_csv(soi_path) - - raw = pd.DataFrame( - { - "E00200": [10.0, 20.0], - "E01100": [5.0, 0.0], - "E00900": [8.0, -4.0], - "E03290": [7.0, 3.0], - "S006": [100.0, 200.0], - } - ) - - result = uprate_raw_puf_pe_style( - raw, - from_year=2015, - to_year=2024, - soi_path=soi_path, - ) - - assert result["E00200"].tolist() == pytest.approx([15.0, 30.0]) - assert result["E01100"].tolist() == pytest.approx([10.0, 0.0]) - assert result["E00900"].tolist() == pytest.approx([19.2, -2.4]) - assert result["E03290"].tolist() == pytest.approx([8.4, 3.6]) - assert result["S006"].tolist() == pytest.approx([110.0, 220.0]) - - -def test_uprate_mapped_puf_with_pe_factors_uses_aliases_and_recomputes(tmp_path): - repo_root = tmp_path / "pe-us-data" - storage = repo_root / "policyengine_us_data" / "storage" - storage.mkdir(parents=True) - _write_minimal_uprating_factors_csv(storage / "uprating_factors.csv") - - mapped = pd.DataFrame( - { - "weight": [1.1], - "employment_income": [15.0], - "non_sch_d_capital_gains": [10.0], - "gross_social_security": [20.0], - "qualified_dividend_income": [4.0], - "non_qualified_dividend_income": [6.0], - "taxable_pension_income": [7.0], - "tax_exempt_pension_income": [3.0], - } - ) - - result = uprate_mapped_puf_with_pe_factors( - mapped, - from_year=2021, - to_year=2024, - policyengine_us_data_repo=repo_root, - ) - - assert result["weight"].tolist() == pytest.approx([1.21]) - assert result["employment_income"].tolist() == pytest.approx([18.0]) - assert result["non_sch_d_capital_gains"].tolist() == pytest.approx([13.0]) - assert result["gross_social_security"].tolist() == pytest.approx([28.0]) - assert result["qualified_dividend_income"].tolist() == pytest.approx([6.0]) - assert result["non_qualified_dividend_income"].tolist() == pytest.approx([9.6]) - assert result["ordinary_dividend_income"].tolist() == pytest.approx([15.6]) - assert result["taxable_pension_income"].tolist() == pytest.approx([11.9]) - assert 
result["tax_exempt_pension_income"].tolist() == pytest.approx([5.4]) - assert result["total_pension_income"].tolist() == pytest.approx([17.3]) - - -def test_puf_source_provider_pe_soi_mode_uses_raw_uprating(tmp_path): - repo_root = tmp_path / "pe-us-data" - storage = repo_root / "policyengine_us_data" / "storage" - storage.mkdir(parents=True) - soi_path = storage / "soi.csv" - uprating_factors_path = storage / "uprating_factors.csv" - _write_minimal_soi_csv(soi_path) - _write_minimal_uprating_factors_csv(uprating_factors_path) - puf = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [10.0], - "E01100": [5.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - uprating_mode=PUF_UPRATING_MODE_PE_SOI, - policyengine_us_data_repo=repo_root, - soi_path=soi_path, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame(SourceQuery(period=2024)) - household = frame.tables[EntityType.HOUSEHOLD].iloc[0] - person = frame.tables[EntityType.PERSON].iloc[0] - - assert household["household_weight"] == pytest.approx(1.21) - assert person["employment_income"] == pytest.approx(18.0) - assert person["non_sch_d_capital_gains"] == pytest.approx(13.0) - - -def test_expand_to_persons_splits_negative_joint_self_employment_losses(): - tax_units = pd.DataFrame( - { - "filing_status": ["JOINT"], - "self_employment_income": [-100.0], - "weight": [1.0], - "household_id": ["joint-household"], - "year": [2024], - } - ) - - persons = expand_to_persons(tax_units) - head = persons.loc[persons["is_head"] == 1].iloc[0] - spouse = persons.loc[persons["is_spouse"] == 1].iloc[0] - - assert 
head["self_employment_income"] == -60.0 - assert spouse["self_employment_income"] == -40.0 - assert persons["self_employment_income"].sum() == -100.0 - assert persons["income"].sum() == -100.0 - - -def test_expand_to_persons_uses_pe_demographic_helpers_when_present(): - tax_units = pd.DataFrame( - { - "filing_status": ["JOINT", "SINGLE"], - "employment_income": [100.0, 50.0], - "pre_tax_contributions": [20.0, 0.0], - "gross_social_security": [40.0, 0.0], - "weight": [1.0, 1.0], - "household_id": ["joint-household", "single-household"], - "exemptions_count": [2, 3], - "_puf_recid": [101, 202], - "_puf_agerange": [4, 5], - "_puf_earnsplit": [2, 0], - "_puf_gender": [1, 2], - "_puf_agedp1": [2, 4], - "_puf_agedp2": [3, 6], - "year": [2024, 2024], - } - ) - - persons = ( - expand_to_persons(tax_units).sort_values("person_id").reset_index(drop=True) - ) - persons_repeat = ( - expand_to_persons(tax_units).sort_values("person_id").reset_index(drop=True) - ) - - pd.testing.assert_frame_equal(persons, persons_repeat) - - assert persons["person_id"].tolist() == [ - "101:1", - "101:2", - "202:1", - "202:3", - "202:4", - ] - assert persons["tax_unit_id"].tolist() == ["101", "101", "202", "202", "202"] - - head = persons.loc[persons["person_id"] == "101:1"].iloc[0] - spouse = persons.loc[persons["person_id"] == "101:2"].iloc[0] - single = persons.loc[persons["person_id"] == "202:1"].iloc[0] - dependent_1 = persons.loc[persons["person_id"] == "202:3"].iloc[0] - dependent_2 = persons.loc[persons["person_id"] == "202:4"].iloc[0] - - assert head["employment_income"] == pytest.approx(27.825327362979824) - assert spouse["employment_income"] == pytest.approx(72.17467263702018) - assert head["pre_tax_contributions"] == pytest.approx(5.565065472595965) - assert spouse["pre_tax_contributions"] == pytest.approx(14.434934527404035) - assert head["age"] == 50 - assert spouse["age"] == 50 - assert spouse["is_male"] == 0.0 - assert single["age"] == 60 - assert single["is_male"] == 0.0 - 
assert dependent_1["is_dependent"] == 1 - assert dependent_2["is_dependent"] == 1 - assert dependent_1["employment_income"] == 0.0 - assert dependent_2["employment_income"] == 0.0 - assert dependent_1["age"] == 18 - assert dependent_2["age"] == 27 - assert dependent_1["is_male"] == 0.0 - assert dependent_2["is_male"] == 0.0 - - -def test_expand_to_persons_spreads_open_ended_puf_filer_age_band(): - tax_units = pd.DataFrame( - { - "filing_status": ["SINGLE"] * 10, - "weight": [1.0] * 10, - "household_id": [f"household-{i}" for i in range(10)], - "exemptions_count": [1] * 10, - "_puf_recid": list(range(1_001, 1_011)), - "_puf_agerange": [7] * 10, - "year": [2024] * 10, - } - ) - - persons = ( - expand_to_persons(tax_units).sort_values("person_id").reset_index(drop=True) - ) - persons_repeat = ( - expand_to_persons(tax_units).sort_values("person_id").reset_index(drop=True) - ) - - pd.testing.assert_frame_equal(persons, persons_repeat) - ages = persons["age"].tolist() - assert min(ages) >= 80 - assert max(ages) < 90 - assert len(set(ages)) > 1 - assert ages.count(80) < len(ages) - - -def test_expand_to_persons_clears_status_flags_for_non_head_members(): - tax_units = pd.DataFrame( - { - "filing_status": ["SURVIVING_SPOUSE"], - "is_surviving_spouse": [True], - "weight": [1.0], - "household_id": ["widow-household"], - "exemptions_count": [3], - "_puf_recid": [202], - "_puf_agerange": [5], - "_puf_agedp1": [2], - "_puf_agedp2": [3], - "year": [2024], - } - ) - - persons = ( - expand_to_persons(tax_units).sort_values("person_id").reset_index(drop=True) - ) - - assert persons["is_surviving_spouse"].tolist() == [True, False, False] - - -def test_puf_source_provider_loads_observation_frame_from_local_files(tmp_path): - puf = pd.DataFrame( - { - "RECID": [101, 202], - "MARS": [2, 1], - "XTOT": [2, 1], - "S006": [100.0, 200.0], - "E00200": [50_000.0, 20_000.0], - "E00900": [0.0, 5_000.0], - "AGE_HEAD": [45, 67], - "GENDER": [1, 2], - } - ) - puf_path = tmp_path / "puf.csv" - 
demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101, 202]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame( - SourceQuery(period=2024, provider_filters={"sample_n": 1, "random_seed": 0}) - ) - - assert isinstance(provider, SourceProvider) - assert set(frame.tables) == {EntityType.HOUSEHOLD, EntityType.PERSON} - assert len(frame.tables[EntityType.HOUSEHOLD]) == 1 - assert frame.tables[EntityType.PERSON]["household_id"].nunique() == 1 - assert frame.tables[EntityType.HOUSEHOLD]["year"].tolist() == [2024] - assert frame.tables[EntityType.PERSON]["year"].nunique() == 1 - assert "income" in frame.tables[EntityType.PERSON].columns - assert provider.descriptor.name.startswith("irs_soi_puf_") - assert frame.source.archetype is SourceArchetype.TAX_MICRODATA - - -def test_puf_source_provider_sampling_respects_tax_unit_weights(tmp_path): - puf = pd.DataFrame( - { - "RECID": [101, 202, 303], - "MARS": [1, 1, 1], - "XTOT": [1, 1, 1], - "S006": [0.0, 0.0, 100.0], - "E00200": [10_000.0, 20_000.0, 30_000.0], - "AGE_HEAD": [45, 55, 65], - "GENDER": [1, 2, 1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101, 202, 303]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame( - SourceQuery(period=2024, provider_filters={"sample_n": 1, "random_seed": 0}) - ) - - assert frame.tables[EntityType.HOUSEHOLD]["household_id"].tolist() == ["303"] - assert 
frame.tables[EntityType.PERSON]["household_id"].nunique() == 1 - assert frame.tables[EntityType.PERSON]["household_id"].iloc[0] == "303" - - -def test_puf_source_provider_marks_placeholder_and_derived_variables_in_capabilities( - tmp_path, -): - puf = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame(SourceQuery(period=2024)) - descriptor = frame.source - - assert not descriptor.allows_conditioning_on("state_fips") - assert not descriptor.is_authoritative_for("state_fips") - assert not descriptor.allows_conditioning_on("income") - assert not descriptor.is_authoritative_for("income") - assert descriptor.is_authoritative_for("employment_income") - assert not descriptor.allows_conditioning_on("employment_income") - assert descriptor.allows_conditioning_on("age") - - -def test_puf_source_provider_does_not_duplicate_joint_tax_unit_financial_income( - tmp_path, -): - puf = pd.DataFrame( - { - "RECID": [101], - "MARS": [2], - "XTOT": [2], - "S006": [100.0], - "E00200": [100.0], - "E00900": [50.0], - "E00300": [20.0], - "E00600": [30.0], - "E00650": [10.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2015, - 
social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame(SourceQuery(period=2015)) - persons = frame.tables[EntityType.PERSON] - - assert len(persons) == 2 - assert persons["taxable_interest_income"].sum() == 20.0 - assert persons["ordinary_dividend_income"].sum() == 30.0 - assert persons["qualified_dividend_income"].sum() == 10.0 - assert persons["income"].sum() == 200.0 - - -def test_puf_source_provider_maps_policyengine_medical_and_alimony_inputs(tmp_path): - puf = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "E00800": [2_000.0], - "E17500": [1_000.0], - "E26390": [700.0], - "E26400": [200.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2015, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - frame = provider.load_frame(SourceQuery(period=2015)) - persons = frame.tables[EntityType.PERSON] - - assert persons["alimony_income"].sum() == 2_000.0 - assert persons["health_insurance_premiums_without_medicare_part_b"].sum() == 453.0 - assert persons["other_medical_expenses"].sum() == 325.0 - assert persons["medicare_part_b_premiums"].sum() == 137.0 - assert persons["over_the_counter_health_expenses"].sum() == 85.0 - assert persons["estate_income"].sum() == 500.0 - assert persons["income"].sum() == 52_000.0 - - -def test_map_puf_variables_preserves_rental_loss_sign(): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E25850": [200.0], - "E25860": [500.0], - } - ) - - mapped = map_puf_variables(raw) - - assert mapped.loc[0, "rental_income_positive"] == 200.0 - assert 
mapped.loc[0, "rental_income_negative"] == 500.0 - assert mapped.loc[0, "rental_income"] == -300.0 - - -def test_map_puf_variables_uses_pe_puf_business_and_farm_income_formulas(): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E26270": [999.0], - "E26190": [1_200.0], - "E26180": [200.0], - "E25940": [25.0], - "E25980": [500.0], - "E25920": [0.0], - "E25960": [50.0], - "E30400": [923.5], - "E30500": [0.0], - "E00900": [0.0], - "E02100": [300.0], - "T27800": [700.0], - "E27200": [125.0], - } - ) - - mapped = map_puf_variables(raw) - - assert mapped.loc[0, "partnership_s_corp_income"] == 1_450.0 - assert mapped.loc[0, "partnership_se_income"] == 700.0 - assert mapped.loc[0, "farm_income"] == 700.0 - assert mapped.loc[0, "farm_operations_income"] == 300.0 - assert mapped.loc[0, "farm_rent_income"] == 125.0 - - -def test_map_puf_variables_adds_qbi_export_support_columns(): - n = 800 - raw = pd.DataFrame( - { - "RECID": range(1, n + 1), - "MARS": np.where(np.arange(n) % 3 == 0, 2, 1), - "XTOT": np.where(np.arange(n) % 3 == 0, 2, 1), - "S006": np.full(n, 100.0), - "E00900": np.linspace(1_000.0, 200_000.0, n), - "E02100": np.linspace(0.0, 40_000.0, n), - "E26270": np.linspace(0.0, 60_000.0, n), - "E26390": np.linspace(0.0, 8_000.0, n), - "E26400": np.zeros(n), - "E25850": np.linspace(0.0, 30_000.0, n), - "E25860": np.zeros(n), - } - ) - - mapped = map_puf_variables(raw) - - expected_columns = { - "business_is_sstb", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "sstb_self_employment_income_before_lsr", - "sstb_self_employment_income_would_be_qualified", - "sstb_w2_wages_from_qualified_business", - "sstb_unadjusted_basis_qualified_property", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "self_employment_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "rental_income_would_be_qualified", - 
"estate_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - } - assert expected_columns <= set(mapped.columns) - assert mapped["w2_wages_from_qualified_business"].sum() > 0.0 - assert mapped["unadjusted_basis_qualified_property"].sum() > 0.0 - assert mapped["qualified_reit_and_ptp_income"].sum() > 0.0 - assert mapped["business_is_sstb"].any() - assert mapped["self_employment_income_would_be_qualified"].nunique() == 2 - np.testing.assert_allclose( - mapped["self_employment_income"] - + mapped["sstb_self_employment_income_before_lsr"], - raw["E00900"], - ) - - -def test_map_puf_variables_adds_pe_exact_irs_inputs(): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00600": [500.0], - "E00650": [200.0], - "E01400": [1_250.0], - "E01500": [2_100.0], - "E01700": [1_600.0], - "E02300": [800.0], - "E02400": [1_100.0], - "E03290": [300.0], - "E07300": [90.0], - "E07400": [80.0], - "E07600": [70.0], - "E09700": [60.0], - "E09800": [50.0], - "E11200": [40.0], - "E24518": [30.0], - "E24515": [20.0], - "E58990": [10.0], - "E62900": [5.0], - "E87521": [15.0], - "P08000": [25.0], - "E07240": [35.0], - "E07260": [45.0], - "E00700": [55.0], - "E01200": [65.0], - } - ) - - mapped = map_puf_variables(raw) - - assert mapped.loc[0, "non_qualified_dividend_income"] == 300.0 - assert mapped.loc[0, "taxable_ira_distributions"] == 1_250.0 - assert mapped.loc[0, "taxable_unemployment_compensation"] == 800.0 - assert mapped.loc[0, "social_security"] == 1_100.0 - assert mapped.loc[0, "tax_exempt_pension_income"] == 500.0 - assert mapped.loc[0, "health_savings_account_ald"] == 300.0 - assert mapped.loc[0, "foreign_tax_credit"] == 90.0 - assert mapped.loc[0, "general_business_credit"] == 80.0 - assert mapped.loc[0, "prior_year_minimum_tax_credit"] == 70.0 - assert mapped.loc[0, "recapture_of_investment_credit"] == 60.0 - assert mapped.loc[0, "unreported_payroll_tax"] == 50.0 - assert mapped.loc[0, 
"excess_withheld_payroll_tax"] == 40.0 - assert mapped.loc[0, "long_term_capital_gains_on_collectibles"] == 30.0 - assert mapped.loc[0, "unrecaptured_section_1250_gain"] == 20.0 - assert mapped.loc[0, "investment_income_elected_form_4952"] == 10.0 - assert mapped.loc[0, "amt_foreign_tax_credit"] == 5.0 - assert mapped.loc[0, "american_opportunity_credit"] == 15.0 - assert mapped.loc[0, "other_credits"] == 25.0 - assert mapped.loc[0, "savers_credit"] == 35.0 - assert mapped.loc[0, "energy_efficient_home_improvement_credit"] == 45.0 - assert mapped.loc[0, "salt_refund_income"] == 55.0 - assert mapped.loc[0, "miscellaneous_income"] == 65.0 - - -def test_map_puf_variables_can_impute_pre_tax_contributions_with_injected_model(): - class DummyFittedModel: - def predict(self, X_test): - return pd.DataFrame( - {"pre_tax_contributions": X_test["employment_income"] * 0.1}, - index=X_test.index, - ) - - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - - mapped = map_puf_variables( - raw, - impute_pre_tax_contributions=True, - pre_tax_contribution_model=puf_module.PEStyleQRFImputationModel( - predictors=("employment_income", "age", "is_male"), - imputed_variable="pre_tax_contributions", - fitted_model=DummyFittedModel(), - ), - ) - - assert mapped.loc[0, "pre_tax_contributions"] == 5_000.0 - - -def test_map_puf_variables_can_require_pre_tax_contribution_model(monkeypatch): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - - def _raise_missing_model(**_kwargs): - raise ModuleNotFoundError("policyengine_us_data") - - monkeypatch.setattr( - puf_module, - "_default_pe_style_puf_pre_tax_contribution_model", - _raise_missing_model, - ) - - with pytest.raises(ModuleNotFoundError, match="policyengine_us_data"): - map_puf_variables( - raw, - 
impute_pre_tax_contributions=True, - require_pre_tax_contribution_model=True, - ) - - -def test_map_puf_variables_uses_microplex_cps_pre_tax_training_when_legacy_h5_missing( - monkeypatch, tmp_path -): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ) - qrf_calls = _install_fake_qrf( - monkeypatch, - pd.DataFrame({"pre_tax_contributions": [4321.0]}), - ) - h5_calls: list[dict[str, object]] = [] - local_calls: list[int] = [] - - def missing_h5(**kwargs): - h5_calls.append(dict(kwargs)) - raise FileNotFoundError("missing h5") - - def local_training_frame(*, training_year: int): - local_calls.append(training_year) - return pd.DataFrame( - { - "employment_income": [10_000.0, 20_000.0], - "age": [30.0, 45.0], - "is_male": [0.0, 1.0], - "pre_tax_contributions": [500.0, 1_500.0], - } - ) - - monkeypatch.setattr( - puf_module, - "_load_pe_extended_cps_pre_tax_training_frame", - missing_h5, - ) - monkeypatch.setattr( - puf_module, - "_load_microplex_cps_pre_tax_training_frame", - local_training_frame, - ) - - mapped = map_puf_variables( - raw, - impute_pre_tax_contributions=True, - policyengine_us_data_repo=tmp_path, - policyengine_us_data_python="/fake/python", - require_pre_tax_contribution_model=True, - ) - - assert mapped.loc[0, "pre_tax_contributions"] == 4321.0 - assert h5_calls == [{"policyengine_us_data_repo": tmp_path, "training_year": 2024}] - assert local_calls == [2024] - assert qrf_calls["predictors"] == ("employment_income", "age", "is_male") - assert qrf_calls["imputed_variables"] == ("pre_tax_contributions",) - - -def test_map_puf_variables_maps_widow_status_to_surviving_spouse(): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [5], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - } - ) - - mapped = map_puf_variables( - raw, - impute_pre_tax_contributions=False, - ) - - assert mapped.loc[0, "filing_status"] == "SURVIVING_SPOUSE" - 
assert bool(mapped.loc[0, "is_surviving_spouse"]) - - -def test_map_puf_variables_does_not_infer_is_separated_from_mars_code(): - raw = pd.DataFrame( - { - "RECID": [101], - "MARS": [3], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - } - ) - - mapped = map_puf_variables( - raw, - impute_pre_tax_contributions=False, - ) - - assert mapped.loc[0, "filing_status"] == "SEPARATE" - assert "is_separated" not in mapped.columns - assert not bool(mapped.loc[0, "is_surviving_spouse"]) - - -def test_impute_missing_puf_demographics_uses_qrf_predictions(monkeypatch): - prediction_frame = pd.DataFrame( - { - "AGEDP1": [2.2], - "AGEDP2": [3.1], - "AGEDP3": [0.0], - "AGERANGE": [4.4], - "EARNSPLIT": [2.6], - "GENDER": [1.8], - } - ) - calls = _install_fake_qrf(monkeypatch, prediction_frame) - - raw = pd.DataFrame( - { - "RECID": list(range(101, 202)) + [999], - "E00200": [50_000.0] * 101 + [80_000.0], - "MARS": [1] * 101 + [2], - "DSI": [0] * 102, - "EIC": [0] * 101 + [1], - "XTOT": [1] * 101 + [2], - "AGEDP1": [1.0] * 101 + [float("nan")], - "AGEDP2": [0.0] * 101 + [float("nan")], - "AGEDP3": [0.0] * 101 + [float("nan")], - "AGERANGE": [3.0] * 101 + [float("nan")], - "EARNSPLIT": [0.0] * 101 + [float("nan")], - "GENDER": [1.0] * 101 + [float("nan")], - } - ) - - imputed = _impute_missing_puf_demographics(raw) - - assert calls["predictors"] == ("E00200", "MARS", "DSI", "EIC", "XTOT") - assert calls["imputed_variables"] == ( - "AGEDP1", - "AGEDP2", - "AGEDP3", - "AGERANGE", - "EARNSPLIT", - "GENDER", - ) - assert imputed.loc[101, "AGEDP1"] == 2 - assert imputed.loc[101, "AGEDP2"] == 3 - assert imputed.loc[101, "AGERANGE"] == 4 - assert imputed.loc[101, "EARNSPLIT"] == 3 - assert imputed.loc[101, "GENDER"] == 2 - - -def test_download_puf_prefers_existing_local_files_without_hub_lookup( - tmp_path, monkeypatch -): - puf_path = tmp_path / "puf_2015.csv" - demographics_path = tmp_path / "demographics_2015.csv" - puf_path.write_text("RECID,MARS\n1,1\n") - 
demographics_path.write_text("RECID\n1\n") - - def fail_download(*args, **kwargs): - raise AssertionError( - "hf_hub_download should not be called when local files exist" - ) - - monkeypatch.setattr(puf_module, "hf_hub_download", fail_download, raising=False) - - resolved_puf_path, resolved_demo_path = puf_module.download_puf(tmp_path) - - assert resolved_puf_path == puf_path - assert resolved_demo_path == demographics_path - - -def test_puf_source_provider_prefers_policyengine_repo_local_raw_files( - tmp_path, monkeypatch -): - repo_root = tmp_path / "policyengine-us-data" - storage_dir = repo_root / "policyengine_us_data" / "storage" - storage_dir.mkdir(parents=True) - pd.DataFrame( - { - "RECID": [101], - "MARS": [1], - "XTOT": [1], - "S006": [100.0], - "E00200": [50_000.0], - "E02400": [0.0], - "E01400": [0.0], - "AGE_HEAD": [45], - "GENDER": [1], - } - ).to_csv(storage_dir / "puf_2015.csv", index=False) - pd.DataFrame({"RECID": [101]}).to_csv( - storage_dir / "demographics_2015.csv", index=False - ) - - def fail_loader(*args, **kwargs): - raise AssertionError( - "remote/cache loader should not run when repo-local PUF exists" - ) - - provider = PUFSourceProvider( - target_year=2015, - policyengine_us_data_repo=repo_root, - loader=fail_loader, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - - frame = provider.load_frame(SourceQuery(period=2015)) - persons = frame.tables[EntityType.PERSON] - - assert len(persons) == 1 - assert persons["employment_income"].sum() == 50_000.0 - - -def test_map_puf_variables_seed_controls_age_imputation(): - puf = pd.DataFrame( - { - "RECID": [101, 202, 303], - "MARS": [1, 1, 1], - "XTOT": [1, 1, 1], - "S006": [100.0, 100.0, 100.0], - "E00200": [50_000.0, 150_000.0, 250_000.0], - "E02400": [0.0, 10_000.0, 0.0], - "E01400": [0.0, 0.0, 20_000.0], - } - ) - - first = puf_module.map_puf_variables(puf, random_seed=17) - second = puf_module.map_puf_variables(puf, random_seed=17) - third = 
puf_module.map_puf_variables(puf, random_seed=18) - - assert first["age"].tolist() == second["age"].tolist() - assert first["age"].tolist() != third["age"].tolist() - - -def test_puf_source_provider_age_imputation_is_reproducible_with_same_seed(tmp_path): - puf = pd.DataFrame( - { - "RECID": [101, 202, 303], - "MARS": [1, 2, 1], - "XTOT": [1, 2, 1], - "S006": [100.0, 120.0, 80.0], - "E00200": [50_000.0, 75_000.0, 150_000.0], - "E02400": [0.0, 8_000.0, 0.0], - "E01400": [0.0, 0.0, 25_000.0], - "GENDER": [1, 2, 1], - } - ) - puf_path = tmp_path / "puf.csv" - demographics_path = tmp_path / "demographics.csv" - puf.to_csv(puf_path, index=False) - pd.DataFrame({"RECID": [101, 202, 303]}).to_csv(demographics_path, index=False) - - provider = PUFSourceProvider( - puf_path=puf_path, - demographics_path=demographics_path, - target_year=2024, - social_security_share_model_loader=_mock_social_security_share_model_loader, - ) - query = SourceQuery(period=2024, provider_filters={"sample_n": 3, "random_seed": 7}) - first = provider.load_frame(query) - second = provider.load_frame(query) - - first_persons = ( - first.tables[EntityType.PERSON].sort_values("person_id").reset_index(drop=True) - ) - second_persons = ( - second.tables[EntityType.PERSON].sort_values("person_id").reset_index(drop=True) - ) - - assert first_persons["age"].tolist() == second_persons["age"].tolist() - - -def test_puf_sampling_falls_back_to_uniform_when_weighted_sampling_is_infeasible( - monkeypatch, -): - tax_units = pd.DataFrame( - { - "household_id": [1, 2, 3], - "weight": [10.0, 20.0, 30.0], - } - ) - - original_sample = pd.DataFrame.sample - - def flaky_sample(self, *args, **kwargs): - if kwargs.get("weights") is not None: - raise ValueError("Weighted sampling cannot be achieved with replace=False.") - return original_sample(self, *args, **kwargs) - - monkeypatch.setattr(pd.DataFrame, "sample", flaky_sample) - - sampled = _sample_tax_units( - tax_units, - sample_n=2, - random_seed=42, - ) - - assert 
len(sampled) == 2 - - -def test_puf_sampling_uses_raw_s006_weights_when_weight_column_missing(): - tax_units = pd.DataFrame( - { - "household_id": [1, 2, 3], - "S006": [0.0, 0.0, 100.0], - } - ) - - sampled = _sample_tax_units( - tax_units, - sample_n=1, - random_seed=42, - ) - - assert sampled["household_id"].tolist() == [3] diff --git a/tests/test_share_imputation.py b/tests/test_share_imputation.py deleted file mode 100644 index bac1a0d0..00000000 --- a/tests/test_share_imputation.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import pandas as pd - -from microplex_us.data_sources.share_imputation import ( - fit_grouped_share_model, - predict_grouped_component_shares, -) - - -def test_grouped_share_model_predicts_specific_group_shares(): - reference = pd.DataFrame( - { - "age_bucket": ["young", "young", "old", "old"], - "weight": [1.0, 1.0, 1.0, 1.0], - "ret": [0.0, 0.0, 1.0, 1.0], - "dis": [1.0, 1.0, 0.0, 0.0], - "surv": [0.0, 0.0, 0.0, 0.0], - "dep": [0.0, 0.0, 0.0, 0.0], - } - ) - model = fit_grouped_share_model( - reference, - explicit_component_columns=("ret", "dis", "surv"), - implicit_component_column="dep", - feature_sets=(("age_bucket",),), - weight_column="weight", - ) - - target = pd.DataFrame({"age_bucket": ["young", "old"]}) - shares = predict_grouped_component_shares(target, model) - - assert shares["ret"].tolist() == [0.0, 1.0] - assert shares["dis"].tolist() == [1.0, 0.0] - assert shares["surv"].tolist() == [0.0, 0.0] - assert shares["dep"].tolist() == [0.0, 0.0] - - -def test_grouped_share_model_falls_back_to_overall_and_keeps_mece(): - reference = pd.DataFrame( - { - "age_bucket": ["young", "old"], - "weight": [3.0, 1.0], - "ret": [0.0, 1.0], - "dis": [1.0, 0.0], - "surv": [0.0, 0.0], - "dep": [0.0, 0.0], - } - ) - model = fit_grouped_share_model( - reference, - explicit_component_columns=("ret", "dis", "surv"), - implicit_component_column="dep", - feature_sets=(("age_bucket",),), - weight_column="weight", - ) - - 
target = pd.DataFrame({"age_bucket": ["missing"]}) - shares = predict_grouped_component_shares(target, model) - - assert shares["ret"].iloc[0] == 0.25 - assert shares["dis"].iloc[0] == 0.75 - assert shares["surv"].iloc[0] == 0.0 - assert shares["dep"].iloc[0] == 0.0 - assert shares.sum(axis=1).iloc[0] == 1.0 diff --git a/tests/test_source_registry.py b/tests/test_source_registry.py deleted file mode 100644 index 8c06fa2a..00000000 --- a/tests/test_source_registry.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Tests for declarative source-variable capability registry.""" - -from microplex.core import SourceVariableCapability - -from microplex_us.source_registry import ( - PUF_SOURCE_VARIABLE_POLICY, - SourceVariablePolicy, - resolve_source_variable_capabilities, -) - - -def test_source_variable_policy_overrides_selected_fields(): - base = SourceVariableCapability(authoritative=True, usable_as_condition=True) - policy = SourceVariablePolicy(authoritative=False) - - resolved = policy.apply(base) - - assert not resolved.authoritative - assert resolved.usable_as_condition - - -def test_resolve_source_variable_capabilities_matches_source_prefix_and_year_suffix(): - capabilities = resolve_source_variable_capabilities( - "irs_soi_puf_2024", - ( - "state_fips", - "income", - "employment_status", - "taxable_interest_income", - "filing_status_code", - ), - ) - - assert not capabilities["state_fips"].usable_as_condition - assert not capabilities["income"].authoritative - assert not capabilities["income"].usable_as_condition - assert not capabilities["employment_status"].usable_as_condition - assert "taxable_interest_income" not in capabilities - assert capabilities["filing_status_code"].authoritative - assert not capabilities["filing_status_code"].usable_as_condition - - -def test_puf_policy_spec_matches_provider_names(): - assert PUF_SOURCE_VARIABLE_POLICY.matches("irs_soi_puf") - assert PUF_SOURCE_VARIABLE_POLICY.matches("irs_soi_puf_2024") - assert not 
PUF_SOURCE_VARIABLE_POLICY.matches("cps_asec_2023") - - -def test_resolve_source_variable_capabilities_applies_generic_variable_semantics(): - capabilities = resolve_source_variable_capabilities( - "cps_asec_2023", - ( - "qualified_dividend_income", - "non_qualified_dividend_income", - "ordinary_dividend_income", - "dividend_income", - ), - ) - - assert not capabilities["dividend_income"].authoritative - assert not capabilities["dividend_income"].usable_as_condition - assert not capabilities["ordinary_dividend_income"].authoritative - assert not capabilities["ordinary_dividend_income"].usable_as_condition diff --git a/tests/test_target_adapters.py b/tests/test_target_adapters.py deleted file mode 100644 index a1d19dda..00000000 --- a/tests/test_target_adapters.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Tests for translating US targets into the canonical core target spec.""" - -from microplex.core import EntityType -from microplex.targets import TargetAggregation - -from microplex_us.policyengine.us import ( - PolicyEngineUSConstraint, - PolicyEngineUSDBTarget, -) -from microplex_us.targets import ( - policyengine_db_target_to_canonical_spec, -) - - -class TestPolicyEngineTargetAdapters: - def test_policyengine_count_target_inferrs_entity_and_filters(self): - target = PolicyEngineUSDBTarget( - target_id=1, - variable="person_count", - period=2024, - stratum_id=2, - reform_id=0, - value=250.0, - active=True, - geo_level="state", - geographic_id="06", - constraints=( - PolicyEngineUSConstraint("state_fips", "==", "06"), - PolicyEngineUSConstraint("age", ">=", "65"), - ), - ) - - canonical = policyengine_db_target_to_canonical_spec(target) - - assert canonical.entity is EntityType.PERSON - assert canonical.aggregation is TargetAggregation.COUNT - assert canonical.measure is None - assert canonical.filters[0].feature == "state_fips" - assert canonical.filters[1].feature == "age" - assert canonical.metadata["target_id"] == 1 - - def 
test_policyengine_sum_target_uses_override_for_entity(self): - target = PolicyEngineUSDBTarget( - target_id=2, - variable="snap", - period=2024, - stratum_id=1, - reform_id=0, - value=10_000.0, - active=True, - ) - - canonical = policyengine_db_target_to_canonical_spec( - target, - entity_overrides={"snap": EntityType.SPM_UNIT}, - ) - - assert canonical.entity is EntityType.SPM_UNIT - assert canonical.aggregation is TargetAggregation.SUM - assert canonical.measure == "snap" - - def test_calculated_output_targets_require_policyengine_materialization(self): - target = PolicyEngineUSDBTarget( - target_id=3, - variable="snap", - period=2024, - stratum_id=1, - reform_id=0, - value=10_000.0, - active=True, - ) - - canonical = policyengine_db_target_to_canonical_spec(target) - - assert canonical.sim_modifier_names == ("policyengine_us_materialize",) - assert canonical.sim_modifiers[0].parameters == {"features": ["snap"]} - - def test_takeup_input_constraints_require_rerandomized_takeup_handler(self): - target = PolicyEngineUSDBTarget( - target_id=4, - variable="household_count", - period=2024, - stratum_id=1, - reform_id=0, - value=10_000.0, - active=True, - constraints=( - PolicyEngineUSConstraint("takes_up_snap_if_eligible", "==", "1"), - ), - ) - - canonical = policyengine_db_target_to_canonical_spec(target) - - assert canonical.sim_modifier_names == ("rerandomize_takeup",) - assert canonical.sim_modifiers[0].parameters == { - "features": ["takes_up_snap_if_eligible"] - } diff --git a/tests/test_target_registry.py b/tests/test_target_registry.py deleted file mode 100644 index 0b507d04..00000000 --- a/tests/test_target_registry.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Tests for the canonical US target registry.""" - -from microplex.core import EntityType -from microplex.targets import ( - TargetAggregation, - TargetFilter, - TargetProvider, - TargetQuery, - TargetSpec, -) - -from microplex_us.target_registry import ( - TargetCategory, - TargetGroup, - TargetLevel, - 
TargetRegistry, -) - - -class TestTargetRegistry: - def test_registry_emits_canonical_target_specs(self): - registry = TargetRegistry() - - target = registry.get_group("state_population").targets[0] - - assert isinstance(target, TargetSpec) - assert target.entity is EntityType.PERSON - assert target.aggregation is TargetAggregation.COUNT - assert target.measure is None - assert target.filters == ( - TargetFilter(feature="state_fips", operator="==", value="01"), - ) - assert target.metadata["us_category"] == "geography" - assert target.metadata["us_level"] == "state" - assert target.metadata["us_group"] == "state_population" - assert target.metadata["available_in_cps"] is True - - def test_registry_selects_targets_by_metadata(self): - geography_target = TargetSpec( - name="ca_people", - entity=EntityType.PERSON, - value=2.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter(feature="state_fips", operator="==", value="06"),), - metadata={ - "us_category": "geography", - "us_level": "state", - "us_group": "geography_targets", - "available_in_cps": True, - "requires_imputation": False, - }, - ) - tax_target = TargetSpec( - name="tax_claims", - entity=EntityType.TAX_UNIT, - value=1.0, - period=2024, - aggregation=TargetAggregation.COUNT, - filters=(TargetFilter(feature="filing_status", operator="==", value="single"),), - metadata={ - "us_category": "tax", - "us_level": "national", - "us_group": "tax_targets", - "available_in_cps": False, - "requires_imputation": True, - }, - ) - registry = TargetRegistry( - groups={ - "geography_targets": TargetGroup( - name="geography_targets", - category=TargetCategory.GEOGRAPHY, - targets=[geography_target], - ), - "tax_targets": TargetGroup( - name="tax_targets", - category=TargetCategory.TAX, - targets=[tax_target], - ), - }, - build_defaults=False, - ) - - selected = registry.select_targets( - categories=[TargetCategory.GEOGRAPHY], - levels=[TargetLevel.STATE], - groups=["geography_targets"], - 
only_available=True, - entity=EntityType.PERSON, - ) - - assert selected == [geography_target] - assert isinstance(registry, TargetProvider) - assert registry.load_target_set( - TargetQuery( - entity=EntityType.PERSON, - provider_filters={ - "categories": [TargetCategory.GEOGRAPHY], - "levels": [TargetLevel.STATE], - "groups": ["geography_targets"], - "only_available": True, - }, - ) - ).targets == [geography_target] diff --git a/tests/test_variables.py b/tests/test_variables.py deleted file mode 100644 index 575a4852..00000000 --- a/tests/test_variables.py +++ /dev/null @@ -1,562 +0,0 @@ -"""Tests for atomic vs derived variable normalization.""" - -from __future__ import annotations - -import pandas as pd -import pytest -from microplex.core import EntityType - -from microplex_us.variables import ( - ConditionScoreMode, - DonorMatchStrategy, - ProjectionAggregation, - VariableSupportFamily, - add_dividend_composition_features, - apply_donor_variable_semantics, - donor_imputation_block_specs, - donor_imputation_blocks, - is_condition_var_compatible_with_entity, - is_condition_var_compatible_with_targets, - is_projected_condition_var_compatible, - normalize_dividend_columns, - normalize_social_security_columns, - prune_redundant_variables, - resolve_condition_entities_for_targets, - resolve_variable_semantic_capabilities, - restore_dividend_components_from_composition, - validate_donor_variable_semantics, -) - - -def test_normalize_dividend_columns_prefers_atomic_components_over_totals(): - frame = pd.DataFrame( - { - "qualified_dividend_income": [30.0], - "non_qualified_dividend_income": [12.0], - "ordinary_dividend_income": [80.0], - "dividend_income": [5.0], - } - ) - - normalized = normalize_dividend_columns(frame) - - assert normalized["qualified_dividend_income"].tolist() == [30.0] - assert normalized["non_qualified_dividend_income"].tolist() == [12.0] - assert normalized["ordinary_dividend_income"].tolist() == [42.0] - assert 
normalized["dividend_income"].tolist() == [42.0] - - -def test_normalize_dividend_columns_coalesces_sparse_total_aliases_by_row(): - frame = pd.DataFrame( - { - "ordinary_dividend_income": [0.0, 30.0, 0.0], - "dividend_income": [80.0, 999.0, 0.0], - "qualified_dividend_income": [0.0, 5.0, 0.0], - "non_qualified_dividend_income": [0.0, 25.0, 0.0], - } - ) - - normalized = normalize_dividend_columns(frame) - - # Row 0 carries only a dividend total (80) with no observed split, so it is - # allocated by the SOI qualified share instead of defaulting 100% to - # non-qualified. Rows 1-2 keep their observed components unchanged. - assert normalized["qualified_dividend_income"].tolist() == pytest.approx( - [62.4, 5.0, 0.0] - ) - assert normalized["non_qualified_dividend_income"].tolist() == pytest.approx( - [17.6, 25.0, 0.0] - ) - assert normalized["ordinary_dividend_income"].tolist() == [80.0, 30.0, 0.0] - assert normalized["dividend_income"].tolist() == [80.0, 30.0, 0.0] - - -def test_normalize_dividend_columns_splits_unsplit_total_by_qualified_share(): - # A row with only a dividend total (e.g. CPS DIV_VAL) and no qualified / - # non-qualified components must be split by the SOI qualified share, not - # left entirely non-qualified (which zeroed qualified dividends nationally - # and inverted the split vs the SOI targets). 
- frame = pd.DataFrame( - { - "qualified_dividend_income": [0.0], - "non_qualified_dividend_income": [0.0], - "dividend_income": [1_000.0], - } - ) - - normalized = normalize_dividend_columns(frame) - - assert normalized["qualified_dividend_income"].tolist() == pytest.approx([780.0]) - assert normalized["non_qualified_dividend_income"].tolist() == pytest.approx( - [220.0] - ) - assert normalized["dividend_income"].tolist() == pytest.approx([1_000.0]) - - -def test_normalize_social_security_columns_tracks_unclassified_residual(): - frame = pd.DataFrame( - { - "gross_social_security": [1_000.0, 900.0, 0.0], - "social_security_disability": [400.0, 0.0, 0.0], - "social_security_survivors": [100.0, 0.0, 50.0], - } - ) - - normalized = normalize_social_security_columns(frame) - - assert normalized["social_security"].tolist() == [1_000.0, 900.0, 50.0] - assert normalized["social_security_retirement"].tolist() == [0.0, 0.0, 0.0] - assert normalized["social_security_disability"].tolist() == [400.0, 0.0, 0.0] - assert normalized["social_security_survivors"].tolist() == [100.0, 0.0, 50.0] - assert normalized["social_security_dependents"].tolist() == [0.0, 0.0, 0.0] - assert normalized["social_security_unclassified"].tolist() == [500.0, 900.0, 0.0] - - -def test_prune_redundant_variables_drops_dividend_totals_when_basis_present(): - variables = { - "income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "dividend_income", - "ordinary_dividend_income", - } - - assert prune_redundant_variables(variables) == { - "income", - "qualified_dividend_income", - "non_qualified_dividend_income", - } - - -def test_dividend_composition_features_derive_total_and_share(): - frame = pd.DataFrame( - { - "qualified_dividend_income": [30.0, 0.0], - "non_qualified_dividend_income": [12.0, 0.0], - } - ) - - enriched = add_dividend_composition_features(frame) - - assert enriched["dividend_income"].tolist() == [42.0, 0.0] - assert enriched["ordinary_dividend_income"].tolist() == 
[42.0, 0.0] - assert enriched["qualified_dividend_share"].tolist() == [30.0 / 42.0, 0.0] - - -def test_restore_dividend_components_from_composition_reconstructs_atomic_basis(): - frame = pd.DataFrame( - { - "dividend_income": [42.0, 10.0], - "qualified_dividend_share": [30.0 / 42.0, 0.25], - } - ) - - restored = restore_dividend_components_from_composition(frame) - - assert restored["qualified_dividend_income"].round(6).tolist() == [30.0, 2.5] - assert restored["non_qualified_dividend_income"].round(6).tolist() == [12.0, 7.5] - assert restored["ordinary_dividend_income"].round(6).tolist() == [42.0, 10.0] - assert restored["dividend_income"].round(6).tolist() == [42.0, 10.0] - - -def test_donor_imputation_blocks_keep_dividends_in_one_composition_block(): - blocks = donor_imputation_blocks( - { - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_interest_income", - "partnership_s_corp_income", - } - ) - - assert blocks == ( - ("dividend_income", "qualified_dividend_share"), - ("partnership_s_corp_income", "taxable_interest_income"), - ) - - -def test_donor_imputation_block_specs_include_match_strategies_and_restored_variables(): - specs = donor_imputation_block_specs( - { - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_interest_income", - } - ) - - assert specs[0].model_variables == ("dividend_income", "qualified_dividend_share") - assert specs[0].restored_variables == ( - "qualified_dividend_income", - "non_qualified_dividend_income", - ) - assert specs[0].strategy_for("dividend_income") is DonorMatchStrategy.RANK - assert specs[0].native_entity is EntityType.PERSON - assert specs[0].condition_entities == ( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ) - assert specs[0].strategy_for("qualified_dividend_share") is DonorMatchStrategy.RANK - assert specs[1].model_variables == ("taxable_interest_income",) - assert specs[1].native_entity is EntityType.PERSON - assert 
specs[1].condition_entities == ( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ) - assert specs[1].strategy_for("taxable_interest_income") is DonorMatchStrategy.RANK - - -def test_donor_imputation_block_specs_use_zero_inflated_matching_for_sparse_irs_amounts(): - specs = donor_imputation_block_specs( - { - "health_savings_account_ald", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - "partnership_s_corp_income", - } - ) - - by_variable = { - variable_name: spec for spec in specs for variable_name in spec.model_variables - } - for variable_name in ( - "health_savings_account_ald", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - ): - assert by_variable[variable_name].native_entity is EntityType.PERSON - assert by_variable[variable_name].condition_entities == ( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ) - assert ( - by_variable[variable_name].strategy_for(variable_name) - is DonorMatchStrategy.RANK - ) - assert by_variable["partnership_s_corp_income"].native_entity is EntityType.PERSON - assert ( - by_variable["partnership_s_corp_income"].strategy_for( - "partnership_s_corp_income" - ) - is DonorMatchStrategy.RANK - ) - - -def test_condition_var_compatibility_allows_household_controls_for_tax_unit_targets(): - assert is_condition_var_compatible_with_entity( - "state_fips", - target_entity=EntityType.TAX_UNIT, - ) - assert is_condition_var_compatible_with_entity( - "tenure", - target_entity=EntityType.TAX_UNIT, - ) - assert not is_condition_var_compatible_with_entity( - "age", - target_entity=EntityType.TAX_UNIT, - ) - - -def test_resolve_condition_entities_uses_variable_family_policy(): - assert resolve_condition_entities_for_targets(("taxable_interest_income",)) == ( - EntityType.PERSON, - EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ) - assert resolve_condition_entities_for_targets(("self_employment_income",)) == ( - EntityType.PERSON, - 
EntityType.HOUSEHOLD, - EntityType.TAX_UNIT, - ) - - -def test_condition_var_compatibility_with_targets_distinguishes_asset_and_labor_tax_vars(): - assert is_condition_var_compatible_with_targets( - "age", - target_variables=("taxable_interest_income",), - ) - assert is_condition_var_compatible_with_targets( - "age", - target_variables=("self_employment_income",), - ) - assert is_condition_var_compatible_with_targets( - "tenure", - target_variables=("taxable_interest_income",), - ) - - -def test_projected_condition_var_compatibility_promotes_person_features_to_group_entity(): - assert is_projected_condition_var_compatible( - "age", - projected_entity=EntityType.TAX_UNIT, - allowed_condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - ) - assert not is_projected_condition_var_compatible( - "age", - projected_entity=EntityType.HOUSEHOLD, - allowed_condition_entities=(EntityType.TAX_UNIT,), - ) - - -def test_resolve_variable_semantic_capabilities_marks_redundant_dividend_totals(): - capabilities = resolve_variable_semantic_capabilities( - { - "qualified_dividend_income", - "non_qualified_dividend_income", - "ordinary_dividend_income", - "dividend_income", - } - ) - - assert not capabilities["dividend_income"].authoritative - assert not capabilities["dividend_income"].usable_as_condition - assert not capabilities["ordinary_dividend_income"].authoritative - assert not capabilities["ordinary_dividend_income"].usable_as_condition - - -def test_variable_semantics_define_projection_aggregation_for_person_controls(): - from microplex_us.variables import variable_semantic_spec_for - - assert ( - EntityType.RECORD - not in variable_semantic_spec_for("age").allowed_condition_entities - ) - assert ( - variable_semantic_spec_for("age").projection_aggregation - is ProjectionAggregation.MAX - ) - assert ( - variable_semantic_spec_for("income").projection_aggregation - is ProjectionAggregation.SUM - ) - - -def test_state_program_proxy_semantics_are_registered(): - from 
microplex_us.variables import variable_semantic_spec_for - - has_medicaid = variable_semantic_spec_for("has_medicaid") - assert has_medicaid.support_family is VariableSupportFamily.SUPPORT_SENSITIVE - assert has_medicaid.donor_match_strategy is DonorMatchStrategy.RANK - assert has_medicaid.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT - assert has_medicaid.projection_aggregation is ProjectionAggregation.MAX - - for variable_name in ("public_assistance", "ssi", "social_security"): - spec = variable_semantic_spec_for(variable_name) - assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.RANK - - -def test_sparse_irs_ald_semantics_are_registered(): - from microplex_us.variables import variable_semantic_spec_for - - for variable_name in ( - "health_savings_account_ald", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - ): - spec = variable_semantic_spec_for(variable_name) - assert spec.native_entity is EntityType.PERSON - assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.RANK - assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT - - -def test_partnership_income_semantics_remain_person_native(): - from microplex_us.variables import variable_semantic_spec_for - - spec = variable_semantic_spec_for("partnership_s_corp_income") - assert spec.native_entity is EntityType.PERSON - assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.RANK - assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT - - -def test_sparse_irs_tax_variables_use_puf_irs_predictors(): - from microplex_us.variables import ( - PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS, - PUF_IRS_TAX_PREFERRED_CONDITION_VARS, - PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, - 
PUF_PARTNERSHIP_CHALLENGER_SHARED_CONDITION_VARS, - PUF_PENSION_CHALLENGER_SHARED_CONDITION_VARS, - variable_semantic_spec_for, - ) - - for variable_name in ( - "dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_interest_income", - "tax_exempt_interest_income", - "taxable_pension_income", - "taxable_social_security", - "student_loan_interest", - "health_savings_account_ald", - "self_employed_health_insurance_ald", - "self_employed_pension_contribution_ald", - "tax_unit_partnership_s_corp_income", - "partnership_s_corp_income", - ): - assert ( - variable_semantic_spec_for(variable_name).preferred_condition_vars - == PUF_IRS_TAX_PREFERRED_CONDITION_VARS - ) - assert ( - variable_semantic_spec_for(variable_name).supplemental_shared_condition_vars - == PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS - ) - - assert PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS == () - assert ( - variable_semantic_spec_for( - "taxable_interest_income" - ).challenger_shared_condition_vars - == PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ) - assert ( - variable_semantic_spec_for( - "qualified_dividend_income" - ).challenger_shared_condition_vars - == PUF_DIVIDEND_INTEREST_CHALLENGER_SHARED_CONDITION_VARS - ) - assert ( - variable_semantic_spec_for( - "taxable_pension_income" - ).challenger_shared_condition_vars - == PUF_PENSION_CHALLENGER_SHARED_CONDITION_VARS - ) - assert ( - variable_semantic_spec_for( - "partnership_s_corp_income" - ).challenger_shared_condition_vars - == PUF_PARTNERSHIP_CHALLENGER_SHARED_CONDITION_VARS - ) - - -def test_rental_income_components_use_sparse_asset_conditioning(): - from microplex_us.variables import ( - RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS, - variable_semantic_spec_for, - ) - - for variable_name in ("rental_income_positive", "rental_income_negative"): - spec = variable_semantic_spec_for(variable_name) - assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE - assert 
spec.donor_match_strategy is DonorMatchStrategy.RANK - assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT - assert ( - spec.preferred_condition_vars - == RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS - ) - - -def test_person_native_irs_semantics_match_current_policyengine_entities(): - from microplex_us.variables import variable_semantic_spec_for - - for variable_name in ( - "dividend_income", - "ordinary_dividend_income", - "qualified_dividend_income", - "non_qualified_dividend_income", - "taxable_interest_income", - "tax_exempt_interest_income", - "taxable_pension_income", - "taxable_social_security", - "student_loan_interest", - "self_employment_income", - ): - assert ( - variable_semantic_spec_for(variable_name).native_entity is EntityType.PERSON - ) - - -def test_self_employment_income_semantics_preserve_signed_support(): - from microplex_us.variables import variable_semantic_spec_for - - spec = variable_semantic_spec_for("self_employment_income") - - assert spec.support_family is VariableSupportFamily.CONTINUOUS - assert spec.donor_match_strategy is DonorMatchStrategy.RANK - - -def test_employment_income_donor_semantics_zero_minor_wages(): - frame = pd.DataFrame( - { - "age": [16.0, 19.0], - "employment_income": [50_000.0, 25_000.0], - } - ) - - adjusted = apply_donor_variable_semantics(frame, ("employment_income",)) - - assert adjusted["employment_income"].tolist() == [0.0, 25_000.0] - - -def test_employment_income_donor_semantics_zero_retired_senior_wages_without_esi(): - frame = pd.DataFrame( - { - "age": [68.0, 68.0, 68.0], - "employment_income": [80_000.0, 80_000.0, 80_000.0], - "social_security_retirement": [18_000.0, 18_000.0, 0.0], - "has_esi": [0.0, 1.0, 0.0], - } - ) - - adjusted = apply_donor_variable_semantics(frame, ("employment_income",)) - - assert adjusted["employment_income"].tolist() == [0.0, 80_000.0, 80_000.0] - - -def test_employment_income_donor_semantics_uses_unclassified_social_security_compatibly(): - frame = 
pd.DataFrame( - { - "age": [68.0, 68.0, 68.0], - "employment_income": [80_000.0, 80_000.0, 80_000.0], - "social_security": [18_000.0, 18_000.0, 0.0], - "has_esi": [0.0, 1.0, 0.0], - } - ) - - adjusted = apply_donor_variable_semantics(frame, ("employment_income",)) - - assert adjusted["social_security_retirement"].tolist() == [0.0, 0.0, 0.0] - assert adjusted["social_security_unclassified"].tolist() == [ - 18_000.0, - 18_000.0, - 0.0, - ] - assert adjusted["employment_income"].tolist() == [0.0, 80_000.0, 80_000.0] - - -def test_validate_donor_variable_semantics_reports_minor_positive_wages(): - frame = pd.DataFrame( - { - "age": [16.0, 19.0], - "employment_income": [50_000.0, 25_000.0], - } - ) - - reports = validate_donor_variable_semantics(frame, ("employment_income",)) - - assert len(reports) == 1 - assert reports[0].name == "minor_positive_employment_income" - assert reports[0].evaluated is True - assert reports[0].violating_row_count == 1 - assert reports[0].passed is False - - -def test_validate_donor_variable_semantics_passes_after_minor_wage_guard(): - frame = pd.DataFrame( - { - "age": [16.0, 19.0], - "employment_income": [50_000.0, 25_000.0], - } - ) - - adjusted = apply_donor_variable_semantics(frame, ("employment_income",)) - reports = validate_donor_variable_semantics(adjusted, ("employment_income",)) - - assert len(reports) == 1 - assert reports[0].violating_row_count == 0 - assert reports[0].passed is True diff --git a/tests/validation/test_downstream.py b/tests/validation/test_downstream.py deleted file mode 100644 index 6f17873c..00000000 --- a/tests/validation/test_downstream.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Downstream tax-benefit aggregate validation (B2). - -After calibration, the synthesized microdata is ingested by -``policyengine_us.Microsimulation``. 
This module computes a canonical -set of downstream aggregates — federal income tax, EITC, CTC, SNAP, -SSI, ACA PTC — and compares them against external benchmarks (IRS -SOI, USDA, SSA, CMS). The comparison is the validation a tax-microsim -reviewer actually wants: not whether input targets were hit, but -whether the downstream policy outputs computed on the synthetic frame -look like the real-world outputs. - -These tests drive: - -1. ``DownstreamBenchmark`` is a typed record for one - external-benchmark comparison (name, computed, benchmark, source, - unit). -2. ``compute_downstream_comparison`` returns a dict of benchmark - name → ``DownstreamBenchmark`` with absolute and relative errors. -3. The module's canonical benchmark set for 2024 includes the six - required headline aggregates. -4. Relative error is signed (computed − benchmark) / benchmark. -5. A benchmark record round-trips to JSON. -""" - -from __future__ import annotations - -import json -import sys -from pathlib import Path -from types import ModuleType, SimpleNamespace - -import pytest - -from microplex_us.validation.downstream import ( - DOWNSTREAM_BENCHMARKS_2024, - DownstreamBenchmark, - compute_downstream_aggregates, - compute_downstream_comparison, - compute_downstream_weighted_aggregate, -) - - -class TestDownstreamBenchmark: - def test_benchmark_record_fields(self) -> None: - record = DownstreamBenchmark( - name="eitc", - computed=65_000_000_000.0, - benchmark=64_000_000_000.0, - unit="USD", - source="IRS SOI 2024", - ) - assert record.abs_error == pytest.approx(1_000_000_000.0) - assert record.rel_error == pytest.approx(1_000_000_000.0 / 64_000_000_000.0) - - def test_benchmark_record_serializes_to_json(self) -> None: - record = DownstreamBenchmark( - name="snap", - computed=100.0, - benchmark=110.0, - unit="USD", - source="USDA 2024", - ) - as_json = json.loads(json.dumps(record.to_dict())) - assert as_json["name"] == "snap" - assert as_json["computed"] == 100.0 - assert 
as_json["benchmark"] == 110.0 - assert as_json["rel_error"] == pytest.approx(-10.0 / 110.0) - - def test_benchmark_zero_benchmark_returns_none_rel(self) -> None: - """Guard against divide-by-zero in report generation.""" - record = DownstreamBenchmark( - name="zero", - computed=5.0, - benchmark=0.0, - unit="USD", - source="test", - ) - assert record.rel_error is None - - -class TestDownstreamBenchmarksSet: - def test_2024_benchmark_set_covers_headline_aggregates(self) -> None: - names = {b.name for b in DOWNSTREAM_BENCHMARKS_2024} - assert names >= {"income_tax", "eitc", "ctc", "snap", "ssi", "aca_ptc"} - - def test_2024_benchmarks_have_sources_cited(self) -> None: - """No magic numbers — each benchmark must declare its source.""" - for benchmark in DOWNSTREAM_BENCHMARKS_2024: - assert benchmark.source, f"missing source on {benchmark.name}" - assert benchmark.benchmark > 0, f"non-positive benchmark on {benchmark.name}" - - -class TestComputeDownstreamComparison: - def test_compute_from_aggregates_dict(self) -> None: - """The pure comparison step: given computed numbers, wrap them - with their benchmarks and errors. No PE-sim needed. 
- """ - computed = { - "income_tax": 2_300_000_000_000.0, - "eitc": 64_000_000_000.0, - "ctc": 115_000_000_000.0, - "snap": 98_000_000_000.0, - "ssi": 66_000_000_000.0, - "aca_ptc": 55_000_000_000.0, - } - result = compute_downstream_comparison(computed, DOWNSTREAM_BENCHMARKS_2024) - - assert set(result) == set(computed) - eitc = result["eitc"] - assert eitc.computed == 64_000_000_000.0 - assert eitc.benchmark > 0 - assert abs(eitc.rel_error) < 0.2, "EITC computed ~ benchmark" - assert eitc.source - - def test_compute_skips_missing_variables(self) -> None: - """If a variable doesn't have a benchmark, it's silently omitted.""" - computed = {"not_a_benchmark_name": 1.0, "eitc": 60_000_000_000.0} - result = compute_downstream_comparison(computed, DOWNSTREAM_BENCHMARKS_2024) - assert "not_a_benchmark_name" not in result - assert "eitc" in result - - -class TestComputeDownstreamAggregates: - @staticmethod - def _fake_simulation( - *, - values: dict[str, list[float]], - entities: dict[str, str], - ): - class FakeMicrosimulation: - def __init__(self, dataset: str = "fake.h5") -> None: - self.dataset = dataset - self.tax_benefit_system = SimpleNamespace( - get_variable=lambda name: SimpleNamespace( - entity=SimpleNamespace(key=entities[name]) - ) - ) - - def calculate(self, variable: str, period: int): - assert period == 2024 - return SimpleNamespace(values=values[variable]) - - return FakeMicrosimulation() - - def test_uses_entity_weights_for_weighted_totals( - self, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ) -> None: - class FakeMicrosimulation: - def __init__(self, dataset: str) -> None: - self.dataset = dataset - self.tax_benefit_system = SimpleNamespace( - get_variable=lambda name: SimpleNamespace( - entity=SimpleNamespace( - key={ - "eitc": "tax_unit", - "snap": "spm_unit", - "ssi": "person", - }[name] - ) - ) - ) - - def calculate(self, variable: str, period: int): - assert period == 2024 - values = { - "eitc": [10.0, 20.0], - "tax_unit_weight": 
[100.0, 200.0], - "snap": [1.0, 2.0, 3.0], - "spm_unit_weight": [10.0, 20.0, 30.0], - "ssi": [7.0, 11.0], - "person_weight": [2.0, 3.0], - } - return SimpleNamespace(sum=lambda: sum(values[variable]), values=values[variable]) - - fake_module = ModuleType("policyengine_us") - fake_module.Microsimulation = FakeMicrosimulation - monkeypatch.setitem(sys.modules, "policyengine_us", fake_module) - - aggregates = compute_downstream_aggregates( - tmp_path / "fake.h5", - period=2024, - variables=("eitc", "snap", "ssi"), - ) - - assert aggregates["eitc"] == pytest.approx(10.0 * 100.0 + 20.0 * 200.0) - assert aggregates["snap"] == pytest.approx( - 1.0 * 10.0 + 2.0 * 20.0 + 3.0 * 30.0 - ) - assert aggregates["ssi"] == pytest.approx(7.0 * 2.0 + 11.0 * 3.0) - - def test_weighted_aggregate_rejects_unsupported_entity(self) -> None: - simulation = self._fake_simulation( - values={"odd_output": [1.0, 2.0]}, - entities={"odd_output": "benefit_unit"}, - ) - - with pytest.raises(ValueError, match="Unsupported entity"): - compute_downstream_weighted_aggregate( - simulation, - "odd_output", - period=2024, - ) - - def test_weighted_aggregate_rejects_value_weight_length_mismatch(self) -> None: - simulation = self._fake_simulation( - values={ - "eitc": [10.0, 20.0, 30.0], - "tax_unit_weight": [100.0, 200.0], - }, - entities={"eitc": "tax_unit"}, - ) - - with pytest.raises(ValueError, match="does not match"): - compute_downstream_weighted_aggregate(simulation, "eitc", period=2024) diff --git a/tests/validation/test_run_b2_batched.py b/tests/validation/test_run_b2_batched.py deleted file mode 100644 index f59069f3..00000000 --- a/tests/validation/test_run_b2_batched.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import annotations - -import importlib.util -from pathlib import Path - -import h5py -import numpy as np -import pytest - - -def _load_run_b2_batched_module(): - script_path = ( - Path(__file__).resolve().parents[2] / "scripts" / "run_b2_batched.py" - ) - spec = 
importlib.util.spec_from_file_location("run_b2_batched", script_path) - assert spec is not None and spec.loader is not None - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -class TestRunB2BatchedEntityResolution: - def test_prefers_policyengine_metadata_over_length_match(self) -> None: - module = _load_run_b2_batched_module() - arrays = { - "household_id": np.array([1, 2, 3]), - "tax_unit_id": np.array([10, 20, 30]), - "some_tax_unit_var": np.array([100.0, 200.0, 300.0]), - } - - entity = module._entity_of( - "some_tax_unit_var", - arrays, - variable_entities={"some_tax_unit_var": "tax_unit"}, - ) - - assert entity == "tax_unit" - - def test_ambiguous_length_match_raises_without_metadata(self) -> None: - module = _load_run_b2_batched_module() - arrays = { - "household_id": np.array([1, 2, 3]), - "tax_unit_id": np.array([10, 20, 30]), - "ambiguous_var": np.array([100.0, 200.0, 300.0]), - } - - with pytest.raises(ValueError, match="Ambiguous entity for variable"): - module._entity_of("ambiguous_var", arrays) - - def test_write_chunk_h5_slices_mixed_entities( - self, - tmp_path: Path, - ) -> None: - module = _load_run_b2_batched_module() - arrays = { - "household_id": np.array([1, 2]), - "household_weight": np.array([100.0, 200.0]), - "person_id": np.array([10, 11, 20]), - "person_household_id": np.array([1, 1, 2]), - "tax_unit_id": np.array([100, 200]), - "person_tax_unit_id": np.array([100, 100, 200]), - "tax_unit_weight": np.array([100.0, 200.0]), - "household_output": np.array([1.0, 2.0]), - "person_output": np.array([3.0, 4.0, 5.0]), - "tax_unit_output": np.array([6.0, 7.0]), - } - masks = module._build_entity_masks(arrays, np.array([1])) - output_path = tmp_path / "chunk.h5" - - module._write_chunk_h5( - arrays, - masks, - "2024", - output_path, - variable_entities={ - "household_output": "household", - "person_output": "person", - "tax_unit_output": "tax_unit", - }, - ) - - with h5py.File(output_path, "r") 
as handle: - assert handle["household_id"]["2024"][:].tolist() == [1] - assert handle["person_id"]["2024"][:].tolist() == [10, 11] - assert handle["tax_unit_id"]["2024"][:].tolist() == [100] - assert handle["household_output"]["2024"][:].tolist() == [1.0] - assert handle["person_output"]["2024"][:].tolist() == [3.0, 4.0] - assert handle["tax_unit_output"]["2024"][:].tolist() == [6.0] - assert handle["tax_unit_weight"]["2024"][:].tolist() == [100.0] diff --git a/uv.lock b/uv.lock deleted file mode 100644 index db81e926..00000000 --- a/uv.lock +++ /dev/null @@ -1,3135 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.13" -resolution-markers = [ - "python_full_version >= '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.14' and sys_platform == 'emscripten'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", - "python_full_version < '3.14' and sys_platform == 'win32'", - "python_full_version < '3.14' and sys_platform == 'emscripten'", - "python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", -] - -[[package]] -name = "accessible-pygments" -version = "0.0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bc/c1/bbac6a50d02774f91572938964c582fff4270eee73ab822a4aeea4d8b11b/accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872", size = 1377899, upload-time = "2024-05-10T11:23:10.216Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/3f/95338030883d8c8b91223b4e21744b04d11b161a3ef117295d8241f50ab4/accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7", size = 1395903, upload-time = "2024-05-10T11:23:08.421Z" }, -] - -[[package]] -name = "alabaster" -version = "0.7.16" -source = { registry = "https://pypi.org/simple" 
} -sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776, upload-time = "2024-01-10T00:56:10.189Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511, upload-time = "2024-01-10T00:56:08.388Z" }, -] - -[[package]] -name = "alembic" -version = "1.18.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mako" }, - { name = "sqlalchemy" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/94/13/8b084e0f2efb0275a1d534838844926f798bd766566b1375174e2448cd31/alembic-1.18.4.tar.gz", hash = "sha256:cb6e1fd84b6174ab8dbb2329f86d631ba9559dd78df550b57804d607672cedbc", size = 2056725, upload-time = "2026-02-10T16:00:47.195Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/29/6533c317b74f707ea28f8d633734dbda2119bbadfc61b2f3640ba835d0f7/alembic-1.18.4-py3-none-any.whl", hash = "sha256:a5ed4adcf6d8a4cb575f3d759f071b03cd6e5c7618eb796cb52497be25bfe19a", size = 263893, upload-time = "2026-02-10T16:00:49.997Z" }, -] - -[[package]] -name = "annotated-doc" -version = "0.0.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = 
"sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, -] - -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - -[[package]] -name = "anyio" -version = "4.13.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, -] - -[[package]] -name = "appnope" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" } -wheels = [ - { url 
= "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, -] - -[[package]] -name = "asttokens" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, -] - -[[package]] -name = "attrs" -version = "26.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, -] - -[[package]] -name = "babel" -version = "2.18.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = 
"sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" }, -] - -[[package]] -name = "beautifulsoup4" -version = "4.14.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, -] - -[[package]] -name = "blosc2" -version = "4.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "msgpack" }, - { name = "ndindex" }, - { name = "numexpr", marker = "platform_machine != 'wasm32'" }, - { name = "numpy" }, - { name = "pydantic" }, - { name = "requests" }, - { name = "threadpoolctl", marker = "platform_machine != 'wasm32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dc/19/e5ea8014688a1b0f3be16b662b6d0837da191f8fec79f3ca7c9452457ffe/blosc2-4.3.3.tar.gz", hash = "sha256:7c477482f10506a3b1e8868cb705b45caa64d13f42572878d0e4f26978ada55f", size = 5366402, upload-time = "2026-05-21T12:06:06.502Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/35/55/9178c8ef9a32613cd37a83275f584e1619df583c74678c1d47b6723e13b9/blosc2-4.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b21d6d3fb77b53d36f35a315bc4c4437a9f40cf35a76966871f5a8bdfbb07262", size = 5861725, upload-time = "2026-05-21T12:05:41.709Z" }, - { url = "https://files.pythonhosted.org/packages/ee/81/c30a9f3000e65acd677d0831cf23eb5d80d575d2cd5f05aa0ced29b52990/blosc2-4.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7f4b5bfb37121e46fc855d273a786cfc0c731802220f254241ac01ee1384fe06", size = 5025052, upload-time = "2026-05-21T12:05:43.964Z" }, - { url = "https://files.pythonhosted.org/packages/78/5f/b5b93cd123953120df0aa992fa93a7f41f1525c74f348c95f4d8e25721ae/blosc2-4.3.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a9458facbf4323377cb32b25478d200996b699e20338087291ca35aea32c8da8", size = 6254481, upload-time = "2026-05-21T12:05:45.363Z" }, - { url = "https://files.pythonhosted.org/packages/91/19/71fe5d7e517f30b595bf9e33154959eda23910db36991728fde51bdbdf42/blosc2-4.3.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b1bd93b4c7e8a96afddc17883e56a38e0b799876756371f64f94e639a51aa7e5", size = 6548217, upload-time = "2026-05-21T12:05:47.03Z" }, - { url = "https://files.pythonhosted.org/packages/98/88/337a23227257bc17a8a80c33fbaa7186534d75034da74c1ad5ea7e74b9f1/blosc2-4.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:0a4aa9fad69b8c125e959d005989991c3230dfa456caf21bb81186be9e2f61a2", size = 4127867, upload-time = "2026-05-21T12:05:48.552Z" }, - { url = "https://files.pythonhosted.org/packages/5a/6d/2e811ee8dc814cd9204058bb453961b19920f9ba222a39900b8975155d78/blosc2-4.3.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f1b37cdfc41c1da50b05b674416e495efb64054554d109433e3818310da99737", size = 5863540, upload-time = "2026-05-21T12:05:50.248Z" }, - { url = 
"https://files.pythonhosted.org/packages/ec/a4/5f61f69efb96c18185f5d1c5919ddb1183c33ea567eb5431762a93621c8c/blosc2-4.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c03b207730294e5809e872c5a5bd65f6d3370fdf101f1e6136e83621412d931e", size = 5027561, upload-time = "2026-05-21T12:05:51.744Z" }, - { url = "https://files.pythonhosted.org/packages/63/3a/b1f4e32b7d2947decba824a1ac34985713b7596d0986cbb22252fb67a128/blosc2-4.3.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d1834c6578d45bb50f48a293dd87fcc194cae5b45851ed07bb7e4116373d3f5b", size = 6266314, upload-time = "2026-05-21T12:05:53.278Z" }, - { url = "https://files.pythonhosted.org/packages/7e/41/e334180c1ba6f4ef1b20c6551e90aa3267d2996cff9e3f655425f35a49a9/blosc2-4.3.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1552a0257e4c9fc9b9594bbab5d8b908937a7a8d63bf3e33e8720fe59b4d0b03", size = 6549380, upload-time = "2026-05-21T12:05:54.866Z" }, - { url = "https://files.pythonhosted.org/packages/61/7f/35c785444704d2876402723e311ad14d387de0592682327978c10c0426a2/blosc2-4.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:fcd1bbc598542b2530fbd9fe5bf8146d15f01af4c2bcb12d2610160d218535d1", size = 4211185, upload-time = "2026-05-21T12:05:56.432Z" }, - { url = "https://files.pythonhosted.org/packages/44/9b/2823fd88862e9f2a54dd89fea98c051a3741e36bab5ad249e307eff5fcef/blosc2-4.3.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d0c8dc54155331a439c57a8858250c2af692ddaa39361c043c0544e5a2ea692b", size = 5901833, upload-time = "2026-05-21T12:05:58.04Z" }, - { url = "https://files.pythonhosted.org/packages/10/57/c67252bce832c0d6b3c54e4bfca7a6129037be64f78522f72b3fb2eec3dc/blosc2-4.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:36e03791fa1882ef25a5d0e2c3ed92e1b1265f05fb33094aa4c6b15dd5d7bf82", size = 5073470, upload-time = "2026-05-21T12:05:59.638Z" }, - { url = 
"https://files.pythonhosted.org/packages/ca/c4/1cb5ea5fed31740d9d7652944367703a1ff5cd76e74f15215e4691fd959e/blosc2-4.3.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:69afc003043df7bc090f096ad84aa952467025beb8b3bc906f77eca2e2728d07", size = 6229249, upload-time = "2026-05-21T12:06:01.512Z" }, - { url = "https://files.pythonhosted.org/packages/8f/f2/bfe3795f19b7868f41d4cfd98dbf551b771d78c9380adb39b97e08677f9f/blosc2-4.3.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee03608640c5e0363dd16f69ee8d5473d4a17d2a7b4a66bd9ac82fcfcaad6afd", size = 6519227, upload-time = "2026-05-21T12:06:03.338Z" }, - { url = "https://files.pythonhosted.org/packages/6f/a2/57590c5eba352614e4cff272d4b28b12b55bdd2fe1e1ed459b113d87cfe6/blosc2-4.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:322afea80b919e48b67b2730ee927449793d2856d65e54b9be67dfdfb610f2df", size = 4270602, upload-time = "2026-05-21T12:06:05.182Z" }, -] - -[[package]] -name = "boto3" -version = "1.43.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "botocore" }, - { name = "jmespath" }, - { name = "s3transfer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0a/37/78c630d1308964aa9abf44951d9c4df776546ff37251ec2434944e205c4e/boto3-1.43.6.tar.gz", hash = "sha256:e6315effaf12b890b99956e6f8e2c3000a3f64e4ee91943cec3895ce9a836afb", size = 113153, upload-time = "2026-05-07T20:49:59.694Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/e2/3c2eef44f55eafab256836d1d9479bd6a74f70c26cbfdc0639a0e23e4327/boto3-1.43.6-py3-none-any.whl", hash = "sha256:179601ec2992726a718053bf41e43c223ceba397d31ceab11f64d9c910d9fc3a", size = 140502, upload-time = "2026-05-07T20:49:57.8Z" }, -] - -[[package]] -name = "botocore" -version = "1.43.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jmespath" }, - { name = "python-dateutil" }, - { name = "urllib3" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/79/a7/23d0f5028011455096a1eeac0ddf3cbe147b3e855e127342f8202552194d/botocore-1.43.6.tar.gz", hash = "sha256:b1e395b347356860398da42e61c808cf1e34b6fa7180cf2b9d87d986e1a06ba0", size = 15336070, upload-time = "2026-05-07T20:49:48.14Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/c8/6f47223840e8d8cfa8c9f7c0ec1b77970417f257fc885169ff4f6326ce09/botocore-1.43.6-py3-none-any.whl", hash = "sha256:b6d1fdbc6f65a5fe0b7e947823aa37535d3f39f3ba4d21110fab1f55bbbcc04b", size = 15017094, upload-time = "2026-05-07T20:49:44.964Z" }, -] - -[[package]] -name = "census" -version = "0.8.26" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f7/e0/c1cde674716d836139550542febca6231616d776119ae73705036d741da7/census-0.8.26.tar.gz", hash = "sha256:c7f9944e38952b4ecc137d14d083018a1c2734f64d2fbc4a8946f35fd51888c2", size = 13019, upload-time = "2026-04-08T13:44:19.24Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/13/13dcc8a3142c3c73e5228c05e1ce6567378bc5c673d5567c116d4a8162d7/census-0.8.26-py3-none-any.whl", hash = "sha256:c341bbce4bcdd75c0ddecf75f28ab7eda26a47d7fecc95c4690a2d8ee5b6a727", size = 11364, upload-time = "2026-04-08T13:44:18.333Z" }, -] - -[[package]] -name = "certifi" -version = "2026.2.25" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, 
upload-time = "2026-02-25T02:54:15.766Z" }, -] - -[[package]] -name = "cffi" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, - { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, - { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, - { url = 
"https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, - { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, - { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, - { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, - { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, - { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, - { url = 
"https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, - { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, - { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, - { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, - { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, - { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, - { url = 
"https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, - { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, - { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, - { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, - { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" }, - { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, - { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, - { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, - { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, - { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, - { url = 
"https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, - { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, - { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, - { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, - { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, - { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/7b/60/e3bec1881450851b087e301bedc3daa9377a4d45f1c26aa90b0b235e38aa/charset_normalizer-3.4.6.tar.gz", hash = "sha256:1ae6b62897110aa7c79ea2f5dd38d1abca6db663687c0b1ad9aed6f6bae3d9d6", size = 143363, upload-time = "2026-03-15T18:53:25.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/1d/4fdabeef4e231153b6ed7567602f3b68265ec4e5b76d6024cf647d43d981/charset_normalizer-3.4.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:11afb56037cbc4b1555a34dd69151e8e069bee82e613a73bef6e714ce733585f", size = 294823, upload-time = "2026-03-15T18:51:15.755Z" }, - { url = "https://files.pythonhosted.org/packages/47/7b/20e809b89c69d37be748d98e84dce6820bf663cf19cf6b942c951a3e8f41/charset_normalizer-3.4.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:423fb7e748a08f854a08a222b983f4df1912b1daedce51a72bd24fe8f26a1843", size = 198527, upload-time = "2026-03-15T18:51:17.177Z" }, - { url = "https://files.pythonhosted.org/packages/37/a6/4f8d27527d59c039dce6f7622593cdcd3d70a8504d87d09eb11e9fdc6062/charset_normalizer-3.4.6-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d73beaac5e90173ac3deb9928a74763a6d230f494e4bfb422c217a0ad8e629bf", size = 218388, upload-time = "2026-03-15T18:51:18.934Z" }, - { url = "https://files.pythonhosted.org/packages/f6/9b/4770ccb3e491a9bacf1c46cc8b812214fe367c86a96353ccc6daf87b01ec/charset_normalizer-3.4.6-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d60377dce4511655582e300dc1e5a5f24ba0cb229005a1d5c8d0cb72bb758ab8", size = 214563, upload-time = "2026-03-15T18:51:20.374Z" }, - { url = "https://files.pythonhosted.org/packages/2b/58/a199d245894b12db0b957d627516c78e055adc3a0d978bc7f65ddaf7c399/charset_normalizer-3.4.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:530e8cebeea0d76bdcf93357aa5e41336f48c3dc709ac52da2bb167c5b8271d9", size = 206587, upload-time = "2026-03-15T18:51:21.807Z" }, - { url = "https://files.pythonhosted.org/packages/7e/70/3def227f1ec56f5c69dfc8392b8bd63b11a18ca8178d9211d7cc5e5e4f27/charset_normalizer-3.4.6-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:a26611d9987b230566f24a0a125f17fe0de6a6aff9f25c9f564aaa2721a5fb88", size = 194724, upload-time = "2026-03-15T18:51:23.508Z" }, - { url = "https://files.pythonhosted.org/packages/58/ab/9318352e220c05efd31c2779a23b50969dc94b985a2efa643ed9077bfca5/charset_normalizer-3.4.6-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:34315ff4fc374b285ad7f4a0bf7dcbfe769e1b104230d40f49f700d4ab6bbd84", size = 202956, upload-time = "2026-03-15T18:51:25.239Z" }, - { url = "https://files.pythonhosted.org/packages/75/13/f3550a3ac25b70f87ac98c40d3199a8503676c2f1620efbf8d42095cfc40/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ddd609f9e1af8c7bd6e2aca279c931aefecd148a14402d4e368f3171769fd", size = 201923, upload-time = "2026-03-15T18:51:26.682Z" }, - { url = "https://files.pythonhosted.org/packages/1b/db/c5c643b912740b45e8eec21de1bbab8e7fc085944d37e1e709d3dcd9d72f/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:80d0a5615143c0b3225e5e3ef22c8d5d51f3f72ce0ea6fb84c943546c7b25b6c", size = 195366, upload-time = "2026-03-15T18:51:28.129Z" }, - { url = "https://files.pythonhosted.org/packages/5a/67/3b1c62744f9b2448443e0eb160d8b001c849ec3fef591e012eda6484787c/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:92734d4d8d187a354a556626c221cd1a892a4e0802ccb2af432a1d85ec012194", size = 219752, upload-time = "2026-03-15T18:51:29.556Z" }, - { url = "https://files.pythonhosted.org/packages/f6/98/32ffbaf7f0366ffb0445930b87d103f6b406bc2c271563644bde8a2b1093/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_riscv64.whl", hash = 
"sha256:613f19aa6e082cf96e17e3ffd89383343d0d589abda756b7764cf78361fd41dc", size = 203296, upload-time = "2026-03-15T18:51:30.921Z" }, - { url = "https://files.pythonhosted.org/packages/41/12/5d308c1bbe60cabb0c5ef511574a647067e2a1f631bc8634fcafaccd8293/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:2b1a63e8224e401cafe7739f77efd3f9e7f5f2026bda4aead8e59afab537784f", size = 215956, upload-time = "2026-03-15T18:51:32.399Z" }, - { url = "https://files.pythonhosted.org/packages/53/e9/5f85f6c5e20669dbe56b165c67b0260547dea97dba7e187938833d791687/charset_normalizer-3.4.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6cceb5473417d28edd20c6c984ab6fee6c6267d38d906823ebfe20b03d607dc2", size = 208652, upload-time = "2026-03-15T18:51:34.214Z" }, - { url = "https://files.pythonhosted.org/packages/f1/11/897052ea6af56df3eef3ca94edafee410ca699ca0c7b87960ad19932c55e/charset_normalizer-3.4.6-cp313-cp313-win32.whl", hash = "sha256:d7de2637729c67d67cf87614b566626057e95c303bc0a55ffe391f5205e7003d", size = 143940, upload-time = "2026-03-15T18:51:36.15Z" }, - { url = "https://files.pythonhosted.org/packages/a1/5c/724b6b363603e419829f561c854b87ed7c7e31231a7908708ac086cdf3e2/charset_normalizer-3.4.6-cp313-cp313-win_amd64.whl", hash = "sha256:572d7c822caf521f0525ba1bce1a622a0b85cf47ffbdae6c9c19e3b5ac3c4389", size = 154101, upload-time = "2026-03-15T18:51:37.876Z" }, - { url = "https://files.pythonhosted.org/packages/01/a5/7abf15b4c0968e47020f9ca0935fb3274deb87cb288cd187cad92e8cdffd/charset_normalizer-3.4.6-cp313-cp313-win_arm64.whl", hash = "sha256:a4474d924a47185a06411e0064b803c68be044be2d60e50e8bddcc2649957c1f", size = 143109, upload-time = "2026-03-15T18:51:39.565Z" }, - { url = "https://files.pythonhosted.org/packages/25/6f/ffe1e1259f384594063ea1869bfb6be5cdb8bc81020fc36c3636bc8302a1/charset_normalizer-3.4.6-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:9cc6e6d9e571d2f863fa77700701dae73ed5f78881efc8b3f9a4398772ff53e8", size = 294458, 
upload-time = "2026-03-15T18:51:41.134Z" }, - { url = "https://files.pythonhosted.org/packages/56/60/09bb6c13a8c1016c2ed5c6a6488e4ffef506461aa5161662bd7636936fb1/charset_normalizer-3.4.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5960d965e67165d75b7c7ffc60a83ec5abfc5c11b764ec13ea54fbef8b4421", size = 199277, upload-time = "2026-03-15T18:51:42.953Z" }, - { url = "https://files.pythonhosted.org/packages/00/50/dcfbb72a5138bbefdc3332e8d81a23494bf67998b4b100703fd15fa52d81/charset_normalizer-3.4.6-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b3694e3f87f8ac7ce279d4355645b3c878d24d1424581b46282f24b92f5a4ae2", size = 218758, upload-time = "2026-03-15T18:51:44.339Z" }, - { url = "https://files.pythonhosted.org/packages/03/b3/d79a9a191bb75f5aa81f3aaaa387ef29ce7cb7a9e5074ba8ea095cc073c2/charset_normalizer-3.4.6-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5d11595abf8dd942a77883a39d81433739b287b6aa71620f15164f8096221b30", size = 215299, upload-time = "2026-03-15T18:51:45.871Z" }, - { url = "https://files.pythonhosted.org/packages/76/7e/bc8911719f7084f72fd545f647601ea3532363927f807d296a8c88a62c0d/charset_normalizer-3.4.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7bda6eebafd42133efdca535b04ccb338ab29467b3f7bf79569883676fc628db", size = 206811, upload-time = "2026-03-15T18:51:47.308Z" }, - { url = "https://files.pythonhosted.org/packages/e2/40/c430b969d41dda0c465aa36cc7c2c068afb67177bef50905ac371b28ccc7/charset_normalizer-3.4.6-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:bbc8c8650c6e51041ad1be191742b8b421d05bbd3410f43fa2a00c8db87678e8", size = 193706, upload-time = "2026-03-15T18:51:48.849Z" }, - { url = 
"https://files.pythonhosted.org/packages/48/15/e35e0590af254f7df984de1323640ef375df5761f615b6225ba8deb9799a/charset_normalizer-3.4.6-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:22c6f0c2fbc31e76c3b8a86fba1a56eda6166e238c29cdd3d14befdb4a4e4815", size = 202706, upload-time = "2026-03-15T18:51:50.257Z" }, - { url = "https://files.pythonhosted.org/packages/5e/bd/f736f7b9cc5e93a18b794a50346bb16fbfd6b37f99e8f306f7951d27c17c/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7edbed096e4a4798710ed6bc75dcaa2a21b68b6c356553ac4823c3658d53743a", size = 202497, upload-time = "2026-03-15T18:51:52.012Z" }, - { url = "https://files.pythonhosted.org/packages/9d/ba/2cc9e3e7dfdf7760a6ed8da7446d22536f3d0ce114ac63dee2a5a3599e62/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:7f9019c9cb613f084481bd6a100b12e1547cf2efe362d873c2e31e4035a6fa43", size = 193511, upload-time = "2026-03-15T18:51:53.723Z" }, - { url = "https://files.pythonhosted.org/packages/9e/cb/5be49b5f776e5613be07298c80e1b02a2d900f7a7de807230595c85a8b2e/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:58c948d0d086229efc484fe2f30c2d382c86720f55cd9bc33591774348ad44e0", size = 220133, upload-time = "2026-03-15T18:51:55.333Z" }, - { url = "https://files.pythonhosted.org/packages/83/43/99f1b5dad345accb322c80c7821071554f791a95ee50c1c90041c157ae99/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:419a9d91bd238052642a51938af8ac05da5b3343becde08d5cdeab9046df9ee1", size = 203035, upload-time = "2026-03-15T18:51:56.736Z" }, - { url = "https://files.pythonhosted.org/packages/87/9a/62c2cb6a531483b55dddff1a68b3d891a8b498f3ca555fbcf2978e804d9d/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5273b9f0b5835ff0350c0828faea623c68bfa65b792720c453e22b25cc72930f", size = 216321, upload-time = "2026-03-15T18:51:58.17Z" }, - { url = 
"https://files.pythonhosted.org/packages/6e/79/94a010ff81e3aec7c293eb82c28f930918e517bc144c9906a060844462eb/charset_normalizer-3.4.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:0e901eb1049fdb80f5bd11ed5ea1e498ec423102f7a9b9e4645d5b8204ff2815", size = 208973, upload-time = "2026-03-15T18:51:59.998Z" }, - { url = "https://files.pythonhosted.org/packages/2a/57/4ecff6d4ec8585342f0c71bc03efaa99cb7468f7c91a57b105bcd561cea8/charset_normalizer-3.4.6-cp314-cp314-win32.whl", hash = "sha256:b4ff1d35e8c5bd078be89349b6f3a845128e685e751b6ea1169cf2160b344c4d", size = 144610, upload-time = "2026-03-15T18:52:02.213Z" }, - { url = "https://files.pythonhosted.org/packages/80/94/8434a02d9d7f168c25767c64671fead8d599744a05d6a6c877144c754246/charset_normalizer-3.4.6-cp314-cp314-win_amd64.whl", hash = "sha256:74119174722c4349af9708993118581686f343adc1c8c9c007d59be90d077f3f", size = 154962, upload-time = "2026-03-15T18:52:03.658Z" }, - { url = "https://files.pythonhosted.org/packages/46/4c/48f2cdbfd923026503dfd67ccea45c94fd8fe988d9056b468579c66ed62b/charset_normalizer-3.4.6-cp314-cp314-win_arm64.whl", hash = "sha256:e5bcc1a1ae744e0bb59641171ae53743760130600da8db48cbb6e4918e186e4e", size = 143595, upload-time = "2026-03-15T18:52:05.123Z" }, - { url = "https://files.pythonhosted.org/packages/31/93/8878be7569f87b14f1d52032946131bcb6ebbd8af3e20446bc04053dc3f1/charset_normalizer-3.4.6-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:ad8faf8df23f0378c6d527d8b0b15ea4a2e23c89376877c598c4870d1b2c7866", size = 314828, upload-time = "2026-03-15T18:52:06.831Z" }, - { url = "https://files.pythonhosted.org/packages/06/b6/fae511ca98aac69ecc35cde828b0a3d146325dd03d99655ad38fc2cc3293/charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f5ea69428fa1b49573eef0cc44a1d43bebd45ad0c611eb7d7eac760c7ae771bc", size = 208138, upload-time = "2026-03-15T18:52:08.239Z" }, - { url = 
"https://files.pythonhosted.org/packages/54/57/64caf6e1bf07274a1e0b7c160a55ee9e8c9ec32c46846ce59b9c333f7008/charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:06a7e86163334edfc5d20fe104db92fcd666e5a5df0977cb5680a506fe26cc8e", size = 224679, upload-time = "2026-03-15T18:52:10.043Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cb/9ff5a25b9273ef160861b41f6937f86fae18b0792fe0a8e75e06acb08f1d/charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e1f6e2f00a6b8edb562826e4632e26d063ac10307e80f7461f7de3ad8ef3f077", size = 223475, upload-time = "2026-03-15T18:52:11.854Z" }, - { url = "https://files.pythonhosted.org/packages/fc/97/440635fc093b8d7347502a377031f9605a1039c958f3cd18dcacffb37743/charset_normalizer-3.4.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95b52c68d64c1878818687a473a10547b3292e82b6f6fe483808fb1468e2f52f", size = 215230, upload-time = "2026-03-15T18:52:13.325Z" }, - { url = "https://files.pythonhosted.org/packages/cd/24/afff630feb571a13f07c8539fbb502d2ab494019492aaffc78ef41f1d1d0/charset_normalizer-3.4.6-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:7504e9b7dc05f99a9bbb4525c67a2c155073b44d720470a148b34166a69c054e", size = 199045, upload-time = "2026-03-15T18:52:14.752Z" }, - { url = "https://files.pythonhosted.org/packages/e5/17/d1399ecdaf7e0498c327433e7eefdd862b41236a7e484355b8e0e5ebd64b/charset_normalizer-3.4.6-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:172985e4ff804a7ad08eebec0a1640ece87ba5041d565fff23c8f99c1f389484", size = 211658, upload-time = "2026-03-15T18:52:16.278Z" }, - { url = "https://files.pythonhosted.org/packages/b5/38/16baa0affb957b3d880e5ac2144caf3f9d7de7bc4a91842e447fbb5e8b67/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:4be9f4830ba8741527693848403e2c457c16e499100963ec711b1c6f2049b7c7", size = 210769, upload-time = "2026-03-15T18:52:17.782Z" }, - { url = "https://files.pythonhosted.org/packages/05/34/c531bc6ac4c21da9ddfddb3107be2287188b3ea4b53b70fc58f2a77ac8d8/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:79090741d842f564b1b2827c0b82d846405b744d31e84f18d7a7b41c20e473ff", size = 201328, upload-time = "2026-03-15T18:52:19.553Z" }, - { url = "https://files.pythonhosted.org/packages/fa/73/a5a1e9ca5f234519c1953608a03fe109c306b97fdfb25f09182babad51a7/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:87725cfb1a4f1f8c2fc9890ae2f42094120f4b44db9360be5d99a4c6b0e03a9e", size = 225302, upload-time = "2026-03-15T18:52:21.043Z" }, - { url = "https://files.pythonhosted.org/packages/ba/f6/cd782923d112d296294dea4bcc7af5a7ae0f86ab79f8fefbda5526b6cfc0/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:fcce033e4021347d80ed9c66dcf1e7b1546319834b74445f561d2e2221de5659", size = 211127, upload-time = "2026-03-15T18:52:22.491Z" }, - { url = "https://files.pythonhosted.org/packages/0e/c5/0b6898950627af7d6103a449b22320372c24c6feda91aa24e201a478d161/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:ca0276464d148c72defa8bb4390cce01b4a0e425f3b50d1435aa6d7a18107602", size = 222840, upload-time = "2026-03-15T18:52:24.113Z" }, - { url = "https://files.pythonhosted.org/packages/7d/25/c4bba773bef442cbdc06111d40daa3de5050a676fa26e85090fc54dd12f0/charset_normalizer-3.4.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:197c1a244a274bb016dd8b79204850144ef77fe81c5b797dc389327adb552407", size = 216890, upload-time = "2026-03-15T18:52:25.541Z" }, - { url = "https://files.pythonhosted.org/packages/35/1a/05dacadb0978da72ee287b0143097db12f2e7e8d3ffc4647da07a383b0b7/charset_normalizer-3.4.6-cp314-cp314t-win32.whl", hash = 
"sha256:2a24157fa36980478dd1770b585c0f30d19e18f4fb0c47c13aa568f871718579", size = 155379, upload-time = "2026-03-15T18:52:27.05Z" }, - { url = "https://files.pythonhosted.org/packages/5d/7a/d269d834cb3a76291651256f3b9a5945e81d0a49ab9f4a498964e83c0416/charset_normalizer-3.4.6-cp314-cp314t-win_amd64.whl", hash = "sha256:cd5e2801c89992ed8c0a3f0293ae83c159a60d9a5d685005383ef4caca77f2c4", size = 169043, upload-time = "2026-03-15T18:52:28.502Z" }, - { url = "https://files.pythonhosted.org/packages/23/06/28b29fba521a37a8932c6a84192175c34d49f84a6d4773fa63d05f9aff22/charset_normalizer-3.4.6-cp314-cp314t-win_arm64.whl", hash = "sha256:47955475ac79cc504ef2704b192364e51d0d473ad452caedd0002605f780101c", size = 148523, upload-time = "2026-03-15T18:52:29.956Z" }, - { url = "https://files.pythonhosted.org/packages/2a/68/687187c7e26cb24ccbd88e5069f5ef00eba804d36dde11d99aad0838ab45/charset_normalizer-3.4.6-py3-none-any.whl", hash = "sha256:947cf925bc916d90adba35a64c82aace04fa39b46b52d4630ece166655905a69", size = 61455, upload-time = "2026-03-15T18:53:23.833Z" }, -] - -[[package]] -name = "click" -version = "8.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "colorlog" -version = "6.10.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" }, -] - -[[package]] -name = "comm" -version = "0.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 
7294, upload-time = "2025-07-25T14:02:02.896Z" }, -] - -[[package]] -name = "cuda-bindings" -version = "13.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cuda-pathfinder", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/df/93/eef988860a3ca985f82c4f3174fc0cdd94e07331ba9a92e8e064c260337f/cuda_bindings-13.2.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6629ca2df6f795b784752409bcaedbd22a7a651b74b56a165ebc0c9dcbd504d0", size = 5614610, upload-time = "2026-03-11T00:12:50.337Z" }, - { url = "https://files.pythonhosted.org/packages/18/23/6db3aba46864aee357ab2415135b3fe3da7e9f1fa0221fa2a86a5968099c/cuda_bindings-13.2.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7dca0da053d3b4cc4869eff49c61c03f3c5dbaa0bcd712317a358d5b8f3f385d", size = 6149914, upload-time = "2026-03-11T00:12:52.374Z" }, - { url = "https://files.pythonhosted.org/packages/c0/87/87a014f045b77c6de5c8527b0757fe644417b184e5367db977236a141602/cuda_bindings-13.2.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6464b30f46692d6c7f65d4a0e0450d81dd29de3afc1bb515653973d01c2cd6e", size = 5685673, upload-time = "2026-03-11T00:12:56.371Z" }, - { url = "https://files.pythonhosted.org/packages/ee/5e/c0fe77a73aaefd3fff25ffaccaac69c5a63eafdf8b9a4c476626ef0ac703/cuda_bindings-13.2.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4af9f3e1be603fa12d5ad6cfca7844c9d230befa9792b5abdf7dd79979c3626", size = 6191386, upload-time = "2026-03-11T00:12:58.965Z" }, - { url = "https://files.pythonhosted.org/packages/5f/58/ed2c3b39c8dd5f96aa7a4abef0d47a73932c7a988e30f5fa428f00ed0da1/cuda_bindings-13.2.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df850a1ff8ce1b3385257b08e47b70e959932f5f432d0a4e46a355962b4e4771", size = 5507469, upload-time = 
"2026-03-11T00:13:04.063Z" }, - { url = "https://files.pythonhosted.org/packages/1f/01/0c941b112ceeb21439b05895eace78ca1aa2eaaf695c8521a068fd9b4c00/cuda_bindings-13.2.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8a16384c6494e5485f39314b0b4afb04bee48d49edb16d5d8593fd35bbd231b", size = 6059693, upload-time = "2026-03-11T00:13:06.003Z" }, -] - -[[package]] -name = "cuda-pathfinder" -version = "1.4.4" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/66/7b2c3d23dac4bb9629b4d9702f1f796bd41c01142c2b47be6fcfdeaf4ee4/cuda_pathfinder-1.4.4-py3-none-any.whl", hash = "sha256:1a9e7feccae0d969ad88545d0462f2ed2750df8e6732309798dc1e1ca603a28b", size = 48834, upload-time = "2026-03-23T20:50:00.706Z" }, -] - -[[package]] -name = "cuda-toolkit" -version = "13.0.2" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/b2/453099f5f3b698d7d0eab38916aac44c7f76229f451709e2eb9db6615dcd/cuda_toolkit-13.0.2-py2.py3-none-any.whl", hash = "sha256:b198824cf2f54003f50d64ada3a0f184b42ca0846c1c94192fa269ecd97a66eb", size = 2364, upload-time = "2025-12-19T23:24:07.328Z" }, -] - -[package.optional-dependencies] -cublas = [ - { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, -] -cudart = [ - { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux'" }, -] -cufft = [ - { name = "nvidia-cufft", marker = "sys_platform == 'linux'" }, -] -cufile = [ - { name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, -] -cupti = [ - { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux'" }, -] -curand = [ - { name = "nvidia-curand", marker = "sys_platform == 'linux'" }, -] -cusolver = [ - { name = "nvidia-cusolver", marker = "sys_platform == 'linux'" }, -] -cusparse = [ - { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" }, -] -nvjitlink = [ - { name = "nvidia-nvjitlink", marker = "sys_platform == 
'linux'" }, -] -nvrtc = [ - { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux'" }, -] -nvtx = [ - { name = "nvidia-nvtx", marker = "sys_platform == 'linux'" }, -] - -[[package]] -name = "debugpy" -version = "1.8.20" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e0/b7/cd8080344452e4874aae67c40d8940e2b4d47b01601a8fd9f44786c757c7/debugpy-1.8.20.tar.gz", hash = "sha256:55bc8701714969f1ab89a6d5f2f3d40c36f91b2cbe2f65d98bf8196f6a6a2c33", size = 1645207, upload-time = "2026-01-29T23:03:28.199Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/15/e2/fc500524cc6f104a9d049abc85a0a8b3f0d14c0a39b9c140511c61e5b40b/debugpy-1.8.20-cp313-cp313-macosx_15_0_universal2.whl", hash = "sha256:5dff4bb27027821fdfcc9e8f87309a28988231165147c31730128b1c983e282a", size = 2539560, upload-time = "2026-01-29T23:03:48.738Z" }, - { url = "https://files.pythonhosted.org/packages/90/83/fb33dcea789ed6018f8da20c5a9bc9d82adc65c0c990faed43f7c955da46/debugpy-1.8.20-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:84562982dd7cf5ebebfdea667ca20a064e096099997b175fe204e86817f64eaf", size = 4293272, upload-time = "2026-01-29T23:03:50.169Z" }, - { url = "https://files.pythonhosted.org/packages/a6/25/b1e4a01bfb824d79a6af24b99ef291e24189080c93576dfd9b1a2815cd0f/debugpy-1.8.20-cp313-cp313-win32.whl", hash = "sha256:da11dea6447b2cadbf8ce2bec59ecea87cc18d2c574980f643f2d2dfe4862393", size = 5331208, upload-time = "2026-01-29T23:03:51.547Z" }, - { url = "https://files.pythonhosted.org/packages/13/f7/a0b368ce54ffff9e9028c098bd2d28cfc5b54f9f6c186929083d4c60ba58/debugpy-1.8.20-cp313-cp313-win_amd64.whl", hash = "sha256:eb506e45943cab2efb7c6eafdd65b842f3ae779f020c82221f55aca9de135ed7", size = 5372930, upload-time = "2026-01-29T23:03:53.585Z" }, - { url = "https://files.pythonhosted.org/packages/33/2e/f6cb9a8a13f5058f0a20fe09711a7b726232cd5a78c6a7c05b2ec726cff9/debugpy-1.8.20-cp314-cp314-macosx_15_0_universal2.whl", hash = 
"sha256:9c74df62fc064cd5e5eaca1353a3ef5a5d50da5eb8058fcef63106f7bebe6173", size = 2538066, upload-time = "2026-01-29T23:03:54.999Z" }, - { url = "https://files.pythonhosted.org/packages/c5/56/6ddca50b53624e1ca3ce1d1e49ff22db46c47ea5fb4c0cc5c9b90a616364/debugpy-1.8.20-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:077a7447589ee9bc1ff0cdf443566d0ecf540ac8aa7333b775ebcb8ce9f4ecad", size = 4269425, upload-time = "2026-01-29T23:03:56.518Z" }, - { url = "https://files.pythonhosted.org/packages/c5/d9/d64199c14a0d4c476df46c82470a3ce45c8d183a6796cfb5e66533b3663c/debugpy-1.8.20-cp314-cp314-win32.whl", hash = "sha256:352036a99dd35053b37b7803f748efc456076f929c6a895556932eaf2d23b07f", size = 5331407, upload-time = "2026-01-29T23:03:58.481Z" }, - { url = "https://files.pythonhosted.org/packages/e0/d9/1f07395b54413432624d61524dfd98c1a7c7827d2abfdb8829ac92638205/debugpy-1.8.20-cp314-cp314-win_amd64.whl", hash = "sha256:a98eec61135465b062846112e5ecf2eebb855305acc1dfbae43b72903b8ab5be", size = 5372521, upload-time = "2026-01-29T23:03:59.864Z" }, - { url = "https://files.pythonhosted.org/packages/e0/c3/7f67dea8ccf8fdcb9c99033bbe3e90b9e7395415843accb81428c441be2d/debugpy-1.8.20-py2.py3-none-any.whl", hash = "sha256:5be9bed9ae3be00665a06acaa48f8329d2b9632f15fd09f6a9a8c8d9907e54d7", size = 5337658, upload-time = "2026-01-29T23:04:17.404Z" }, -] - -[[package]] -name = "decorator" -version = "5.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size 
= 9190, upload-time = "2025-02-24T04:41:32.565Z" }, -] - -[[package]] -name = "docutils" -version = "0.18.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/b1/b880503681ea1b64df05106fc7e3c4e3801736cf63deffc6fa7fc5404cf5/docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06", size = 2043249, upload-time = "2021-11-23T17:49:42.043Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/14/69b4bad34e3f250afe29a854da03acb6747711f3df06c359fa053fae4e76/docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c", size = 570050, upload-time = "2021-11-23T17:49:38.556Z" }, -] - -[[package]] -name = "dpath" -version = "2.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b5/ce/e1fd64d36e4a5717bd5e6b2ad188f5eaa2e902fde871ea73a79875793fc9/dpath-2.2.0.tar.gz", hash = "sha256:34f7e630dc55ea3f219e555726f5da4b4b25f2200319c8e6902c394258dd6a3e", size = 28266, upload-time = "2024-06-12T22:08:03.686Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/d1/8952806fbf9583004ab479d8f58a9496c3d35f6b6009ddd458bdd9978eaf/dpath-2.2.0-py3-none-any.whl", hash = "sha256:b330a375ded0a0d2ed404440f6c6a715deae5313af40bbb01c8a41d891900576", size = 17618, upload-time = "2024-06-12T22:08:01.881Z" }, -] - -[[package]] -name = "duckdb" -version = "1.5.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ae/62/590caabec6c41003f46a244b6fd707d35ca2e552e0c70cbf454e08bf6685/duckdb-1.5.1.tar.gz", hash = "sha256:b370d1620a34a4538ef66524fcee9de8171fa263c701036a92bc0b4c1f2f9c6d", size = 17995082, upload-time = "2026-03-23T12:12:15.894Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/a5/f2/af476945e3b97417945b0f660b5efa661863547c0ea104251bb6387342b1/duckdb-1.5.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:26e56b5f0c96189e3288d83cf7b476e23615987902f801e5788dee15ee9f24a9", size = 30113759, upload-time = "2026-03-23T12:11:26.5Z" }, - { url = "https://files.pythonhosted.org/packages/fe/9d/5a542b3933647369e601175190093597ce0ac54909aea0dd876ec51ffad4/duckdb-1.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:972d0dbf283508f9bc446ee09c3838cb7c7f114b5bdceee41753288c97fe2f7c", size = 15991463, upload-time = "2026-03-23T12:11:30.025Z" }, - { url = "https://files.pythonhosted.org/packages/53/a5/b59cff67f5e0420b8f337ad86406801cffacae219deed83961dcceefda67/duckdb-1.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:482f8a13f2600f527e427f73c42b5aa75536f9892868068f0aaf573055a0135f", size = 14246482, upload-time = "2026-03-23T12:11:33.33Z" }, - { url = "https://files.pythonhosted.org/packages/e9/12/d72a82fe502aae82b97b481bf909be8e22db5a403290799ad054b4f90eb4/duckdb-1.5.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da137802688190835b4c863cafa77fd7e29dff662ee6d905a9ffc14f00299c91", size = 19270816, upload-time = "2026-03-23T12:11:36.79Z" }, - { url = "https://files.pythonhosted.org/packages/f9/c3/ee49319b15f139e04c067378f0e763f78336fbab38ba54b0852467dd9da4/duckdb-1.5.1-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5d4147422d91ccdc2d2abf6ed24196025e020259d1d267970ae20c13c2ce84b1", size = 21385695, upload-time = "2026-03-23T12:11:40.465Z" }, - { url = "https://files.pythonhosted.org/packages/a8/f5/a15498e75a27a136c791ca1889beade96d388dadf9811375db155fc96d1a/duckdb-1.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:05fc91767d0cfc4cf2fa68966ab5b479ac07561752e42dd0ae30327bd160f64a", size = 13084065, upload-time = "2026-03-23T12:11:43.763Z" }, - { url = 
"https://files.pythonhosted.org/packages/93/81/b3612d2bbe237f75791095e16767c61067ea5d31c76e8591c212dac13bd0/duckdb-1.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:a28531cee2a5a42d89f9ba4da53bfeb15681f12acc0263476c8705380dadce07", size = 13892892, upload-time = "2026-03-23T12:11:47.222Z" }, - { url = "https://files.pythonhosted.org/packages/ad/75/e9e7893542ca738bcde2d41d459e3438950219c71c57ad28b049dc2ae616/duckdb-1.5.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:eba81e0b3011c1f23df7ea47ef4ffaa8239817959ae291515b6efd068bde2161", size = 30123677, upload-time = "2026-03-23T12:11:51.511Z" }, - { url = "https://files.pythonhosted.org/packages/df/db/f7420ee7109a922124c02f377ae1c56156e9e4aa434f4726848adaef0219/duckdb-1.5.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:afab8b4b1f4469c3879bb049dd039f8fce402712050324e9524a43d7324c5e87", size = 15996808, upload-time = "2026-03-23T12:11:54.964Z" }, - { url = "https://files.pythonhosted.org/packages/df/57/2c4c3de1f1110417592741863ba58b4eca2f7690a421712762ddbdcd72e6/duckdb-1.5.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:71dddcebbc5a70e946a06c30b59b5dd7999c9833d307168f90fb4e4b672ab63e", size = 14248990, upload-time = "2026-03-23T12:11:58.576Z" }, - { url = "https://files.pythonhosted.org/packages/2b/81/e173b33ffac53124a3e39e97fb60a538f26651a0df6e393eb9bf7540126c/duckdb-1.5.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac2804043bd1bc10b5da18f8f4c706877197263a510c41be9b4c0062f5783dcc", size = 19276013, upload-time = "2026-03-23T12:12:02.034Z" }, - { url = "https://files.pythonhosted.org/packages/d4/4c/47e838393aa90d3d78549c8c04cb09452efeb14aaae0ee24dc0bd61c3a41/duckdb-1.5.1-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8843bd9594e1387f1e601439e19ad73abdf57356104fd1e53a708255bb95a13d", size = 21387569, upload-time = "2026-03-23T12:12:05.693Z" }, - { url = 
"https://files.pythonhosted.org/packages/f4/9b/ce65743e0e85f5c984d2f7e8a81bc908d0bac345d6d8b6316436b29430e7/duckdb-1.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:d68c5a01a283cb13b79eafe016fe5869aa11bff8c46e7141c70aa0aac808010f", size = 13603876, upload-time = "2026-03-23T12:12:09.344Z" }, - { url = "https://files.pythonhosted.org/packages/e6/ac/f9e4e731635192571f86f52d86234f537c7f8ca4f6917c56b29051c077ef/duckdb-1.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:a3be2072315982e232bfe49c9d3db0a59ba67b2240a537ef42656cc772a887c7", size = 14370790, upload-time = "2026-03-23T12:12:12.497Z" }, -] - -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, -] - -[[package]] -name = "executing" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = 
"2025-09-01T09:48:08.5Z" }, -] - -[[package]] -name = "fastjsonschema" -version = "2.21.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/b5/23b216d9d985a956623b6bd12d4086b60f0059b27799f23016af04a74ea1/fastjsonschema-2.21.2.tar.gz", hash = "sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de", size = 374130, upload-time = "2025-08-14T18:49:36.666Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" }, -] - -[[package]] -name = "filelock" -version = "3.25.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, -] - -[[package]] -name = "fsspec" -version = "2026.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, -] - -[[package]] -name = "greenlet" -version = "3.3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a3/51/1664f6b78fc6ebbd98019a1fd730e83fa78f2db7058f72b1463d3612b8db/greenlet-3.3.2.tar.gz", hash = "sha256:2eaf067fc6d886931c7962e8c6bede15d2f01965560f3359b27c80bde2d151f2", size = 188267, upload-time = "2026-02-20T20:54:15.531Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/48/f8b875fa7dea7dd9b33245e37f065af59df6a25af2f9561efa8d822fde51/greenlet-3.3.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa6ac98bdfd716a749b84d4034486863fd81c3abde9aa3cf8eff9127981a4ae4", size = 279120, upload-time = "2026-02-20T20:19:01.9Z" }, - { url = "https://files.pythonhosted.org/packages/49/8d/9771d03e7a8b1ee456511961e1b97a6d77ae1dea4a34a5b98eee706689d3/greenlet-3.3.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab0c7e7901a00bc0a7284907273dc165b32e0d109a6713babd04471327ff7986", size = 603238, upload-time = "2026-02-20T20:47:32.873Z" }, - { url = "https://files.pythonhosted.org/packages/59/0e/4223c2bbb63cd5c97f28ffb2a8aee71bdfb30b323c35d409450f51b91e3e/greenlet-3.3.2-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d248d8c23c67d2291ffd47af766e2a3aa9fa1c6703155c099feb11f526c63a92", size = 614219, upload-time = "2026-02-20T20:55:59.817Z" }, - { url = "https://files.pythonhosted.org/packages/7a/34/259b28ea7a2a0c904b11cd36c79b8cef8019b26ee5dbe24e73b469dea347/greenlet-3.3.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6997d360a4e6a4e936c0f9625b1c20416b8a0ea18a8e19cabbefc712e7397ab", size = 616774, upload-time = 
"2026-02-20T20:21:02.454Z" }, - { url = "https://files.pythonhosted.org/packages/0a/03/996c2d1689d486a6e199cb0f1cf9e4aa940c500e01bdf201299d7d61fa69/greenlet-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64970c33a50551c7c50491671265d8954046cb6e8e2999aacdd60e439b70418a", size = 1571277, upload-time = "2026-02-20T20:49:34.795Z" }, - { url = "https://files.pythonhosted.org/packages/d9/c4/2570fc07f34a39f2caf0bf9f24b0a1a0a47bc2e8e465b2c2424821389dfc/greenlet-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1a9172f5bf6bd88e6ba5a84e0a68afeac9dc7b6b412b245dd64f52d83c81e55b", size = 1640455, upload-time = "2026-02-20T20:21:10.261Z" }, - { url = "https://files.pythonhosted.org/packages/91/39/5ef5aa23bc545aa0d31e1b9b55822b32c8da93ba657295840b6b34124009/greenlet-3.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:a7945dd0eab63ded0a48e4dcade82939783c172290a7903ebde9e184333ca124", size = 230961, upload-time = "2026-02-20T20:16:58.461Z" }, - { url = "https://files.pythonhosted.org/packages/62/6b/a89f8456dcb06becff288f563618e9f20deed8dd29beea14f9a168aef64b/greenlet-3.3.2-cp313-cp313-win_arm64.whl", hash = "sha256:394ead29063ee3515b4e775216cb756b2e3b4a7e55ae8fd884f17fa579e6b327", size = 230221, upload-time = "2026-02-20T20:17:37.152Z" }, - { url = "https://files.pythonhosted.org/packages/3f/ae/8bffcbd373b57a5992cd077cbe8858fff39110480a9d50697091faea6f39/greenlet-3.3.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8d1658d7291f9859beed69a776c10822a0a799bc4bfe1bd4272bb60e62507dab", size = 279650, upload-time = "2026-02-20T20:18:00.783Z" }, - { url = "https://files.pythonhosted.org/packages/d1/c0/45f93f348fa49abf32ac8439938726c480bd96b2a3c6f4d949ec0124b69f/greenlet-3.3.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18cb1b7337bca281915b3c5d5ae19f4e76d35e1df80f4ad3c1a7be91fadf1082", size = 650295, upload-time = "2026-02-20T20:47:34.036Z" }, - { url = 
"https://files.pythonhosted.org/packages/b3/de/dd7589b3f2b8372069ab3e4763ea5329940fc7ad9dcd3e272a37516d7c9b/greenlet-3.3.2-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2e47408e8ce1c6f1ceea0dffcdf6ebb85cc09e55c7af407c99f1112016e45e9", size = 662163, upload-time = "2026-02-20T20:56:01.295Z" }, - { url = "https://files.pythonhosted.org/packages/d2/d8/09bfa816572a4d83bccd6750df1926f79158b1c36c5f73786e26dbe4ee38/greenlet-3.3.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63d10328839d1973e5ba35e98cccbca71b232b14051fd957b6f8b6e8e80d0506", size = 664160, upload-time = "2026-02-20T20:21:04.015Z" }, - { url = "https://files.pythonhosted.org/packages/48/cf/56832f0c8255d27f6c35d41b5ec91168d74ec721d85f01a12131eec6b93c/greenlet-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e4ab3cfb02993c8cc248ea73d7dae6cec0253e9afa311c9b37e603ca9fad2ce", size = 1619181, upload-time = "2026-02-20T20:49:36.052Z" }, - { url = "https://files.pythonhosted.org/packages/0a/23/b90b60a4aabb4cec0796e55f25ffbfb579a907c3898cd2905c8918acaa16/greenlet-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94ad81f0fd3c0c0681a018a976e5c2bd2ca2d9d94895f23e7bb1af4e8af4e2d5", size = 1687713, upload-time = "2026-02-20T20:21:11.684Z" }, - { url = "https://files.pythonhosted.org/packages/f3/ca/2101ca3d9223a1dc125140dbc063644dca76df6ff356531eb27bc267b446/greenlet-3.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:8c4dd0f3997cf2512f7601563cc90dfb8957c0cff1e3a1b23991d4ea1776c492", size = 232034, upload-time = "2026-02-20T20:20:08.186Z" }, - { url = "https://files.pythonhosted.org/packages/f6/4a/ecf894e962a59dea60f04877eea0fd5724618da89f1867b28ee8b91e811f/greenlet-3.3.2-cp314-cp314-win_arm64.whl", hash = "sha256:cd6f9e2bbd46321ba3bbb4c8a15794d32960e3b0ae2cc4d49a1a53d314805d71", size = 231437, upload-time = "2026-02-20T20:18:59.722Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/6d/8f2ef704e614bcf58ed43cfb8d87afa1c285e98194ab2cfad351bf04f81e/greenlet-3.3.2-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:e26e72bec7ab387ac80caa7496e0f908ff954f31065b0ffc1f8ecb1338b11b54", size = 286617, upload-time = "2026-02-20T20:19:29.856Z" }, - { url = "https://files.pythonhosted.org/packages/5e/0d/93894161d307c6ea237a43988f27eba0947b360b99ac5239ad3fe09f0b47/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b466dff7a4ffda6ca975979bab80bdadde979e29fc947ac3be4451428d8b0e4", size = 655189, upload-time = "2026-02-20T20:47:35.742Z" }, - { url = "https://files.pythonhosted.org/packages/f5/2c/d2d506ebd8abcb57386ec4f7ba20f4030cbe56eae541bc6fd6ef399c0b41/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b8bddc5b73c9720bea487b3bffdb1840fe4e3656fba3bd40aa1489e9f37877ff", size = 658225, upload-time = "2026-02-20T20:56:02.527Z" }, - { url = "https://files.pythonhosted.org/packages/8e/30/3a09155fbf728673a1dea713572d2d31159f824a37c22da82127056c44e4/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26b0f4428b871a751968285a1ac9648944cea09807177ac639b030bddebcea4", size = 657907, upload-time = "2026-02-20T20:21:05.259Z" }, - { url = "https://files.pythonhosted.org/packages/f3/fd/d05a4b7acd0154ed758797f0a43b4c0962a843bedfe980115e842c5b2d08/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fb39a11ee2e4d94be9a76671482be9398560955c9e568550de0224e41104727", size = 1618857, upload-time = "2026-02-20T20:49:37.309Z" }, - { url = "https://files.pythonhosted.org/packages/6f/e1/50ee92a5db521de8f35075b5eff060dd43d39ebd46c2181a2042f7070385/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:20154044d9085151bc309e7689d6f7ba10027f8f5a8c0676ad398b951913d89e", size = 1680010, upload-time = "2026-02-20T20:21:13.427Z" }, - { url = 
"https://files.pythonhosted.org/packages/29/4b/45d90626aef8e65336bed690106d1382f7a43665e2249017e9527df8823b/greenlet-3.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c04c5e06ec3e022cbfe2cd4a846e1d4e50087444f875ff6d2c2ad8445495cf1a", size = 237086, upload-time = "2026-02-20T20:20:45.786Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "h5py" -version = "3.16.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/db/33/acd0ce6863b6c0d7735007df01815403f5589a21ff8c2e1ee2587a38f548/h5py-3.16.0.tar.gz", hash = "sha256:a0dbaad796840ccaa67a4c144a0d0c8080073c34c76d5a6941d6818678ef2738", size = 446526, upload-time = "2026-03-06T13:49:08.07Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/9e/6142ebfda0cb6e9349c091eae73c2e01a770b7659255248d637bec54a88b/h5py-3.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:370a845f432c2c9619db8eed334d1e610c6015796122b0e57aa46312c22617d9", size = 3671808, upload-time = "2026-03-06T13:48:19.737Z" }, - { url = "https://files.pythonhosted.org/packages/b0/65/5e088a45d0f43cd814bc5bec521c051d42005a472e804b1a36c48dada09b/h5py-3.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42108e93326c50c2810025aade9eac9d6827524cdccc7d4b75a546e5ab308edb", size = 3045837, 
upload-time = "2026-03-06T13:48:21.854Z" }, - { url = "https://files.pythonhosted.org/packages/da/1e/6172269e18cc5a484e2913ced33339aad588e02ba407fafd00d369e22ef3/h5py-3.16.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:099f2525c9dcf28de366970a5fb34879aab20491589fa89ce2863a84218bb524", size = 5193860, upload-time = "2026-03-06T13:48:24.071Z" }, - { url = "https://files.pythonhosted.org/packages/bd/98/ef2b6fe2903e377cbe870c3b2800d62552f1e3dbe81ce49e1923c53d1c5c/h5py-3.16.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9300ad32dea9dfc5171f94d5f6948e159ed93e4701280b0f508773b3f582f402", size = 5400417, upload-time = "2026-03-06T13:48:25.728Z" }, - { url = "https://files.pythonhosted.org/packages/bc/81/5b62d760039eed64348c98129d17061fdfc7839fc9c04eaaad6dee1004e4/h5py-3.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:171038f23bccddfc23f344cadabdfc9917ff554db6a0d417180d2747fe4c75a7", size = 5185214, upload-time = "2026-03-06T13:48:27.436Z" }, - { url = "https://files.pythonhosted.org/packages/28/c4/532123bcd9080e250696779c927f2cb906c8bf3447df98f5ceb8dcded539/h5py-3.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7e420b539fb6023a259a1b14d4c9f6df8cf50d7268f48e161169987a57b737ff", size = 5414598, upload-time = "2026-03-06T13:48:29.49Z" }, - { url = "https://files.pythonhosted.org/packages/c3/d9/a27997f84341fc0dfcdd1fe4179b6ba6c32a7aa880fdb8c514d4dad6fba3/h5py-3.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:18f2bbcd545e6991412253b98727374c356d67caa920e68dc79eab36bf5fedad", size = 3175509, upload-time = "2026-03-06T13:48:31.131Z" }, - { url = "https://files.pythonhosted.org/packages/a5/23/bb8647521d4fd770c30a76cfc6cb6a2f5495868904054e92f2394c5a78ff/h5py-3.16.0-cp313-cp313-win_arm64.whl", hash = "sha256:656f00e4d903199a1d58df06b711cf3ca632b874b4207b7dbec86185b5c8c7d4", size = 2647362, upload-time = "2026-03-06T13:48:33.411Z" }, - { url = 
"https://files.pythonhosted.org/packages/48/3c/7fcd9b4c9eed82e91fb15568992561019ae7a829d1f696b2c844355d95dd/h5py-3.16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9c9d307c0ef862d1cd5714f72ecfafe0a5d7529c44845afa8de9f46e5ba8bd65", size = 3678608, upload-time = "2026-03-06T13:48:35.183Z" }, - { url = "https://files.pythonhosted.org/packages/6a/b7/9366ed44ced9b7ef357ab48c94205280276db9d7f064aa3012a97227e966/h5py-3.16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8c1eff849cdd53cbc73c214c30ebdb6f1bb8b64790b4b4fc36acdb5e43570210", size = 3054773, upload-time = "2026-03-06T13:48:37.139Z" }, - { url = "https://files.pythonhosted.org/packages/58/a5/4964bc0e91e86340c2bbda83420225b2f770dcf1eb8a39464871ad769436/h5py-3.16.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:e2c04d129f180019e216ee5f9c40b78a418634091c8782e1f723a6ca3658b965", size = 5198886, upload-time = "2026-03-06T13:48:38.879Z" }, - { url = "https://files.pythonhosted.org/packages/f1/16/d905e7f53e661ce2c24686c38048d8e2b750ffc4350009d41c4e6c6c9826/h5py-3.16.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:e4360f15875a532bc7b98196c7592ed4fc92672a57c0a621355961cafb17a6dd", size = 5404883, upload-time = "2026-03-06T13:48:41.324Z" }, - { url = "https://files.pythonhosted.org/packages/4b/f2/58f34cb74af46d39f4cd18ea20909a8514960c5a3e5b92fd06a28161e0a8/h5py-3.16.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3fae9197390c325e62e0a1aa977f2f62d994aa87aab182abbea85479b791197c", size = 5192039, upload-time = "2026-03-06T13:48:43.117Z" }, - { url = "https://files.pythonhosted.org/packages/ce/ca/934a39c24ce2e2db017268c08da0537c20fa0be7e1549be3e977313fc8f5/h5py-3.16.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:43259303989ac8adacc9986695b31e35dba6fd1e297ff9c6a04b7da5542139cc", size = 5421526, upload-time = "2026-03-06T13:48:44.838Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/14/615a450205e1b56d16c6783f5ccd116cde05550faad70ae077c955654a75/h5py-3.16.0-cp314-cp314-win_amd64.whl", hash = "sha256:fa48993a0b799737ba7fd21e2350fa0a60701e58180fae9f2de834bc39a147ab", size = 3183263, upload-time = "2026-03-06T13:48:47.117Z" }, - { url = "https://files.pythonhosted.org/packages/7b/48/a6faef5ed632cae0c65ac6b214a6614a0b510c3183532c521bdb0055e117/h5py-3.16.0-cp314-cp314-win_arm64.whl", hash = "sha256:1897a771a7f40d05c262fc8f37376ec37873218544b70216872876c627640f63", size = 2663450, upload-time = "2026-03-06T13:48:48.707Z" }, - { url = "https://files.pythonhosted.org/packages/5d/32/0c8bb8aedb62c772cf7c1d427c7d1951477e8c2835f872bc0a13d1f85f86/h5py-3.16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:15922e485844f77c0b9d275396d435db3baa58292a9c2176a386e072e0cf2491", size = 3760693, upload-time = "2026-03-06T13:48:50.453Z" }, - { url = "https://files.pythonhosted.org/packages/1d/1f/fcc5977d32d6387c5c9a694afee716a5e20658ac08b3ff24fdec79fb05f2/h5py-3.16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:df02dd29bd247f98674634dfe41f89fd7c16ba3d7de8695ec958f58404a4e618", size = 3181305, upload-time = "2026-03-06T13:48:52.221Z" }, - { url = "https://files.pythonhosted.org/packages/f5/a1/af87f64b9f986889884243643621ebbd4ac72472ba8ec8cec891ac8e2ca1/h5py-3.16.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0f456f556e4e2cebeebd9d66adf8dc321770a42593494a0b6f0af54a7567b242", size = 5074061, upload-time = "2026-03-06T13:48:54.089Z" }, - { url = "https://files.pythonhosted.org/packages/cc/d0/146f5eaff3dc246a9c7f6e5e4f42bd45cc613bce16693bcd4d1f7c958bf5/h5py-3.16.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:3e6cb3387c756de6a9492d601553dffea3fe11b5f22b443aac708c69f3f55e16", size = 5279216, upload-time = "2026-03-06T13:48:56.75Z" }, - { url = 
"https://files.pythonhosted.org/packages/a1/9d/12a13424f1e604fc7df9497b73c0356fb78c2fb206abd7465ce47226e8fd/h5py-3.16.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8389e13a1fd745ad2856873e8187fd10268b2d9677877bb667b41aebd771d8b7", size = 5070068, upload-time = "2026-03-06T13:48:59.169Z" }, - { url = "https://files.pythonhosted.org/packages/41/8c/bbe98f813722b4873818a8db3e15aa3e625b59278566905ac439725e8070/h5py-3.16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:346df559a0f7dcb31cf8e44805319e2ab24b8957c45e7708ce503b2ec79ba725", size = 5300253, upload-time = "2026-03-06T13:49:02.033Z" }, - { url = "https://files.pythonhosted.org/packages/32/9e/87e6705b4d6890e7cecdf876e2a7d3e40654a2ae37482d79a6f1b87f7b92/h5py-3.16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:4c6ab014ab704b4feaa719ae783b86522ed0bf1f82184704ed3c9e4e3228796e", size = 3381671, upload-time = "2026-03-06T13:49:04.351Z" }, - { url = "https://files.pythonhosted.org/packages/96/91/9fad90cfc5f9b2489c7c26ad897157bce82f0e9534a986a221b99760b23b/h5py-3.16.0-cp314-cp314t-win_arm64.whl", hash = "sha256:faca8fb4e4319c09d83337adc80b2ca7d5c5a343c2d6f1b6388f32cfecca13c1", size = 2740706, upload-time = "2026-03-06T13:49:06.347Z" }, -] - -[[package]] -name = "hf-xet" -version = "1.4.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357, upload-time = "2026-03-13T06:58:51.077Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125, upload-time = "2026-03-13T06:58:33.177Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985, upload-time = "2026-03-13T06:58:31.797Z" }, - { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085, upload-time = "2026-03-13T06:58:24.323Z" }, - { url = "https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266, upload-time = "2026-03-13T06:58:22.887Z" }, - { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513, upload-time = "2026-03-13T06:58:40.858Z" }, - { url = "https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287, upload-time = "2026-03-13T06:58:42.601Z" }, - { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574, upload-time = "2026-03-13T06:58:53.881Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760, upload-time = "2026-03-13T06:58:52.187Z" }, - { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493, upload-time = "2026-03-13T06:58:39.267Z" }, - { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797, upload-time = "2026-03-13T06:58:37.546Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127, upload-time = "2026-03-13T06:58:30.539Z" }, - { url = "https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788, upload-time = "2026-03-13T06:58:29.139Z" }, - { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315, upload-time = "2026-03-13T06:58:48.017Z" }, - { url = 
"https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306, upload-time = "2026-03-13T06:58:49.502Z" }, - { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826, upload-time = "2026-03-13T06:58:59.88Z" }, - { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113, upload-time = "2026-03-13T06:58:58.491Z" }, - { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339, upload-time = "2026-03-13T06:58:36.245Z" }, - { url = "https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664, upload-time = "2026-03-13T06:58:34.787Z" }, - { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422, upload-time = "2026-03-13T06:58:27.472Z" }, - { url = 
"https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847, upload-time = "2026-03-13T06:58:25.989Z" }, - { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843, upload-time = "2026-03-13T06:58:44.59Z" }, - { url = "https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751, upload-time = "2026-03-13T06:58:46.533Z" }, - { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149, upload-time = "2026-03-13T06:58:57.07Z" }, - { url = "https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426, upload-time = "2026-03-13T06:58:55.46Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - -[[package]] -name = "huggingface-hub" -version = "1.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, - { name = "httpx" }, - { name = "packaging" }, - { name = "pyyaml" }, - { name = "tqdm" }, - { name = "typer" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/15/eafc1c57bf0f8afffb243dcd4c0cceb785e956acc17bba4d9bf2ae21fc9c/huggingface_hub-1.7.2.tar.gz", hash = "sha256:7f7e294e9bbb822e025bdb2ada025fa4344d978175a7f78e824d86e35f7ab43b", size = 724684, upload-time = "2026-03-20T10:36:08.767Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/08/de/3ad061a05f74728927ded48c90b73521b9a9328c85d841bdefb30e01fb85/huggingface_hub-1.7.2-py3-none-any.whl", hash = "sha256:288f33a0a17b2a73a1359e2a5fd28d1becb2c121748c6173ab8643fb342c850e", size = 618036, upload-time = "2026-03-20T10:36:06.824Z" }, -] - -[[package]] -name = "idna" -version = "3.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, -] - -[[package]] -name = "imagesize" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/e6/7bf14eeb8f8b7251141944835abd42eb20a658d89084b7e1f3e5fe394090/imagesize-2.0.0.tar.gz", hash = "sha256:8e8358c4a05c304f1fccf7ff96f036e7243a189e9e42e90851993c558cfe9ee3", size = 1773045, upload-time = "2026-03-03T14:18:29.941Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/53/fb7122b71361a0d121b669dcf3d31244ef75badbbb724af388948de543e2/imagesize-2.0.0-py2.py3-none-any.whl", hash = "sha256:5667c5bbb57ab3f1fa4bc366f4fbc971db3d5ed011fd2715fd8001f782718d96", size = 9441, upload-time = "2026-03-03T14:18:27.892Z" }, -] - -[[package]] -name = "importlib-metadata" -version = "9.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/01/15bb152d77b21318514a96f43af312635eb2500c96b55398d020c93d86ea/importlib_metadata-9.0.0.tar.gz", 
hash = "sha256:a4f57ab599e6a2e3016d7595cfd72eb4661a5106e787a95bcc90c7105b831efc", size = 56405, upload-time = "2026-03-20T06:42:56.999Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/3d/2d244233ac4f76e38533cfcb2991c9eb4c7bf688ae0a036d30725b8faafe/importlib_metadata-9.0.0-py3-none-any.whl", hash = "sha256:2d21d1cc5a017bd0559e36150c21c830ab1dc304dedd1b7ea85d20f45ef3edd7", size = 27789, upload-time = "2026-03-20T06:42:55.665Z" }, -] - -[[package]] -name = "iniconfig" -version = "2.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, -] - -[[package]] -name = "ipykernel" -version = "7.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "appnope", marker = "sys_platform == 'darwin'" }, - { name = "comm" }, - { name = "debugpy" }, - { name = "ipython" }, - { name = "jupyter-client" }, - { name = "jupyter-core" }, - { name = "matplotlib-inline" }, - { name = "nest-asyncio" }, - { name = "packaging" }, - { name = "psutil" }, - { name = "pyzmq" }, - { name = "tornado" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ca/8d/b68b728e2d06b9e0051019640a40a9eb7a88fcd82c2e1b5ce70bef5ff044/ipykernel-7.2.0.tar.gz", hash = "sha256:18ed160b6dee2cbb16e5f3575858bc19d8f1fe6046a9a680c708494ce31d909e", size = 176046, upload-time = "2026-02-06T16:43:27.403Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/82/b9/e73d5d9f405cba7706c539aa8b311b49d4c2f3d698d9c12f815231169c71/ipykernel-7.2.0-py3-none-any.whl", hash = "sha256:3bbd4420d2b3cc105cbdf3756bfc04500b1e52f090a90716851f3916c62e1661", size = 118788, upload-time = "2026-02-06T16:43:25.149Z" }, -] - -[[package]] -name = "ipython" -version = "8.38.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "decorator" }, - { name = "jedi" }, - { name = "matplotlib-inline" }, - { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "prompt-toolkit" }, - { name = "pygments" }, - { name = "stack-data" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e5/61/1810830e8b93c72dcd3c0f150c80a00c3deb229562d9423807ec92c3a539/ipython-8.38.0.tar.gz", hash = "sha256:9cfea8c903ce0867cc2f23199ed8545eb741f3a69420bfcf3743ad1cec856d39", size = 5513996, upload-time = "2026-01-05T10:59:06.901Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/df/db59624f4c71b39717c423409950ac3f2c8b2ce4b0aac843112c7fb3f721/ipython-8.38.0-py3-none-any.whl", hash = "sha256:750162629d800ac65bb3b543a14e7a74b0e88063eac9b92124d4b2aa3f6d8e86", size = 831813, upload-time = "2026-01-05T10:59:04.239Z" }, -] - -[[package]] -name = "jedi" -version = "0.19.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "parso" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = 
"sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, -] - -[[package]] -name = "jellyfish" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0b/14/fc5bdb637996df181e5c4fa3b15dcc27d33215e6c41753564ae453bdb40f/jellyfish-1.2.1.tar.gz", hash = "sha256:72d2fda61b23babe862018729be73c8b0dc12e3e6601f36f6e65d905e249f4db", size = 364417, upload-time = "2025-10-11T19:36:37.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/e6/75feeda1c3634525296aa56265db151f896005b139e177f8b1a285546a1f/jellyfish-1.2.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:4b3e3223aaad74e18aacc74775e01815e68af810258ceea6fa6a81b19f384312", size = 322958, upload-time = "2025-10-11T19:35:29.906Z" }, - { url = "https://files.pythonhosted.org/packages/0e/66/4b92bb55b545ebefbf085e45cbcda576d2a2a3dc48fd61dae469c27e73a6/jellyfish-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e967e67058b78189d2b20a9586c7720a05ec4a580d6a98c796cd5cd2b7b11303", size = 317859, upload-time = "2025-10-11T19:35:31.312Z" }, - { url = "https://files.pythonhosted.org/packages/fe/8e/9d0055f921c884605bf22a96e376b016993928126e8a4c7fd8698260fb4e/jellyfish-1.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32581c50b34a09889b2d96796170e53da313a1e7fde32be63c82e50e7e791e3c", size = 353222, upload-time = "2025-10-11T19:35:32.352Z" }, - { url = "https://files.pythonhosted.org/packages/4f/d2/deca58a62e57f7e2b2172ab39f522831279ee08ec0943fc0d0e33cd6e6f9/jellyfish-1.2.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07b022412ebece96759006cb015d46b8218d7f896d8b327c6bbee784ddf38ed9", size = 362392, upload-time = "2025-10-11T19:35:33.305Z" }, - { url = 
"https://files.pythonhosted.org/packages/12/40/9a7f62d367f5a862950ce3598188fe0e22e11d1f5d6eaad6eda5adc354b0/jellyfish-1.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80a49eb817eaa6591f43a31e5c93d79904de62537f029907ef88c050d781a638", size = 360358, upload-time = "2025-10-11T19:35:34.585Z" }, - { url = "https://files.pythonhosted.org/packages/a5/e5/6b44a1058df3dfa3dd1174c9f86685c78f780d0b68851a057075aea14587/jellyfish-1.2.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:e1b990fb15985571616f7f40a12d6fa062897b19fb5359b6dec3cd811d802c24", size = 533945, upload-time = "2025-10-11T19:35:35.764Z" }, - { url = "https://files.pythonhosted.org/packages/50/4c/2397f43ad2692a1052299607838b41a4c2dd5707fde4ce459d686e763eb1/jellyfish-1.2.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:dd895cf63fac0a9f11b524fff810d9a6081dcf3c518b34172ac8684eb504dd43", size = 553707, upload-time = "2025-10-11T19:35:36.926Z" }, - { url = "https://files.pythonhosted.org/packages/de/aa/dc7cf053c8c40035791de1dc2f45b1f57772a14b0dc53318720e87073831/jellyfish-1.2.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:6d2bac5982d7a08759ea487bfa00149e6aa8a3be7cd43c4ed1be1e3505425c69", size = 523323, upload-time = "2025-10-11T19:35:37.981Z" }, - { url = "https://files.pythonhosted.org/packages/2b/1a/610c7f1f7777646322f489b5ed1e4631370c9fa4fb40a8246af71b496b6d/jellyfish-1.2.1-cp313-cp313-win32.whl", hash = "sha256:509355ebedec69a8bf0cc113a6bf9c01820d12fe2eea44f47dfa809faf2d5463", size = 209143, upload-time = "2025-10-11T19:35:39.276Z" }, - { url = "https://files.pythonhosted.org/packages/80/9a/6102b23b03a6df779fee76c979c0eb819b300c83b468900df78bb574b944/jellyfish-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:9c747ae5c0fb4bd519f6abbfe4bd704b2f1c63fd4dd3dbb8d8864478974e1571", size = 213466, upload-time = "2025-10-11T19:35:40.24Z" }, - { url = 
"https://files.pythonhosted.org/packages/89/c3/92190ff494881008ff127d67aba80245a5071ec7c3ff1181ceddc6c9d636/jellyfish-1.2.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:212aaf177236192a735bbbf5938717aa8518d14a25b08b015e47e783e70be060", size = 322379, upload-time = "2025-10-11T19:35:41.21Z" }, - { url = "https://files.pythonhosted.org/packages/d4/db/993c81f3e95e06e2a5cb71aaf9af063d8798a34c9715c8059707ddc12b86/jellyfish-1.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b8986d9768daddd5e87abf513ae168ea0afe690a444d4c82d5b1b14b0d045820", size = 317270, upload-time = "2025-10-11T19:35:43.367Z" }, - { url = "https://files.pythonhosted.org/packages/fc/6a/0f521b098e136c43c7ae1e77db4a792f9e65167fe818820502996488b926/jellyfish-1.2.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fa0ba0946f3c274f6a87aaa3c631dc70a363bd46cceea828ce777e8db653b6f", size = 352931, upload-time = "2025-10-11T19:35:44.402Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c4/5d2242a650f890384b435610ef2962b1ac6091c070912a81a97020d2502a/jellyfish-1.2.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6e76b23431a667cd485fb562428d1ad29bae9fdd0fcdfb5a51cc8087bae0e88c", size = 362473, upload-time = "2025-10-11T19:35:45.427Z" }, - { url = "https://files.pythonhosted.org/packages/d5/fe/831fc45a4d3e497bccc4735809551320968360d14b89eb3d7cb892549316/jellyfish-1.2.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a058f4c6a591d5e5a47569f5648a26303ba19c76a960fef7e0beba2aa959e52e", size = 359772, upload-time = "2025-10-11T19:35:46.65Z" }, - { url = "https://files.pythonhosted.org/packages/b4/0f/d132265e299947e4462c1485f829a08a513c97c41bdfe758754e4a5c1dfe/jellyfish-1.2.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:6a49ce2a580edd3b16b69421137deef464e2f8907f9ef906d49950b1a52908c1", size = 533628, upload-time = "2025-10-11T19:35:47.691Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/2a/d51dbf0aceb9b141dd8318ce6a41ab08a5deaae56be16a8bf3d8685ac817/jellyfish-1.2.1-cp314-cp314-musllinux_1_1_i686.whl", hash = "sha256:c85aa2bc76a36d92a3197f406f86636664d5b323727dfec4fa2842a8a24a06ae", size = 553614, upload-time = "2025-10-11T19:35:52.928Z" }, - { url = "https://files.pythonhosted.org/packages/f9/e1/fcc7c5919d871537942425f707b764af65b76c7b88377aa71083c5280e37/jellyfish-1.2.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:29cfa8bfb72aacf2d611a3313b358ed4d4140fa3d3efcffea750c8e7f8acb1aa", size = 523057, upload-time = "2025-10-11T19:35:54.423Z" }, - { url = "https://files.pythonhosted.org/packages/95/65/ee5289540b2015643493cc29b50350dbe63ca1977a902de5295a4df8c25a/jellyfish-1.2.1-cp314-cp314-win32.whl", hash = "sha256:f121218dc33fb318c34ddd889dc7362606ce1316af2bb63b73cc1df81523ca34", size = 209340, upload-time = "2025-10-11T19:35:55.69Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e2/fa5de38380b0f5bd531b27a78acb0dc6118dab0b21f56d36008b829aa7de/jellyfish-1.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:9a73b5c6425a70ebd440579a677eb4f03b327b2f59090db34e6c937aeea5aabd", size = 213399, upload-time = "2025-10-11T19:35:56.776Z" }, -] - -[[package]] -name = "jinja2" -version = "3.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, -] - -[[package]] -name = "jmespath" 
-version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, -] - -[[package]] -name = "joblib" -version = "1.5.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, -] - -[[package]] -name = "jsonpickle" -version = "4.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/a6/d07afcfdef402900229bcca795f80506b207af13a838d4d99ad45abf530c/jsonpickle-4.1.1.tar.gz", hash = "sha256:f86e18f13e2b96c1c1eede0b7b90095bbb61d99fedc14813c44dc2f361dbbae1", size = 316885, upload-time = "2025-06-02T20:36:11.57Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/73/04df8a6fa66d43a9fd45c30f283cc4afff17da671886e451d52af60bdc7e/jsonpickle-4.1.1-py3-none-any.whl", hash = 
"sha256:bb141da6057898aa2438ff268362b126826c812a1721e31cf08a6e142910dc91", size = 47125, upload-time = "2025-06-02T20:36:08.647Z" }, -] - -[[package]] -name = "jsonschema" -version = "4.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "jsonschema-specifications" }, - { name = "referencing" }, - { name = "rpds-py" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, -] - -[[package]] -name = "jsonschema-specifications" -version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "referencing" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, -] - -[[package]] -name = "jupyter-book" -version = "0.15.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "docutils" }, - { name = "jinja2" }, - { name = 
"jsonschema" }, - { name = "linkify-it-py" }, - { name = "myst-nb" }, - { name = "pyyaml" }, - { name = "sphinx" }, - { name = "sphinx-book-theme" }, - { name = "sphinx-comments" }, - { name = "sphinx-copybutton" }, - { name = "sphinx-design" }, - { name = "sphinx-external-toc" }, - { name = "sphinx-jupyterbook-latex" }, - { name = "sphinx-multitoc-numbering" }, - { name = "sphinx-thebe" }, - { name = "sphinx-togglebutton" }, - { name = "sphinxcontrib-bibtex" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/1a/e38481ea2cd9988234eeee4dd21bec64f235c78004797e48643592018490/jupyter-book-0.15.1.tar.gz", hash = "sha256:8a1634ec16f7eedee0d116f1e5fb7c48203289ad92da42e09519dc71d956c010", size = 63027, upload-time = "2023-03-14T01:32:07.066Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/ef/c56470847067fd6d4c059265543188c63b56d01aa2107b18895404f82a9c/jupyter_book-0.15.1-py3-none-any.whl", hash = "sha256:7671264952abd1ca3f5e713b03e138dda710c92a985c49154f398817fe089968", size = 43698, upload-time = "2023-03-14T01:32:05.453Z" }, -] - -[[package]] -name = "jupyter-cache" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "click" }, - { name = "importlib-metadata" }, - { name = "nbclient" }, - { name = "nbformat" }, - { name = "pyyaml" }, - { name = "sqlalchemy" }, - { name = "tabulate" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/64/08dcc1f6fc54a263525edd23b5d2754793470c1c41a8dd82d52406f8d876/jupyter-cache-0.6.1.tar.gz", hash = "sha256:26f83901143edf4af2f3ff5a91e2d2ad298e46e2cee03c8071d37a23a63ccbfc", size = 31953, upload-time = "2023-04-22T15:38:06.006Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/8e/918b115bb3b4b821e2d43315e1a08b909219723191623ffbae9072fd226a/jupyter_cache-0.6.1-py3-none-any.whl", hash = "sha256:2fce7d4975805c77f75bdfc1bc2e82bc538b8e5b1af27f2f5e06d55b9f996a82", size = 33886, upload-time = 
"2023-04-22T15:38:04.33Z" }, -] - -[[package]] -name = "jupyter-client" -version = "8.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jupyter-core" }, - { name = "python-dateutil" }, - { name = "pyzmq" }, - { name = "tornado" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/05/e4/ba649102a3bc3fbca54e7239fb924fd434c766f855693d86de0b1f2bec81/jupyter_client-8.8.0.tar.gz", hash = "sha256:d556811419a4f2d96c869af34e854e3f059b7cc2d6d01a9cd9c85c267691be3e", size = 348020, upload-time = "2026-01-08T13:55:47.938Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2d/0b/ceb7694d864abc0a047649aec263878acb9f792e1fec3e676f22dc9015e3/jupyter_client-8.8.0-py3-none-any.whl", hash = "sha256:f93a5b99c5e23a507b773d3a1136bd6e16c67883ccdbd9a829b0bbdb98cd7d7a", size = 107371, upload-time = "2026-01-08T13:55:45.562Z" }, -] - -[[package]] -name = "jupyter-core" -version = "5.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "platformdirs" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/02/49/9d1284d0dc65e2c757b74c6687b6d319b02f822ad039e5c512df9194d9dd/jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508", size = 89814, upload-time = "2025-10-16T19:19:18.444Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" }, -] - -[[package]] -name = "l0-python" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "scipy" }, - { name = "torch" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/2a/fe/3929e39c6e30b7b22730a2021cc108f00d0da611b48854eb67b0d49be94e/l0_python-0.6.1.tar.gz", hash = "sha256:8fbea10059813ef408255c93dcd5a61dfdd893612efb7e62c934a93f5701d45a", size = 37782, upload-time = "2026-02-25T16:59:39.84Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1b/ea/28fb7d49b4113953a5938c8bd39904d4aa709b619710aa27311ccf11b669/l0_python-0.6.1-py3-none-any.whl", hash = "sha256:5a8282760bf4b48b1e7ad2e435a6878f15dcc614e97f5ec1aa5690c66510733e", size = 23912, upload-time = "2026-02-25T16:59:37.953Z" }, -] - -[[package]] -name = "latexcodec" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/27/dd/4270b2c5e2ee49316c3859e62293bd2ea8e382339d63ab7bbe9f39c0ec3b/latexcodec-3.0.1.tar.gz", hash = "sha256:e78a6911cd72f9dec35031c6ec23584de6842bfbc4610a9678868d14cdfb0357", size = 31222, upload-time = "2025-06-17T18:47:34.051Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/40/23569737873cc9637fd488606347e9dd92b9fa37ba4fcda1f98ee5219a97/latexcodec-3.0.1-py3-none-any.whl", hash = "sha256:a9eb8200bff693f0437a69581f7579eb6bca25c4193515c09900ce76451e452e", size = 18532, upload-time = "2025-06-17T18:47:30.726Z" }, -] - -[[package]] -name = "linkify-it-py" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "uc-micro-py" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2a/ae/bb56c6828e4797ba5a4821eec7c43b8bf40f69cda4d4f5f8c8a2810ec96a/linkify-it-py-2.0.3.tar.gz", hash = "sha256:68cda27e162e9215c17d786649d1da0021a451bdc436ef9e0fa0ba5234b9b048", size = 27946, upload-time = "2024-02-04T14:48:04.179Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/1e/b832de447dee8b582cac175871d2f6c3d5077cc56d5575cadba1fd1cccfa/linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79", size = 19820, 
upload-time = "2024-02-04T14:48:02.496Z" }, -] - -[[package]] -name = "mako" -version = "1.3.10" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, -] - -[[package]] -name = "markdown-it-py" -version = "2.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e4/c0/59bd6d0571986f72899288a95d9d6178d0eebd70b6650f1bb3f0da90f8f7/markdown-it-py-2.2.0.tar.gz", hash = "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1", size = 67120, upload-time = "2023-02-22T05:54:30.899Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bf/25/2d88e8feee8e055d015343f9b86e370a1ccbec546f2865c98397aaef24af/markdown_it_py-2.2.0-py3-none-any.whl", hash = "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30", size = 84466, upload-time = "2023-02-22T05:54:29.508Z" }, -] - -[[package]] -name = "markupsafe" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" }, - { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" }, - { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" }, - { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" }, - { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" }, - { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" }, - { url = 
"https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" }, - { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" }, - { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" }, - { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" }, - { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" }, - { url = 
"https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" }, - { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" }, - { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" }, - { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" }, - { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" }, - { 
url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" }, - { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" }, - { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" }, - { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" }, - { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, - { url = 
"https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, - { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, - { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, - { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, - { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, - { url = 
"https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, - { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, - { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, - { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, - { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, - { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, - { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, - { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, - { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, - { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, - { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, -] - -[[package]] -name = "matplotlib-inline" -version = "0.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, -] - -[[package]] -name = "mdit-py-plugins" -version = "0.3.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/49/e7/cc2720da8a32724b36d04c6dba5644154cdf883a1482b3bbb81959a642ed/mdit-py-plugins-0.3.5.tar.gz", hash = "sha256:eee0adc7195e5827e17e02d2a258a2ba159944a0748f59c5099a4a27f78fcf6a", size = 39871, upload-time = "2023-03-02T17:42:50.654Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/fe/4c/a9b222f045f98775034d243198212cbea36d3524c3ee1e8ab8c0346d6953/mdit_py_plugins-0.3.5-py3-none-any.whl", hash = "sha256:ca9a0714ea59a24b2b044a1831f48d817dd0c817e84339f20e7889f392d77c4e", size = 52087, upload-time = "2023-03-02T17:42:48.841Z" }, -] - -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - -[[package]] -name = "microcalibrate" -version = "0.22.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "l0-python" }, - { name = "numpy" }, - { name = "optuna" }, - { name = "pandas" }, - { name = "torch" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b7/11/dc170c33ab42a1c6437c9094696c149ec780161a2cdb2630b6a70c8234dc/microcalibrate-0.22.0.tar.gz", hash = "sha256:360eb241156f3731902a9aa73aea1d39437d97a6a40db1ddd0ab85ef636596ea", size = 216545, upload-time = "2026-04-18T15:21:59.591Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/7f/36882ae748084bb7e570417cb81f2791a2d3f29fddeeaa7616c2a100c8ad/microcalibrate-0.22.0-py3-none-any.whl", hash = "sha256:c713220bfe24661fd3fba9d94ccf4352c1b961f7f7a1871d437ac15527dcf431", size = 31563, upload-time = "2026-04-18T15:21:58.69Z" }, -] - -[[package]] -name = "microdf-python" -version = "1.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = 
"numpy" }, - { name = "pandas" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dd/70/29702ec0d482efb08049a7bec4ebfc8dc4754bf088fe7491a0260aa050ad/microdf_python-1.2.3.tar.gz", hash = "sha256:86b72532ade5fa78d12c6e05dee029206ba7f19f17a9744db6a92d3c9567e756", size = 20089, upload-time = "2026-03-06T12:50:48.02Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/88/6f23347d9f0ccab5b24faf88aaf8824cbfad4c5d876ac9692d664235c930/microdf_python-1.2.3-py3-none-any.whl", hash = "sha256:8e26815fce6f9e43eed9eacf957aa6b4e865609b0c852390989124bdf7873b2e", size = 21358, upload-time = "2026-03-06T12:50:46.954Z" }, -] - -[[package]] -name = "microimpute" -version = "3.1.1" -source = { git = "https://github.com/PolicyEngine/microimpute.git?rev=90be828eb442c48ee86bb91bb83a75da4b0f0f89#90be828eb442c48ee86bb91bb83a75da4b0f0f89" } -dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "optuna" }, - { name = "pandas" }, - { name = "plotly" }, - { name = "psutil" }, - { name = "pydantic" }, - { name = "quantile-forest" }, - { name = "requests" }, - { name = "scikit-learn" }, - { name = "scipy" }, - { name = "statsmodels" }, - { name = "tqdm" }, -] - -[[package]] -name = "microplex" -version = "0.2.0" -source = { git = "https://github.com/PolicyEngine/microplex.git?rev=490c717b36a5ef1721b01b7dceaddbc0372c6a0a#490c717b36a5ef1721b01b7dceaddbc0372c6a0a" } -dependencies = [ - { name = "httpx" }, - { name = "huggingface-hub" }, - { name = "microimpute", marker = "python_full_version < '3.15'" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "polars" }, - { name = "prdc" }, - { name = "pyarrow" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "quantile-forest" }, - { name = "scikit-learn" }, - { name = "scipy" }, - { name = "torch" }, -] - -[package.optional-dependencies] -calibrate = [ - { name = "microcalibrate" }, -] - -[[package]] -name = "microplex-us" -version = "0.2.0" -source = { editable = "." 
} -dependencies = [ - { name = "duckdb" }, - { name = "h5py" }, - { name = "microplex", extra = ["calibrate"] }, - { name = "requests" }, -] - -[package.optional-dependencies] -dev = [ - { name = "pytest" }, - { name = "ruff" }, -] -docs = [ - { name = "jupyter-book" }, - { name = "standard-imghdr" }, -] -hf = [ - { name = "huggingface-hub" }, -] -policyengine = [ - { name = "microimpute", marker = "python_full_version < '3.15'" }, - { name = "microunit" }, - { name = "policyengine-us", marker = "python_full_version < '3.15'" }, - { name = "spm-calculator" }, -] -r2 = [ - { name = "boto3" }, -] - -[package.metadata] -requires-dist = [ - { name = "boto3", marker = "extra == 'r2'", specifier = ">=1.34" }, - { name = "duckdb", specifier = ">=1.2" }, - { name = "h5py", specifier = ">=3.10" }, - { name = "huggingface-hub", marker = "extra == 'hf'", specifier = ">=0.24" }, - { name = "jupyter-book", marker = "extra == 'docs'", specifier = ">=0.15,<0.16" }, - { name = "microimpute", marker = "python_full_version >= '3.12' and python_full_version < '3.15' and extra == 'policyengine'", git = "https://github.com/PolicyEngine/microimpute.git?rev=90be828eb442c48ee86bb91bb83a75da4b0f0f89" }, - { name = "microplex", extras = ["calibrate"], git = "https://github.com/PolicyEngine/microplex.git?rev=490c717b36a5ef1721b01b7dceaddbc0372c6a0a" }, - { name = "microunit", marker = "extra == 'policyengine'", specifier = ">=0.1.0" }, - { name = "policyengine-us", marker = "python_full_version >= '3.11' and python_full_version < '3.15' and extra == 'policyengine'", specifier = "==1.715.2" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" }, - { name = "requests", specifier = ">=2.31" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1" }, - { name = "spm-calculator", marker = "extra == 'policyengine'", specifier = ">=0.3.1" }, - { name = "standard-imghdr", marker = "python_full_version >= '3.13' and extra == 'docs'", specifier = ">=3.13" }, -] 
-provides-extras = ["dev", "docs", "r2", "hf", "policyengine"] - -[[package]] -name = "microunit" -version = "0.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "pandas" }, - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/c1/6a8a1a1f7e90e41295e813808f170c71f0d20d36c6203722fd682d0a3387/microunit-0.1.0.tar.gz", hash = "sha256:a1e90f525e0a1a3921a3ed62ce291620bd45242f829cbd7892253dfff307eeb3", size = 21638, upload-time = "2026-05-30T18:51:35.59Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/cf/a38de31d10b1029923daa7f9271a78c965deb94519627f6dd4d9c3fbf359/microunit-0.1.0-py3-none-any.whl", hash = "sha256:1652fd43b57fb6fc803089d0da0fc4d28948d9e7d5e742e3327afd376e0a3060", size = 23581, upload-time = "2026-05-30T18:51:34.376Z" }, -] - -[[package]] -name = "mpmath" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, -] - -[[package]] -name = "msgpack" -version = "1.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/6b/31/b46518ecc604d7edf3a4f94cb3bf021fc62aa301f0cb849936968164ef23/msgpack-1.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4efd7b5979ccb539c221a4c4e16aac1a533efc97f3b759bb5a5ac9f6d10383bf", size = 81212, upload-time = "2025-10-08T09:15:14.552Z" }, - { url = "https://files.pythonhosted.org/packages/92/dc/c385f38f2c2433333345a82926c6bfa5ecfff3ef787201614317b58dd8be/msgpack-1.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42eefe2c3e2af97ed470eec850facbe1b5ad1d6eacdbadc42ec98e7dcf68b4b7", size = 84315, upload-time = "2025-10-08T09:15:15.543Z" }, - { url = "https://files.pythonhosted.org/packages/d3/68/93180dce57f684a61a88a45ed13047558ded2be46f03acb8dec6d7c513af/msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999", size = 412721, upload-time = "2025-10-08T09:15:16.567Z" }, - { url = "https://files.pythonhosted.org/packages/5d/ba/459f18c16f2b3fc1a1ca871f72f07d70c07bf768ad0a507a698b8052ac58/msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e", size = 424657, upload-time = "2025-10-08T09:15:17.825Z" }, - { url = "https://files.pythonhosted.org/packages/38/f8/4398c46863b093252fe67368b44edc6c13b17f4e6b0e4929dbf0bdb13f23/msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162", size = 402668, upload-time = "2025-10-08T09:15:19.003Z" }, - { url = "https://files.pythonhosted.org/packages/28/ce/698c1eff75626e4124b4d78e21cca0b4cc90043afb80a507626ea354ab52/msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794", size = 419040, upload-time = "2025-10-08T09:15:20.183Z" }, - { url = 
"https://files.pythonhosted.org/packages/67/32/f3cd1667028424fa7001d82e10ee35386eea1408b93d399b09fb0aa7875f/msgpack-1.1.2-cp313-cp313-win32.whl", hash = "sha256:a7787d353595c7c7e145e2331abf8b7ff1e6673a6b974ded96e6d4ec09f00c8c", size = 65037, upload-time = "2025-10-08T09:15:21.416Z" }, - { url = "https://files.pythonhosted.org/packages/74/07/1ed8277f8653c40ebc65985180b007879f6a836c525b3885dcc6448ae6cb/msgpack-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:a465f0dceb8e13a487e54c07d04ae3ba131c7c5b95e2612596eafde1dccf64a9", size = 72631, upload-time = "2025-10-08T09:15:22.431Z" }, - { url = "https://files.pythonhosted.org/packages/e5/db/0314e4e2db56ebcf450f277904ffd84a7988b9e5da8d0d61ab2d057df2b6/msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84", size = 64118, upload-time = "2025-10-08T09:15:23.402Z" }, - { url = "https://files.pythonhosted.org/packages/22/71/201105712d0a2ff07b7873ed3c220292fb2ea5120603c00c4b634bcdafb3/msgpack-1.1.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e23ce8d5f7aa6ea6d2a2b326b4ba46c985dbb204523759984430db7114f8aa00", size = 81127, upload-time = "2025-10-08T09:15:24.408Z" }, - { url = "https://files.pythonhosted.org/packages/1b/9f/38ff9e57a2eade7bf9dfee5eae17f39fc0e998658050279cbb14d97d36d9/msgpack-1.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6c15b7d74c939ebe620dd8e559384be806204d73b4f9356320632d783d1f7939", size = 84981, upload-time = "2025-10-08T09:15:25.812Z" }, - { url = "https://files.pythonhosted.org/packages/8e/a9/3536e385167b88c2cc8f4424c49e28d49a6fc35206d4a8060f136e71f94c/msgpack-1.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e2cb7b9031568a2a5c73aa077180f93dd2e95b4f8d3b8e14a73ae94a9e667e", size = 411885, upload-time = "2025-10-08T09:15:27.22Z" }, - { url = 
"https://files.pythonhosted.org/packages/2f/40/dc34d1a8d5f1e51fc64640b62b191684da52ca469da9cd74e84936ffa4a6/msgpack-1.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:180759d89a057eab503cf62eeec0aa61c4ea1200dee709f3a8e9397dbb3b6931", size = 419658, upload-time = "2025-10-08T09:15:28.4Z" }, - { url = "https://files.pythonhosted.org/packages/3b/ef/2b92e286366500a09a67e03496ee8b8ba00562797a52f3c117aa2b29514b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:04fb995247a6e83830b62f0b07bf36540c213f6eac8e851166d8d86d83cbd014", size = 403290, upload-time = "2025-10-08T09:15:29.764Z" }, - { url = "https://files.pythonhosted.org/packages/78/90/e0ea7990abea5764e4655b8177aa7c63cdfa89945b6e7641055800f6c16b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8e22ab046fa7ede9e36eeb4cfad44d46450f37bb05d5ec482b02868f451c95e2", size = 415234, upload-time = "2025-10-08T09:15:31.022Z" }, - { url = "https://files.pythonhosted.org/packages/72/4e/9390aed5db983a2310818cd7d3ec0aecad45e1f7007e0cda79c79507bb0d/msgpack-1.1.2-cp314-cp314-win32.whl", hash = "sha256:80a0ff7d4abf5fecb995fcf235d4064b9a9a8a40a3ab80999e6ac1e30b702717", size = 66391, upload-time = "2025-10-08T09:15:32.265Z" }, - { url = "https://files.pythonhosted.org/packages/6e/f1/abd09c2ae91228c5f3998dbd7f41353def9eac64253de3c8105efa2082f7/msgpack-1.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:9ade919fac6a3e7260b7f64cea89df6bec59104987cbea34d34a2fa15d74310b", size = 73787, upload-time = "2025-10-08T09:15:33.219Z" }, - { url = "https://files.pythonhosted.org/packages/6a/b0/9d9f667ab48b16ad4115c1935d94023b82b3198064cb84a123e97f7466c1/msgpack-1.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:59415c6076b1e30e563eb732e23b994a61c159cec44deaf584e5cc1dd662f2af", size = 66453, upload-time = "2025-10-08T09:15:34.225Z" }, - { url = 
"https://files.pythonhosted.org/packages/16/67/93f80545eb1792b61a217fa7f06d5e5cb9e0055bed867f43e2b8e012e137/msgpack-1.1.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:897c478140877e5307760b0ea66e0932738879e7aa68144d9b78ea4c8302a84a", size = 85264, upload-time = "2025-10-08T09:15:35.61Z" }, - { url = "https://files.pythonhosted.org/packages/87/1c/33c8a24959cf193966ef11a6f6a2995a65eb066bd681fd085afd519a57ce/msgpack-1.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a668204fa43e6d02f89dbe79a30b0d67238d9ec4c5bd8a940fc3a004a47b721b", size = 89076, upload-time = "2025-10-08T09:15:36.619Z" }, - { url = "https://files.pythonhosted.org/packages/fc/6b/62e85ff7193663fbea5c0254ef32f0c77134b4059f8da89b958beb7696f3/msgpack-1.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5559d03930d3aa0f3aacb4c42c776af1a2ace2611871c84a75afe436695e6245", size = 435242, upload-time = "2025-10-08T09:15:37.647Z" }, - { url = "https://files.pythonhosted.org/packages/c1/47/5c74ecb4cc277cf09f64e913947871682ffa82b3b93c8dad68083112f412/msgpack-1.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70c5a7a9fea7f036b716191c29047374c10721c389c21e9ffafad04df8c52c90", size = 432509, upload-time = "2025-10-08T09:15:38.794Z" }, - { url = "https://files.pythonhosted.org/packages/24/a4/e98ccdb56dc4e98c929a3f150de1799831c0a800583cde9fa022fa90602d/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f2cb069d8b981abc72b41aea1c580ce92d57c673ec61af4c500153a626cb9e20", size = 415957, upload-time = "2025-10-08T09:15:40.238Z" }, - { url = "https://files.pythonhosted.org/packages/da/28/6951f7fb67bc0a4e184a6b38ab71a92d9ba58080b27a77d3e2fb0be5998f/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d62ce1f483f355f61adb5433ebfd8868c5f078d1a52d042b0a998682b4fa8c27", size = 422910, upload-time = "2025-10-08T09:15:41.505Z" }, - { url = 
"https://files.pythonhosted.org/packages/f0/03/42106dcded51f0a0b5284d3ce30a671e7bd3f7318d122b2ead66ad289fed/msgpack-1.1.2-cp314-cp314t-win32.whl", hash = "sha256:1d1418482b1ee984625d88aa9585db570180c286d942da463533b238b98b812b", size = 75197, upload-time = "2025-10-08T09:15:42.954Z" }, - { url = "https://files.pythonhosted.org/packages/15/86/d0071e94987f8db59d4eeb386ddc64d0bb9b10820a8d82bcd3e53eeb2da6/msgpack-1.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:5a46bf7e831d09470ad92dff02b8b1ac92175ca36b087f904a0519857c6be3ff", size = 85772, upload-time = "2025-10-08T09:15:43.954Z" }, - { url = "https://files.pythonhosted.org/packages/81/f2/08ace4142eb281c12701fc3b93a10795e4d4dc7f753911d836675050f886/msgpack-1.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d99ef64f349d5ec3293688e91486c5fdb925ed03807f64d98d205d2713c60b46", size = 70868, upload-time = "2025-10-08T09:15:44.959Z" }, -] - -[[package]] -name = "myst-nb" -version = "0.17.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "ipykernel" }, - { name = "ipython" }, - { name = "jupyter-cache" }, - { name = "myst-parser" }, - { name = "nbclient" }, - { name = "nbformat" }, - { name = "pyyaml" }, - { name = "sphinx" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/15/e6/6ec454f930f1d542f1c1d9562d939acc41220408ff996b7c5b3b957fba1d/myst-nb-0.17.2.tar.gz", hash = "sha256:0f61386515fab07c73646adca97fff2f69f41e90d313a260217c5bbe419d858b", size = 74184, upload-time = "2023-04-21T12:38:17.712Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/72/84/38b6468146945cf0466a149763d62bd0264cb221eaa74c979498ea215f22/myst_nb-0.17.2-py3-none-any.whl", hash = "sha256:132ca4d0f5c308fdd4b6fdaba077712e28e119ccdafd04d6e41b51aac5483494", size = 78636, upload-time = "2023-04-21T12:38:15.807Z" }, -] - -[[package]] -name = "myst-parser" -version = "0.18.1" -source = { registry = "https://pypi.org/simple" } 
-dependencies = [ - { name = "docutils" }, - { name = "jinja2" }, - { name = "markdown-it-py" }, - { name = "mdit-py-plugins" }, - { name = "pyyaml" }, - { name = "sphinx" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/68/13/91438d3b835a022fcacd858a7106d4813cfccf98b1fd9a6196cfa2c859df/myst-parser-0.18.1.tar.gz", hash = "sha256:79317f4bb2c13053dd6e64f9da1ba1da6cd9c40c8a430c447a7b146a594c246d", size = 64147, upload-time = "2022-09-27T09:57:45.183Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/72/fd/594c936c65e707deda5670e8fff5ca2c948a12e922813eab5d316694e9ca/myst_parser-0.18.1-py3-none-any.whl", hash = "sha256:61b275b85d9f58aa327f370913ae1bec26ebad372cc99f3ab85c8ec3ee8d9fb8", size = 58157, upload-time = "2022-09-27T09:57:42.689Z" }, -] - -[[package]] -name = "nbclient" -version = "0.7.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jupyter-client" }, - { name = "jupyter-core" }, - { name = "nbformat" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c8/ee/b9351110fbbc8229863cbc54454f1db91f7836c730018d674a188ede5efd/nbclient-0.7.4.tar.gz", hash = "sha256:d447f0e5a4cfe79d462459aec1b3dc5c2e9152597262be8ee27f7d4c02566a0d", size = 60682, upload-time = "2023-04-25T14:38:02.404Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f3/97/d35da363d1df4a68f1b3d44335f80235487d7ca77d1f606b0c3523118f34/nbclient-0.7.4-py3-none-any.whl", hash = "sha256:c817c0768c5ff0d60e468e017613e6eae27b6fa31e43f905addd2d24df60c125", size = 73120, upload-time = "2023-04-25T14:38:00.327Z" }, -] - -[[package]] -name = "nbformat" -version = "5.10.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "fastjsonschema" }, - { name = "jsonschema" }, - { name = "jupyter-core" }, - { name = "traitlets" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/6d/fd/91545e604bc3dad7dca9ed03284086039b294c6b3d75c0d2fa45f9e9caf3/nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a", size = 142749, upload-time = "2024-04-04T11:20:37.371Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, -] - -[[package]] -name = "ndindex" -version = "1.10.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f5/92/4b9d2f4e0f3eabcfc7b02b48261f6e5ad36a3e2c1bbdcc4e3b7b6c768fa6/ndindex-1.10.1.tar.gz", hash = "sha256:0f6113c1f031248f8818cbee1aa92aa3c9472b7701debcce9fddebcd2f610f11", size = 271395, upload-time = "2025-11-19T20:40:08.899Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/60/ea/03676266cb38cc671679a9d258cc59bfc58c69726db87b0d6eeafb308895/ndindex-1.10.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:157b5c34a1b779f5d27b790d9bd7e7b156d284e76be83c591a3ba003984f4956", size = 176323, upload-time = "2025-11-19T20:38:53.528Z" }, - { url = "https://files.pythonhosted.org/packages/89/f4/2d350439031b108b0bb8897cad315390c5ad88c14d87419a54c2ffa95c80/ndindex-1.10.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f99b3e89220da3244d03c9c5473669c7107d361c129fd9b064622744dee1ce15", size = 175584, upload-time = "2025-11-19T20:38:57.968Z" }, - { url = "https://files.pythonhosted.org/packages/77/34/a51b7c6f7159718a6a0a694fc1058b94d793c416d9a4fd649f1924cce5f8/ndindex-1.10.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6928e47fb008903f2e41309b7ff1e59b16abbcd59e2e945454571c28b2433c9e", size = 524127, upload-time = "2025-11-19T20:38:59.412Z" }, - { url = 
"https://files.pythonhosted.org/packages/21/91/d8f19f0b8fc9c5585b50fda44c05415da0bdc5fa9c9c69011015dac27880/ndindex-1.10.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e69a2cb1ac7be955c3c77f1def83f410775a81525c9ce2d4c0a3f2a61589ed47", size = 528213, upload-time = "2025-11-19T20:39:00.882Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a9/77d9d037e871a3faa8579b354ca2dd09cc5bbf3e085d9e3c67f786d55ee3/ndindex-1.10.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cb76e0f3f235d8b1c768b17e771de48775d281713795c3aa045e8114ad61bdda", size = 1492172, upload-time = "2025-11-19T20:39:02.387Z" }, - { url = "https://files.pythonhosted.org/packages/ac/29/ad13676fc9312e0aa1a80a7c04bcb0b502b877ed4956136117ad663eced0/ndindex-1.10.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7da34a78410c14341d5fff73be5ce924bd36500bf7f640fc59b8607d3a0df95e", size = 1552614, upload-time = "2025-11-19T20:39:04.232Z" }, - { url = "https://files.pythonhosted.org/packages/63/34/e6e6fd81423810c07ae623c4d36e099f42a812994977e8e3bfa182c02472/ndindex-1.10.1-cp313-cp313-win32.whl", hash = "sha256:9599fcb7411ffe601c367f0a5d4bc0ed588e3e7d9dc7604bdb32c8f669456b9e", size = 149330, upload-time = "2025-11-19T20:39:05.727Z" }, - { url = "https://files.pythonhosted.org/packages/4d/d3/830a20626e2ec0e31a926be90e67068a029930f99e6cfebf2f9768e7b7b1/ndindex-1.10.1-cp313-cp313-win_amd64.whl", hash = "sha256:ef3ef22390a892d16286505083ee5b326317b21c255a0c7f744b1290a0b964a6", size = 157309, upload-time = "2025-11-19T20:39:07.394Z" }, - { url = "https://files.pythonhosted.org/packages/4a/73/3bdeecd1f6ec0ad81478a53d96da4ba9be74ed297c95f2b4fbe2b80843e1/ndindex-1.10.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:72af787dcee3661f36fff9d144d989aacefe32e2c8b51ceef9babd46afb93a18", size = 181022, upload-time = "2025-11-19T20:39:10.487Z" }, - { url = 
"https://files.pythonhosted.org/packages/b9/b1/0d97ba134b5aa71b5ed638fac193a7ec4d987e091e2f4e4162ebdaacbda1/ndindex-1.10.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fa60637dfae1ee3fc057e420a52cc4ace38cf2c0d1a0451af2a3cba84d281842", size = 181289, upload-time = "2025-11-19T20:39:11.793Z" }, - { url = "https://files.pythonhosted.org/packages/e2/d7/1df02df24880ce3f3c8137b6f3ca5a901a58d9079dcfd8c818419277ff87/ndindex-1.10.1-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d0ebdba2fade3f6916fe21fd49e2a0935af4f58c56100a60f3f2eb26e20baee7", size = 632517, upload-time = "2025-11-19T20:39:13.259Z" }, - { url = "https://files.pythonhosted.org/packages/34/96/b509c2b14e9b10710fe6ab6ba8bda1ee6ce36ab16397ff2f5bbb33bbbba3/ndindex-1.10.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:346a4bf09f5771548665c8206e81daadb6b9925d409746e709894bdd98adc701", size = 616179, upload-time = "2025-11-19T20:39:14.757Z" }, - { url = "https://files.pythonhosted.org/packages/38/e3/f89d60cf351c33a484bf1a4546a5dee6f4e7a6a973613ffa12bd316b14ad/ndindex-1.10.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:23d35696f802548143b5cc199bf2f171efb0061aa7934959251dd3bae56d038c", size = 1588373, upload-time = "2025-11-19T20:39:16.62Z" }, - { url = "https://files.pythonhosted.org/packages/ee/19/002fc1e6a4abeef8d92e9aa2e43aea4d462f6b170090f7752ea8887f4897/ndindex-1.10.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a91e1a0398120233d5c3b23ccb2d4b78e970d66136f1a7221fa9a53873c3d5c5", size = 1636436, upload-time = "2025-11-19T20:39:18.266Z" }, - { url = "https://files.pythonhosted.org/packages/5f/8f/28b1ad78c787ac8fafd6e26419a80366617784b1779e3857fa687492f6bc/ndindex-1.10.1-cp313-cp313t-win32.whl", hash = "sha256:78bfe25941d2dac406391ddd9baf0b0fce163807b98ecc2c47a3030ee8466319", size = 158780, upload-time = "2025-11-19T20:39:20.454Z" }, - { url = 
"https://files.pythonhosted.org/packages/d0/56/b81060607a19865bb8be8d705b1b3e8aefb8747c0fbd383e38b4cae4bd71/ndindex-1.10.1-cp313-cp313t-win_amd64.whl", hash = "sha256:08bfdc1f7a0b408d15b3ce61d141ebbebdb47a25341967e425e104c5bd512a5c", size = 167485, upload-time = "2025-11-19T20:39:21.733Z" }, - { url = "https://files.pythonhosted.org/packages/da/9b/aac1131e9f3a5635ba7b0312c3bfa610511ab4108f85c0d914a32887aa00/ndindex-1.10.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9b5297f207ebc068c7cdf9e3cd7b95aa5c9ec04295d0a7e56b529f66787d4685", size = 176478, upload-time = "2025-11-19T20:39:23.747Z" }, - { url = "https://files.pythonhosted.org/packages/1a/05/a0d8ca0432c84550bc17af6d6479a803936895b8b8403a1216c5a55475fb/ndindex-1.10.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c5e9762452b163e33cfb6e821f86e45ba0b53bdfcd23ab5d57b48a8f566898cb", size = 175480, upload-time = "2025-11-19T20:39:25.365Z" }, - { url = "https://files.pythonhosted.org/packages/09/4a/028ab78a9f29fd2a7e86a90337cde4658eaa77b425c63045d83a1d2e4f26/ndindex-1.10.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cf80241b40adffdc3276b2c9fb63a96c6c98b4a9d941892738de8add65083962", size = 528125, upload-time = "2025-11-19T20:39:26.798Z" }, - { url = "https://files.pythonhosted.org/packages/00/a9/bd823b345fb06c83ade6ef1c1933521d4357cd04490e684d4fa30126926c/ndindex-1.10.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf5855881884b8467dfcf45764ccf2e4279075be14b155b89c96994bb08d2e6f", size = 527328, upload-time = "2025-11-19T20:39:28.292Z" }, - { url = "https://files.pythonhosted.org/packages/91/4f/40b9c15588cbf9dde43c4fb88a31dd1f636a913fa29649f18f8e3ebca36a/ndindex-1.10.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e81a9bd36fe054b6c9fcc53d26bc9a28cf15d1ab52a0f5b854f894116f3a54e1", size = 1497508, upload-time = "2025-11-19T20:39:30.735Z" }, - { url = 
"https://files.pythonhosted.org/packages/24/8f/b8048f7837d2e9dff0af507b398307fa84a2aa9ea3db71b4aa800b21da4a/ndindex-1.10.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:588e8875d836a93b3cd9af482c8074bb02288ae1aff92cf277e1f02d9ae0f992", size = 1552625, upload-time = "2025-11-19T20:39:32.404Z" }, - { url = "https://files.pythonhosted.org/packages/20/aa/0ecb53c7e690a44769f2f92a843723ccb1d0ce080d93ba1ea811304cca12/ndindex-1.10.1-cp314-cp314-win32.whl", hash = "sha256:28741daca5926adff402247cd406f453ed5bb6042e82d6855938f805190e5ce9", size = 151237, upload-time = "2025-11-19T20:39:34.847Z" }, - { url = "https://files.pythonhosted.org/packages/8c/4e/197982fa8b4e6e6b9d15c38505c41076d1c552921f09f4d35acbbbbc0b70/ndindex-1.10.1-cp314-cp314-win_amd64.whl", hash = "sha256:59a3222befc0f7cdc85fb9b90a567ae890f70a864bdeb660517e9ebcb36bf1bc", size = 158925, upload-time = "2025-11-19T20:39:37.149Z" }, - { url = "https://files.pythonhosted.org/packages/24/ad/116b6154046a69fc04e2d4490905801d3839a3f21290c0b4d49b1044e251/ndindex-1.10.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967b87b88dadb62555ec1039695c347254eccb8ca3d124c0e5dbe084c525fa93", size = 181724, upload-time = "2025-11-19T20:39:38.635Z" }, - { url = "https://files.pythonhosted.org/packages/c4/00/3ce4351366c890bcc87a5e9f1f90102547962eef356ac7c799bfdd0dddce/ndindex-1.10.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c67dde588c0fb89d872931a4ed5f9b4d21c1c70a3d92fdf0812a1de154239816", size = 181653, upload-time = "2025-11-19T20:39:40.048Z" }, - { url = "https://files.pythonhosted.org/packages/4d/05/a6fda696a2f02a3f8dd2ee9d816cb2edff6423bf0110a4876cc3b1259732/ndindex-1.10.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c65ca639a7abf72d79f22424f4abd18dece1f289a2b7b028a0ca455edd2168d4", size = 630898, upload-time = "2025-11-19T20:39:41.495Z" }, - { url = 
"https://files.pythonhosted.org/packages/73/78/eb2e5d067d4c054451e33eaece74cbdcb58236dc60516e73d783dae34c7e/ndindex-1.10.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c3634a8df43e7928122225a3d64d850c8957bd1edf2e403907deacb478af27b", size = 614419, upload-time = "2025-11-19T20:39:43.254Z" }, - { url = "https://files.pythonhosted.org/packages/78/51/261bfb49eb7920c2a7314cacba5821930a529911dce48c7c6cd786096a5a/ndindex-1.10.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9d581f931e61f182478f18bdf5edd3955899df5da4892ed0d5de547a4cfd5b6f", size = 1587517, upload-time = "2025-11-19T20:39:44.809Z" }, - { url = "https://files.pythonhosted.org/packages/ec/37/084a332ecdf8b0049151bd78001a7baf2daf7f500d043beb8a1f95d0f4e3/ndindex-1.10.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:78ce45106ebf67aeba99714818c721d8fd5fb9534daebd2565665a2d64b50fc9", size = 1635372, upload-time = "2025-11-19T20:39:47.231Z" }, - { url = "https://files.pythonhosted.org/packages/28/f4/716580fbb03018ab1daa86ed12c1925c67e79689db5fee82393e840758a2/ndindex-1.10.1-cp314-cp314t-win32.whl", hash = "sha256:fe5341e24dc992b09c258456ac90a09a6d25efdc2cb86dcc91d32c8891e1df9a", size = 162186, upload-time = "2025-11-19T20:39:48.81Z" }, - { url = "https://files.pythonhosted.org/packages/4d/20/28f669c09a470e7f523b0cc10b94336664d9648594015e3f2a1ec29047b1/ndindex-1.10.1-cp314-cp314t-win_amd64.whl", hash = "sha256:37f87f0e7690ae0324334740e0661d6297f2e62c9bf925127d249fb7eddd0ad8", size = 171077, upload-time = "2025-11-19T20:39:50.108Z" }, -] - -[[package]] -name = "nest-asyncio" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, -] - -[[package]] -name = "networkx" -version = "3.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, -] - -[[package]] -name = "numexpr" -version = "2.14.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cb/2f/fdba158c9dbe5caca9c3eca3eaffffb251f2fb8674bf8e2d0aed5f38d319/numexpr-2.14.1.tar.gz", hash = "sha256:4be00b1086c7b7a5c32e31558122b7b80243fe098579b170967da83f3152b48b", size = 119400, upload-time = "2025-10-13T16:17:27.351Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/73/b4/9f6d637fd79df42be1be29ee7ba1f050fab63b7182cb922a0e08adc12320/numexpr-2.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:09078ba73cffe94745abfbcc2d81ab8b4b4e9d7bfbbde6cac2ee5dbf38eee222", size = 162794, upload-time = "2025-10-13T16:16:38.291Z" }, - { url = "https://files.pythonhosted.org/packages/35/ae/d58558d8043de0c49f385ea2fa789e3cfe4d436c96be80200c5292f45f15/numexpr-2.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:dce0b5a0447baa7b44bc218ec2d7dcd175b8eee6083605293349c0c1d9b82fb6", size = 152203, upload-time = "2025-10-13T16:16:39.907Z" }, - { url = "https://files.pythonhosted.org/packages/13/65/72b065f9c75baf8f474fd5d2b768350935989d4917db1c6c75b866d4067c/numexpr-2.14.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06855053de7a3a8425429bd996e8ae3c50b57637ad3e757e0fa0602a7874be30", size = 455860, upload-time = "2025-10-13T16:13:35.811Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f9/c9457652dfe28e2eb898372da2fe786c6db81af9540c0f853ee04a0699cc/numexpr-2.14.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c", size = 446574, upload-time = "2025-10-13T16:15:17.367Z" }, - { url = "https://files.pythonhosted.org/packages/b6/99/8d3879c4d67d3db5560cf2de65ce1778b80b75f6fa415eb5c3e7bd37ba27/numexpr-2.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c5f1b1605695778896534dfc6e130d54a65cd52be7ed2cd0cfee3981fd676bf5", size = 1417306, upload-time = "2025-10-13T16:13:42.813Z" }, - { url = "https://files.pythonhosted.org/packages/ea/05/6bddac9f18598ba94281e27a6943093f7d0976544b0cb5d92272c64719bd/numexpr-2.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a4ba71db47ea99c659d88ee6233fa77b6dc83392f1d324e0c90ddf617ae3f421", size = 1466145, upload-time = "2025-10-13T16:15:27.464Z" }, - { url = "https://files.pythonhosted.org/packages/24/5d/cbeb67aca0c5a76ead13df7e8bd8dd5e0d49145f90da697ba1d9f07005b0/numexpr-2.14.1-cp313-cp313-win32.whl", hash = "sha256:638dce8320f4a1483d5ca4fda69f60a70ed7e66be6e68bc23fb9f1a6b78a9e3b", size = 166996, upload-time = "2025-10-13T16:17:13.803Z" }, - { url = "https://files.pythonhosted.org/packages/cc/23/9281bceaeb282cead95f0aa5f7f222ffc895670ea689cc1398355f6e3001/numexpr-2.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:9fdcd4735121658a313f878fd31136d1bfc6a5b913219e7274e9fca9f8dac3bb", size = 160189, 
upload-time = "2025-10-13T16:17:15.417Z" }, - { url = "https://files.pythonhosted.org/packages/f3/76/7aac965fd93a56803cbe502aee2adcad667253ae34b0badf6c5af7908b6c/numexpr-2.14.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:557887ad7f5d3c2a40fd7310e50597045a68e66b20a77b3f44d7bc7608523b4b", size = 163524, upload-time = "2025-10-13T16:16:42.213Z" }, - { url = "https://files.pythonhosted.org/packages/58/65/79d592d5e63fbfab3b59a60c386853d9186a44a3fa3c87ba26bdc25b6195/numexpr-2.14.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:af111c8fe6fc55d15e4c7cab11920fc50740d913636d486545b080192cd0ad73", size = 152919, upload-time = "2025-10-13T16:16:44.229Z" }, - { url = "https://files.pythonhosted.org/packages/84/78/3c8335f713d4aeb99fa758d7c62f0be1482d4947ce5b508e2052bb7aeee9/numexpr-2.14.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:33265294376e7e2ae4d264d75b798a915d2acf37b9dd2b9405e8b04f84d05cfc", size = 465972, upload-time = "2025-10-13T16:13:45.061Z" }, - { url = "https://files.pythonhosted.org/packages/35/81/9ee5f69b811e8f18746c12d6f71848617684edd3161927f95eee7a305631/numexpr-2.14.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83647d846d3eeeb9a9255311236135286728b398d0d41d35dedb532dca807fe9", size = 456953, upload-time = "2025-10-13T16:15:31.186Z" }, - { url = "https://files.pythonhosted.org/packages/6d/39/9b8bc6e294d85cbb54a634e47b833e9f3276a8bdf7ce92aa808718a0212d/numexpr-2.14.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6e575fd3ad41ddf3355d0c7ef6bd0168619dc1779a98fe46693cad5e95d25e6e", size = 1426199, upload-time = "2025-10-13T16:13:48.231Z" }, - { url = "https://files.pythonhosted.org/packages/1e/ce/0d4fcd31ab49319740d934fba1734d7dad13aa485532ca754e555ca16c8b/numexpr-2.14.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:67ea4771029ce818573b1998f5ca416bd255156feea017841b86176a938f7d19", size = 1474214, upload-time = "2025-10-13T16:15:38.893Z" }, - { url = 
"https://files.pythonhosted.org/packages/b7/47/b2a93cbdb3ba4e009728ad1b9ef1550e2655ea2c86958ebaf03b9615f275/numexpr-2.14.1-cp313-cp313t-win32.whl", hash = "sha256:15015d47d3d1487072d58c0e7682ef2eb608321e14099c39d52e2dd689483611", size = 167676, upload-time = "2025-10-13T16:17:17.351Z" }, - { url = "https://files.pythonhosted.org/packages/86/99/ee3accc589ed032eea68e12172515ed96a5568534c213ad109e1f4411df1/numexpr-2.14.1-cp313-cp313t-win_amd64.whl", hash = "sha256:94c711f6d8f17dfb4606842b403699603aa591ab9f6bf23038b488ea9cfb0f09", size = 161096, upload-time = "2025-10-13T16:17:19.174Z" }, - { url = "https://files.pythonhosted.org/packages/ac/36/9db78dfbfdfa1f8bf0872993f1a334cdd8fca5a5b6567e47dcb128bcb7c2/numexpr-2.14.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ede79f7ff06629f599081de644546ce7324f1581c09b0ac174da88a470d39c21", size = 162848, upload-time = "2025-10-13T16:16:46.216Z" }, - { url = "https://files.pythonhosted.org/packages/13/c1/a5c78ae637402c5550e2e0ba175275d2515d432ec28af0cdc23c9b476e65/numexpr-2.14.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2eac7a5a2f70b3768c67056445d1ceb4ecd9b853c8eda9563823b551aeaa5082", size = 152270, upload-time = "2025-10-13T16:16:47.92Z" }, - { url = "https://files.pythonhosted.org/packages/9a/ed/aabd8678077848dd9a751c5558c2057839f5a09e2a176d8dfcd0850ee00e/numexpr-2.14.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5aedf38d4c0c19d3cecfe0334c3f4099fb496f54c146223d30fa930084bc8574", size = 455918, upload-time = "2025-10-13T16:13:50.338Z" }, - { url = "https://files.pythonhosted.org/packages/88/e1/3db65117f02cdefb0e5e4c440daf1c30beb45051b7f47aded25b7f4f2f34/numexpr-2.14.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439ec4d57b853792ebe5456e3160312281c3a7071ecac5532ded3278ede614de", size = 446512, upload-time = "2025-10-13T16:15:42.313Z" }, - { url = 
"https://files.pythonhosted.org/packages/9a/fb/7ceb9ee55b5f67e4a3e4d73d5af4c7e37e3c9f37f54bee90361b64b17e3f/numexpr-2.14.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e23b87f744e04e302d82ac5e2189ae20a533566aec76a46885376e20b0645bf8", size = 1417845, upload-time = "2025-10-13T16:13:53.836Z" }, - { url = "https://files.pythonhosted.org/packages/45/2d/9b5764d0eafbbb2889288f80de773791358acf6fad1a55767538d8b79599/numexpr-2.14.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:44f84e0e5af219dbb62a081606156420815890e041b87252fbcea5df55214c4c", size = 1466211, upload-time = "2025-10-13T16:15:48.985Z" }, - { url = "https://files.pythonhosted.org/packages/5d/21/204db708eccd71aa8bc55bcad55bc0fc6c5a4e01ad78e14ee5714a749386/numexpr-2.14.1-cp314-cp314-win32.whl", hash = "sha256:1f1a5e817c534539351aa75d26088e9e1e0ef1b3a6ab484047618a652ccc4fc3", size = 168835, upload-time = "2025-10-13T16:17:20.82Z" }, - { url = "https://files.pythonhosted.org/packages/4f/3e/d83e9401a1c3449a124f7d4b3fb44084798e0d30f7c11e60712d9b94cf11/numexpr-2.14.1-cp314-cp314-win_amd64.whl", hash = "sha256:587c41509bc373dfb1fe6086ba55a73147297247bedb6d588cda69169fc412f2", size = 162608, upload-time = "2025-10-13T16:17:22.228Z" }, - { url = "https://files.pythonhosted.org/packages/7f/d6/ec947806bb57836d6379a8c8a253c2aeaa602b12fef2336bfd2462bb4ed5/numexpr-2.14.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:ec368819502b64f190c3f71be14a304780b5935c42aae5bf22c27cc2cbba70b5", size = 163525, upload-time = "2025-10-13T16:16:50.133Z" }, - { url = "https://files.pythonhosted.org/packages/0d/77/048f30dcf661a3d52963a88c29b52b6d5ce996d38e9313a56a922451c1e0/numexpr-2.14.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e87f6d203ac57239de32261c941e9748f9309cbc0da6295eabd0c438b920d3a", size = 152917, upload-time = "2025-10-13T16:16:52.055Z" }, - { url = 
"https://files.pythonhosted.org/packages/9e/d3/956a13e628d722d649fbf2fded615134a308c082e122a48bad0e90a99ce9/numexpr-2.14.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd72d8c2a165fe45ea7650b16eb8cc1792a94a722022006bb97c86fe51fd2091", size = 466242, upload-time = "2025-10-13T16:13:55.795Z" }, - { url = "https://files.pythonhosted.org/packages/d6/dd/abe848678d82486940892f2cacf39e82eec790e8930d4d713d3f9191063b/numexpr-2.14.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70d80fcb418a54ca208e9a38e58ddc425c07f66485176b261d9a67c7f2864f73", size = 457149, upload-time = "2025-10-13T16:15:52.036Z" }, - { url = "https://files.pythonhosted.org/packages/fd/bb/797b583b5fb9da5700a5708ca6eb4f889c94d81abb28de4d642c0f4b3258/numexpr-2.14.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:edea2f20c2040df8b54ee8ca8ebda63de9545b2112872466118e9df4d0ae99f3", size = 1426493, upload-time = "2025-10-13T16:13:59.244Z" }, - { url = "https://files.pythonhosted.org/packages/77/c4/0519ab028fdc35e3e7ee700def7f2b4631b175cd9e1202bd7966c1695c33/numexpr-2.14.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:790447be6879a6c51b9545f79612d24c9ea0a41d537a84e15e6a8ddef0b6268e", size = 1474413, upload-time = "2025-10-13T16:15:59.211Z" }, - { url = "https://files.pythonhosted.org/packages/d4/4a/33044878c8f4a75213cfe9c11d4c02058bb710a7a063fe14f362e8de1077/numexpr-2.14.1-cp314-cp314t-win32.whl", hash = "sha256:538961096c2300ea44240209181e31fae82759d26b51713b589332b9f2a4117e", size = 169502, upload-time = "2025-10-13T16:17:23.829Z" }, - { url = "https://files.pythonhosted.org/packages/41/a2/5a1a2c72528b429337f49911b18c302ecd36eeab00f409147e1aa4ae4519/numexpr-2.14.1-cp314-cp314t-win_amd64.whl", hash = "sha256:a40b350cd45b4446076fa11843fa32bbe07024747aeddf6d467290bf9011b392", size = 163589, upload-time = "2025-10-13T16:17:25.696Z" }, -] - -[[package]] -name = "numpy" -version = "2.4.3" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/10/8b/c265f4823726ab832de836cdd184d0986dcf94480f81e8739692a7ac7af2/numpy-2.4.3.tar.gz", hash = "sha256:483a201202b73495f00dbc83796c6ae63137a9bdade074f7648b3e32613412dd", size = 20727743, upload-time = "2026-03-09T07:58:53.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/d0/1fe47a98ce0df229238b77611340aff92d52691bcbc10583303181abf7fc/numpy-2.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b346845443716c8e542d54112966383b448f4a3ba5c66409771b8c0889485dd3", size = 16665297, upload-time = "2026-03-09T07:56:52.296Z" }, - { url = "https://files.pythonhosted.org/packages/27/d9/4e7c3f0e68dfa91f21c6fb6cf839bc829ec920688b1ce7ec722b1a6202fb/numpy-2.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2629289168f4897a3c4e23dc98d6f1731f0fc0fe52fb9db19f974041e4cc12b9", size = 14691853, upload-time = "2026-03-09T07:56:54.992Z" }, - { url = "https://files.pythonhosted.org/packages/3a/66/bd096b13a87549683812b53ab211e6d413497f84e794fb3c39191948da97/numpy-2.4.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:bb2e3cf95854233799013779216c57e153c1ee67a0bf92138acca0e429aefaee", size = 5198435, upload-time = "2026-03-09T07:56:57.184Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2f/687722910b5a5601de2135c891108f51dfc873d8e43c8ed9f4ebb440b4a2/numpy-2.4.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:7f3408ff897f8ab07a07fbe2823d7aee6ff644c097cc1f90382511fe982f647f", size = 6546347, upload-time = "2026-03-09T07:56:59.531Z" }, - { url = "https://files.pythonhosted.org/packages/bf/ec/7971c4e98d86c564750393fab8d7d83d0a9432a9d78bb8a163a6dc59967a/numpy-2.4.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:decb0eb8a53c3b009b0962378065589685d66b23467ef5dac16cbe818afde27f", size = 15664626, upload-time = "2026-03-09T07:57:01.385Z" }, - { url = 
"https://files.pythonhosted.org/packages/7e/eb/7daecbea84ec935b7fc732e18f532073064a3816f0932a40a17f3349185f/numpy-2.4.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5f51900414fc9204a0e0da158ba2ac52b75656e7dce7e77fb9f84bfa343b4cc", size = 16608916, upload-time = "2026-03-09T07:57:04.008Z" }, - { url = "https://files.pythonhosted.org/packages/df/58/2a2b4a817ffd7472dca4421d9f0776898b364154e30c95f42195041dc03b/numpy-2.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6bd06731541f89cdc01b261ba2c9e037f1543df7472517836b78dfb15bd6e476", size = 17015824, upload-time = "2026-03-09T07:57:06.347Z" }, - { url = "https://files.pythonhosted.org/packages/4a/ca/627a828d44e78a418c55f82dd4caea8ea4a8ef24e5144d9e71016e52fb40/numpy-2.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22654fe6be0e5206f553a9250762c653d3698e46686eee53b399ab90da59bd92", size = 18334581, upload-time = "2026-03-09T07:57:09.114Z" }, - { url = "https://files.pythonhosted.org/packages/cd/c0/76f93962fc79955fcba30a429b62304332345f22d4daec1cb33653425643/numpy-2.4.3-cp313-cp313-win32.whl", hash = "sha256:d71e379452a2f670ccb689ec801b1218cd3983e253105d6e83780967e899d687", size = 5958618, upload-time = "2026-03-09T07:57:11.432Z" }, - { url = "https://files.pythonhosted.org/packages/b1/3c/88af0040119209b9b5cb59485fa48b76f372c73068dbf9254784b975ac53/numpy-2.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:0a60e17a14d640f49146cb38e3f105f571318db7826d9b6fef7e4dce758faecd", size = 12312824, upload-time = "2026-03-09T07:57:13.586Z" }, - { url = "https://files.pythonhosted.org/packages/58/ce/3d07743aced3d173f877c3ef6a454c2174ba42b584ab0b7e6d99374f51ed/numpy-2.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:c9619741e9da2059cd9c3f206110b97583c7152c1dc9f8aafd4beb450ac1c89d", size = 10221218, upload-time = "2026-03-09T07:57:16.183Z" }, - { url = 
"https://files.pythonhosted.org/packages/62/09/d96b02a91d09e9d97862f4fc8bfebf5400f567d8eb1fe4b0cc4795679c15/numpy-2.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7aa4e54f6469300ebca1d9eb80acd5253cdfa36f2c03d79a35883687da430875", size = 14819570, upload-time = "2026-03-09T07:57:18.564Z" }, - { url = "https://files.pythonhosted.org/packages/b5/ca/0b1aba3905fdfa3373d523b2b15b19029f4f3031c87f4066bd9d20ef6c6b/numpy-2.4.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d1b90d840b25874cf5cd20c219af10bac3667db3876d9a495609273ebe679070", size = 5326113, upload-time = "2026-03-09T07:57:21.052Z" }, - { url = "https://files.pythonhosted.org/packages/c0/63/406e0fd32fcaeb94180fd6a4c41e55736d676c54346b7efbce548b94a914/numpy-2.4.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a749547700de0a20a6718293396ec237bb38218049cfce788e08fcb716e8cf73", size = 6646370, upload-time = "2026-03-09T07:57:22.804Z" }, - { url = "https://files.pythonhosted.org/packages/b6/d0/10f7dc157d4b37af92720a196be6f54f889e90dcd30dce9dc657ed92c257/numpy-2.4.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94f3c4a151a2e529adf49c1d54f0f57ff8f9b233ee4d44af623a81553ab86368", size = 15723499, upload-time = "2026-03-09T07:57:24.693Z" }, - { url = "https://files.pythonhosted.org/packages/66/f1/d1c2bf1161396629701bc284d958dc1efa3a5a542aab83cf11ee6eb4cba5/numpy-2.4.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22c31dc07025123aedf7f2db9e91783df13f1776dc52c6b22c620870dc0fab22", size = 16657164, upload-time = "2026-03-09T07:57:27.676Z" }, - { url = "https://files.pythonhosted.org/packages/1a/be/cca19230b740af199ac47331a21c71e7a3d0ba59661350483c1600d28c37/numpy-2.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:148d59127ac95979d6f07e4d460f934ebdd6eed641db9c0db6c73026f2b2101a", size = 17081544, upload-time = "2026-03-09T07:57:30.664Z" }, - { url = 
"https://files.pythonhosted.org/packages/b9/c5/9602b0cbb703a0936fb40f8a95407e8171935b15846de2f0776e08af04c7/numpy-2.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a97cbf7e905c435865c2d939af3d93f99d18eaaa3cabe4256f4304fb51604349", size = 18380290, upload-time = "2026-03-09T07:57:33.763Z" }, - { url = "https://files.pythonhosted.org/packages/ed/81/9f24708953cd30be9ee36ec4778f4b112b45165812f2ada4cc5ea1c1f254/numpy-2.4.3-cp313-cp313t-win32.whl", hash = "sha256:be3b8487d725a77acccc9924f65fd8bce9af7fac8c9820df1049424a2115af6c", size = 6082814, upload-time = "2026-03-09T07:57:36.491Z" }, - { url = "https://files.pythonhosted.org/packages/e2/9e/52f6eaa13e1a799f0ab79066c17f7016a4a8ae0c1aefa58c82b4dab690b4/numpy-2.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1ec84fd7c8e652b0f4aaaf2e6e9cc8eaa9b1b80a537e06b2e3a2fb176eedcb26", size = 12452673, upload-time = "2026-03-09T07:57:38.281Z" }, - { url = "https://files.pythonhosted.org/packages/c4/04/b8cece6ead0b30c9fbd99bb835ad7ea0112ac5f39f069788c5558e3b1ab2/numpy-2.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:120df8c0a81ebbf5b9020c91439fccd85f5e018a927a39f624845be194a2be02", size = 10290907, upload-time = "2026-03-09T07:57:40.747Z" }, - { url = "https://files.pythonhosted.org/packages/70/ae/3936f79adebf8caf81bd7a599b90a561334a658be4dcc7b6329ebf4ee8de/numpy-2.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:5884ce5c7acfae1e4e1b6fde43797d10aa506074d25b531b4f54bde33c0c31d4", size = 16664563, upload-time = "2026-03-09T07:57:43.817Z" }, - { url = "https://files.pythonhosted.org/packages/9b/62/760f2b55866b496bb1fa7da2a6db076bef908110e568b02fcfc1422e2a3a/numpy-2.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:297837823f5bc572c5f9379b0c9f3a3365f08492cbdc33bcc3af174372ebb168", size = 14702161, upload-time = "2026-03-09T07:57:46.169Z" }, - { url = "https://files.pythonhosted.org/packages/32/af/a7a39464e2c0a21526fb4fb76e346fb172ebc92f6d1c7a07c2c139cc17b1/numpy-2.4.3-cp314-cp314-macosx_14_0_arm64.whl", hash = 
"sha256:a111698b4a3f8dcbe54c64a7708f049355abd603e619013c346553c1fd4ca90b", size = 5208738, upload-time = "2026-03-09T07:57:48.506Z" }, - { url = "https://files.pythonhosted.org/packages/29/8c/2a0cf86a59558fa078d83805589c2de490f29ed4fb336c14313a161d358a/numpy-2.4.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:4bd4741a6a676770e0e97fe9ab2e51de01183df3dcbcec591d26d331a40de950", size = 6543618, upload-time = "2026-03-09T07:57:50.591Z" }, - { url = "https://files.pythonhosted.org/packages/aa/b8/612ce010c0728b1c363fa4ea3aa4c22fe1c5da1de008486f8c2f5cb92fae/numpy-2.4.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54f29b877279d51e210e0c80709ee14ccbbad647810e8f3d375561c45ef613dd", size = 15680676, upload-time = "2026-03-09T07:57:52.34Z" }, - { url = "https://files.pythonhosted.org/packages/a9/7e/4f120ecc54ba26ddf3dc348eeb9eb063f421de65c05fc961941798feea18/numpy-2.4.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:679f2a834bae9020f81534671c56fd0cc76dd7e5182f57131478e23d0dc59e24", size = 16613492, upload-time = "2026-03-09T07:57:54.91Z" }, - { url = "https://files.pythonhosted.org/packages/2c/86/1b6020db73be330c4b45d5c6ee4295d59cfeef0e3ea323959d053e5a6909/numpy-2.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d84f0f881cb2225c2dfd7f78a10a5645d487a496c6668d6cc39f0f114164f3d0", size = 17031789, upload-time = "2026-03-09T07:57:57.641Z" }, - { url = "https://files.pythonhosted.org/packages/07/3a/3b90463bf41ebc21d1b7e06079f03070334374208c0f9a1f05e4ae8455e7/numpy-2.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d213c7e6e8d211888cc359bab7199670a00f5b82c0978b9d1c75baf1eddbeac0", size = 18339941, upload-time = "2026-03-09T07:58:00.577Z" }, - { url = "https://files.pythonhosted.org/packages/a8/74/6d736c4cd962259fd8bae9be27363eb4883a2f9069763747347544c2a487/numpy-2.4.3-cp314-cp314-win32.whl", hash = "sha256:52077feedeff7c76ed7c9f1a0428558e50825347b7545bbb8523da2cd55c547a", size = 6007503, 
upload-time = "2026-03-09T07:58:03.331Z" }, - { url = "https://files.pythonhosted.org/packages/48/39/c56ef87af669364356bb011922ef0734fc49dad51964568634c72a009488/numpy-2.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:0448e7f9caefb34b4b7dd2b77f21e8906e5d6f0365ad525f9f4f530b13df2afc", size = 12444915, upload-time = "2026-03-09T07:58:06.353Z" }, - { url = "https://files.pythonhosted.org/packages/9d/1f/ab8528e38d295fd349310807496fabb7cf9fe2e1f70b97bc20a483ea9d4a/numpy-2.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:b44fd60341c4d9783039598efadd03617fa28d041fc37d22b62d08f2027fa0e7", size = 10494875, upload-time = "2026-03-09T07:58:08.734Z" }, - { url = "https://files.pythonhosted.org/packages/e6/ef/b7c35e4d5ef141b836658ab21a66d1a573e15b335b1d111d31f26c8ef80f/numpy-2.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0a195f4216be9305a73c0e91c9b026a35f2161237cf1c6de9b681637772ea657", size = 14822225, upload-time = "2026-03-09T07:58:11.034Z" }, - { url = "https://files.pythonhosted.org/packages/cd/8d/7730fa9278cf6648639946cc816e7cc89f0d891602584697923375f801ed/numpy-2.4.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:cd32fbacb9fd1bf041bf8e89e4576b6f00b895f06d00914820ae06a616bdfef7", size = 5328769, upload-time = "2026-03-09T07:58:13.67Z" }, - { url = "https://files.pythonhosted.org/packages/47/01/d2a137317c958b074d338807c1b6a383406cdf8b8e53b075d804cc3d211d/numpy-2.4.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:2e03c05abaee1f672e9d67bc858f300b5ccba1c21397211e8d77d98350972093", size = 6649461, upload-time = "2026-03-09T07:58:15.912Z" }, - { url = "https://files.pythonhosted.org/packages/5c/34/812ce12bc0f00272a4b0ec0d713cd237cb390666eb6206323d1cc9cedbb2/numpy-2.4.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d1ce23cce91fcea443320a9d0ece9b9305d4368875bab09538f7a5b4131938a", size = 15725809, upload-time = "2026-03-09T07:58:17.787Z" }, - { url = 
"https://files.pythonhosted.org/packages/25/c0/2aed473a4823e905e765fee3dc2cbf504bd3e68ccb1150fbdabd5c39f527/numpy-2.4.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c59020932feb24ed49ffd03704fbab89f22aa9c0d4b180ff45542fe8918f5611", size = 16655242, upload-time = "2026-03-09T07:58:20.476Z" }, - { url = "https://files.pythonhosted.org/packages/f2/c8/7e052b2fc87aa0e86de23f20e2c42bd261c624748aa8efd2c78f7bb8d8c6/numpy-2.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9684823a78a6cd6ad7511fc5e25b07947d1d5b5e2812c93fe99d7d4195130720", size = 17080660, upload-time = "2026-03-09T07:58:23.067Z" }, - { url = "https://files.pythonhosted.org/packages/f3/3d/0876746044db2adcb11549f214d104f2e1be00f07a67edbb4e2812094847/numpy-2.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0200b25c687033316fb39f0ff4e3e690e8957a2c3c8d22499891ec58c37a3eb5", size = 18380384, upload-time = "2026-03-09T07:58:25.839Z" }, - { url = "https://files.pythonhosted.org/packages/07/12/8160bea39da3335737b10308df4f484235fd297f556745f13092aa039d3b/numpy-2.4.3-cp314-cp314t-win32.whl", hash = "sha256:5e10da9e93247e554bb1d22f8edc51847ddd7dde52d85ce31024c1b4312bfba0", size = 6154547, upload-time = "2026-03-09T07:58:28.289Z" }, - { url = "https://files.pythonhosted.org/packages/42/f3/76534f61f80d74cc9cdf2e570d3d4eeb92c2280a27c39b0aaf471eda7b48/numpy-2.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:45f003dbdffb997a03da2d1d0cb41fbd24a87507fb41605c0420a3db5bd4667b", size = 12633645, upload-time = "2026-03-09T07:58:30.384Z" }, - { url = "https://files.pythonhosted.org/packages/1f/b6/7c0d4334c15983cec7f92a69e8ce9b1e6f31857e5ee3a413ac424e6bd63d/numpy-2.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:4d382735cecd7bcf090172489a525cd7d4087bc331f7df9f60ddc9a296cf208e", size = 10565454, upload-time = "2026-03-09T07:58:33.031Z" }, -] - -[[package]] -name = "nvidia-cublas" -version = "13.1.0.3" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e1/a5/fce49e2ae977e0ccc084e5adafceb4f0ac0c8333cb6863501618a7277f67/nvidia_cublas-13.1.0.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c86fc7f7ae36d7528288c5d88098edcb7b02c633d262e7ddbb86b0ad91be5df2", size = 542851226, upload-time = "2025-10-09T08:59:04.818Z" }, - { url = "https://files.pythonhosted.org/packages/e7/44/423ac00af4dd95a5aeb27207e2c0d9b7118702149bf4704c3ddb55bb7429/nvidia_cublas-13.1.0.3-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:ee8722c1f0145ab246bccb9e452153b5e0515fd094c3678df50b2a0888b8b171", size = 423133236, upload-time = "2025-10-09T08:59:32.536Z" }, -] - -[[package]] -name = "nvidia-cuda-cupti" -version = "13.0.85" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/2a/80353b103fc20ce05ef51e928daed4b6015db4aaa9162ed0997090fe2250/nvidia_cuda_cupti-13.0.85-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:796bd679890ee55fb14a94629b698b6db54bcfd833d391d5e94017dd9d7d3151", size = 10310827, upload-time = "2025-09-04T08:26:42.012Z" }, - { url = "https://files.pythonhosted.org/packages/33/6d/737d164b4837a9bbd202f5ae3078975f0525a55730fe871d8ed4e3b952b0/nvidia_cuda_cupti-13.0.85-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:4eb01c08e859bf924d222250d2e8f8b8ff6d3db4721288cf35d14252a4d933c8", size = 10715597, upload-time = "2025-09-04T08:26:51.312Z" }, -] - -[[package]] -name = "nvidia-cuda-nvrtc" -version = "13.0.88" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/68/483a78f5e8f31b08fb1bb671559968c0ca3a065ac7acabfc7cee55214fd6/nvidia_cuda_nvrtc-13.0.88-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:ad9b6d2ead2435f11cbb6868809d2adeeee302e9bb94bcf0539c7a40d80e8575", size = 90215200, upload-time = "2025-09-04T08:28:44.204Z" }, - { url = 
"https://files.pythonhosted.org/packages/b7/dc/6bb80850e0b7edd6588d560758f17e0550893a1feaf436807d64d2da040f/nvidia_cuda_nvrtc-13.0.88-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d27f20a0ca67a4bb34268a5e951033496c5b74870b868bacd046b1b8e0c3267b", size = 43015449, upload-time = "2025-09-04T08:28:20.239Z" }, -] - -[[package]] -name = "nvidia-cuda-runtime" -version = "13.0.96" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/4f/17d7b9b8e285199c58ce28e31b5c5bbaa4d8271af06a89b6405258245de2/nvidia_cuda_runtime-13.0.96-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ef9bcbe90493a2b9d810e43d249adb3d02e98dd30200d86607d8d02687c43f55", size = 2261060, upload-time = "2025-10-09T08:55:15.78Z" }, - { url = "https://files.pythonhosted.org/packages/2e/24/d1558f3b68b1d26e706813b1d10aa1d785e4698c425af8db8edc3dced472/nvidia_cuda_runtime-13.0.96-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f82250d7782aa23b6cfe765ecc7db554bd3c2870c43f3d1821f1d18aebf0548", size = 2243632, upload-time = "2025-10-09T08:55:36.117Z" }, -] - -[[package]] -name = "nvidia-cudnn-cu13" -version = "9.19.0.56" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201, upload-time = "2026-02-03T20:40:53.805Z" }, - { url = "https://files.pythonhosted.org/packages/a3/22/0b4b932655d17a6da1b92fa92ab12844b053bb2ac2475e179ba6f043da1e/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:d20e1734305e9d68889a96e3f35094d733ff1f83932ebe462753973e53a572bf", size = 366066321, upload-time = "2026-02-03T20:44:52.837Z" }, -] - -[[package]] -name = "nvidia-cufft" -version = "12.0.0.61" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554, upload-time = "2025-09-04T08:31:38.196Z" }, - { url = "https://files.pythonhosted.org/packages/a8/2f/7b57e29836ea8714f81e9898409196f47d772d5ddedddf1592eadb8ab743/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6c44f692dce8fd5ffd3e3df134b6cdb9c2f72d99cf40b62c32dde45eea9ddad3", size = 214085489, upload-time = "2025-09-04T08:31:56.044Z" }, -] - -[[package]] -name = "nvidia-cufile" -version = "1.15.1.6" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3f/70/4f193de89a48b71714e74602ee14d04e4019ad36a5a9f20c425776e72cd6/nvidia_cufile-1.15.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08a3ecefae5a01c7f5117351c64f17c7c62efa5fffdbe24fc7d298da19cd0b44", size = 1223672, upload-time = "2025-09-04T08:32:22.779Z" }, - { url = "https://files.pythonhosted.org/packages/ab/73/cc4a14c9813a8a0d509417cf5f4bdaba76e924d58beb9864f5a7baceefbf/nvidia_cufile-1.15.1.6-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bdc0deedc61f548bddf7733bdc216456c2fdb101d020e1ab4b88d232d5e2f6d1", size = 1136992, upload-time = "2025-09-04T08:32:14.119Z" }, -] - -[[package]] -name = "nvidia-curand" -version = "10.4.0.35" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url 
= "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, - { url = "https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, -] - -[[package]] -name = "nvidia-cusolver" -version = "12.0.4.66" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "nvidia-cusparse", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760, upload-time = "2025-09-04T08:33:04.222Z" }, - { url = "https://files.pythonhosted.org/packages/5f/67/cba3777620cdacb99102da4042883709c41c709f4b6323c10781a9c3aa34/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:0a759da5dea5c0ea10fd307de75cdeb59e7ea4fcb8add0924859b944babf1112", size = 200941980, upload-time = "2025-09-04T08:33:22.767Z" }, -] - -[[package]] -name = "nvidia-cusparse" -version = "12.6.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -wheels = [ - { 
url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568, upload-time = "2025-09-04T08:33:42.864Z" }, - { url = "https://files.pythonhosted.org/packages/fa/18/623c77619c31d62efd55302939756966f3ecc8d724a14dab2b75f1508850/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b3c89c88d01ee0e477cb7f82ef60a11a4bcd57b6b87c33f789350b59759360b", size = 145942937, upload-time = "2025-09-04T08:33:58.029Z" }, -] - -[[package]] -name = "nvidia-cusparselt-cu13" -version = "0.8.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/46/10/8dcd1175260706a2fc92a16a52e306b71d4c1ea0b0cc4a9484183399818a/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:400c6ed1cf6780fc6efedd64ec9f1345871767e6a1a0a552a1ea0578117ea77c", size = 220791277, upload-time = "2025-08-13T19:22:40.982Z" }, - { url = "https://files.pythonhosted.org/packages/fd/53/43b0d71f4e702fa9733f8b4571fdca50a8813f1e450b656c239beff12315/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:25e30a8a7323935d4ad0340b95a0b69926eee755767e8e0b1cf8dd85b197d3fd", size = 169884119, upload-time = "2025-08-13T19:23:41.967Z" }, -] - -[[package]] -name = "nvidia-nccl-cu13" -version = "2.28.9" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/55/1920646a2e43ffd4fc958536b276197ed740e9e0c54105b4bb3521591fc7/nvidia_nccl_cu13-2.28.9-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:01c873ba1626b54caa12272ed228dc5b2781545e0ae8ba3f432a8ef1c6d78643", size = 196561677, upload-time = "2025-11-18T05:49:03.45Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/b4/878fefaad5b2bcc6fcf8d474a25e3e3774bc5133e4b58adff4d0bca238bc/nvidia_nccl_cu13-2.28.9-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:e4553a30f34195f3fa1da02a6da3d6337d28f2003943aa0a3d247bbc25fefc42", size = 196493177, upload-time = "2025-11-18T05:49:17.677Z" }, -] - -[[package]] -name = "nvidia-nvjitlink" -version = "13.0.88" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/56/7a/123e033aaff487c77107195fa5a2b8686795ca537935a24efae476c41f05/nvidia_nvjitlink-13.0.88-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:13a74f429e23b921c1109976abefacc69835f2f433ebd323d3946e11d804e47b", size = 40713933, upload-time = "2025-09-04T08:35:43.553Z" }, - { url = "https://files.pythonhosted.org/packages/ab/2c/93c5250e64df4f894f1cbb397c6fd71f79813f9fd79d7cd61de3f97b3c2d/nvidia_nvjitlink-13.0.88-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e931536ccc7d467a98ba1d8b89ff7fa7f1fa3b13f2b0069118cd7f47bff07d0c", size = 38768748, upload-time = "2025-09-04T08:35:20.008Z" }, -] - -[[package]] -name = "nvidia-nvshmem-cu13" -version = "3.4.5" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/0f/05cc9c720236dcd2db9c1ab97fff629e96821be2e63103569da0c9b72f19/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dc2a197f38e5d0376ad52cd1a2a3617d3cdc150fd5966f4aee9bcebb1d68fe9", size = 60215947, upload-time = "2025-09-06T00:32:20.022Z" }, - { url = "https://files.pythonhosted.org/packages/3c/35/a9bf80a609e74e3b000fef598933235c908fcefcef9026042b8e6dfde2a9/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:290f0a2ee94c9f3687a02502f3b9299a9f9fe826e6d0287ee18482e78d495b80", size = 60412546, upload-time = "2025-09-06T00:32:41.564Z" }, -] - -[[package]] -name = "nvidia-nvtx" 
-version = "13.0.85" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/f3/d86c845465a2723ad7e1e5c36dcd75ddb82898b3f53be47ebd429fb2fa5d/nvidia_nvtx-13.0.85-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4936d1d6780fbe68db454f5e72a42ff64d1fd6397df9f363ae786930fd5c1cd4", size = 148047, upload-time = "2025-09-04T08:29:01.761Z" }, - { url = "https://files.pythonhosted.org/packages/a8/64/3708a90d1ebe202ffdeb7185f878a3c84d15c2b2c31858da2ce0583e2def/nvidia_nvtx-13.0.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb7780edb6b14107373c835bf8b72e7a178bac7367e23da7acb108f973f157a6", size = 148878, upload-time = "2025-09-04T08:28:53.627Z" }, -] - -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, -] - -[[package]] -name = "optuna" -version = "4.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "alembic" }, - { name = "colorlog" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pyyaml" }, - { name = "sqlalchemy" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bf/9b/62f120fb2ecbc4338bee70c5a3671c8e561714f3aa1a046b897ff142050e/optuna-4.8.0.tar.gz", hash = 
"sha256:6f7043e9f8ecb5e607af86a7eb00fb5ec2be26c3b08c201209a73d36aff37a38", size = 482603, upload-time = "2026-03-16T04:59:58.659Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/24/7c731839566d30dc70556d9824ef17692d896c15e3df627bce8c16f753e1/optuna-4.8.0-py3-none-any.whl", hash = "sha256:c57a7682679c36bfc9bca0da430698179e513874074b71bebedb0334964ab930", size = 419456, upload-time = "2026-03-16T04:59:56.977Z" }, -] - -[[package]] -name = "packaging" -version = "26.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, -] - -[[package]] -name = "pandas" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "python-dateutil" }, - { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2e/0c/b28ed414f080ee0ad153f848586d61d1878f91689950f037f976ce15f6c8/pandas-3.0.1.tar.gz", hash = "sha256:4186a699674af418f655dbd420ed87f50d56b4cd6603784279d9eef6627823c8", size = 4641901, upload-time = "2026-02-17T22:20:16.434Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/48/aad6ec4f8d007534c091e9a7172b3ec1b1ee6d99a9cbb936b5eab6c6cf58/pandas-3.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5272627187b5d9c20e55d27caf5f2cd23e286aba25cadf73c8590e432e2b7262", size = 10317509, upload-time = 
"2026-02-17T22:18:59.498Z" }, - { url = "https://files.pythonhosted.org/packages/a8/14/5990826f779f79148ae9d3a2c39593dc04d61d5d90541e71b5749f35af95/pandas-3.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:661e0f665932af88c7877f31da0dc743fe9c8f2524bdffe23d24fdcb67ef9d56", size = 9860561, upload-time = "2026-02-17T22:19:02.265Z" }, - { url = "https://files.pythonhosted.org/packages/fa/80/f01ff54664b6d70fed71475543d108a9b7c888e923ad210795bef04ffb7d/pandas-3.0.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:75e6e292ff898679e47a2199172593d9f6107fd2dd3617c22c2946e97d5df46e", size = 10365506, upload-time = "2026-02-17T22:19:05.017Z" }, - { url = "https://files.pythonhosted.org/packages/f2/85/ab6d04733a7d6ff32bfc8382bf1b07078228f5d6ebec5266b91bfc5c4ff7/pandas-3.0.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1ff8cf1d2896e34343197685f432450ec99a85ba8d90cce2030c5eee2ef98791", size = 10873196, upload-time = "2026-02-17T22:19:07.204Z" }, - { url = "https://files.pythonhosted.org/packages/48/a9/9301c83d0b47c23ac5deab91c6b39fd98d5b5db4d93b25df8d381451828f/pandas-3.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eca8b4510f6763f3d37359c2105df03a7a221a508f30e396a51d0713d462e68a", size = 11370859, upload-time = "2026-02-17T22:19:09.436Z" }, - { url = "https://files.pythonhosted.org/packages/59/fe/0c1fc5bd2d29c7db2ab372330063ad555fb83e08422829c785f5ec2176ca/pandas-3.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:06aff2ad6f0b94a17822cf8b83bbb563b090ed82ff4fe7712db2ce57cd50d9b8", size = 11924584, upload-time = "2026-02-17T22:19:11.562Z" }, - { url = "https://files.pythonhosted.org/packages/d6/7d/216a1588b65a7aa5f4535570418a599d943c85afb1d95b0876fc00aa1468/pandas-3.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:9fea306c783e28884c29057a1d9baa11a349bbf99538ec1da44c8476563d1b25", size = 9742769, upload-time = "2026-02-17T22:19:13.926Z" }, - { url = 
"https://files.pythonhosted.org/packages/c4/cb/810a22a6af9a4e97c8ab1c946b47f3489c5bca5adc483ce0ffc84c9cc768/pandas-3.0.1-cp313-cp313-win_arm64.whl", hash = "sha256:a8d37a43c52917427e897cb2e429f67a449327394396a81034a4449b99afda59", size = 9043855, upload-time = "2026-02-17T22:19:16.09Z" }, - { url = "https://files.pythonhosted.org/packages/92/fa/423c89086cca1f039cf1253c3ff5b90f157b5b3757314aa635f6bf3e30aa/pandas-3.0.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d54855f04f8246ed7b6fc96b05d4871591143c46c0b6f4af874764ed0d2d6f06", size = 10752673, upload-time = "2026-02-17T22:19:18.304Z" }, - { url = "https://files.pythonhosted.org/packages/22/23/b5a08ec1f40020397f0faba72f1e2c11f7596a6169c7b3e800abff0e433f/pandas-3.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e1b677accee34a09e0dc2ce5624e4a58a1870ffe56fc021e9caf7f23cd7668f", size = 10404967, upload-time = "2026-02-17T22:19:20.726Z" }, - { url = "https://files.pythonhosted.org/packages/5c/81/94841f1bb4afdc2b52a99daa895ac2c61600bb72e26525ecc9543d453ebc/pandas-3.0.1-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9cabbdcd03f1b6cd254d6dda8ae09b0252524be1592594c00b7895916cb1324", size = 10320575, upload-time = "2026-02-17T22:19:24.919Z" }, - { url = "https://files.pythonhosted.org/packages/0a/8b/2ae37d66a5342a83adadfd0cb0b4bf9c3c7925424dd5f40d15d6cfaa35ee/pandas-3.0.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ae2ab1f166668b41e770650101e7090824fd34d17915dd9cd479f5c5e0065e9", size = 10710921, upload-time = "2026-02-17T22:19:27.181Z" }, - { url = "https://files.pythonhosted.org/packages/a2/61/772b2e2757855e232b7ccf7cb8079a5711becb3a97f291c953def15a833f/pandas-3.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6bf0603c2e30e2cafac32807b06435f28741135cb8697eae8b28c7d492fc7d76", size = 11334191, upload-time = "2026-02-17T22:19:29.411Z" }, - { url = 
"https://files.pythonhosted.org/packages/1b/08/b16c6df3ef555d8495d1d265a7963b65be166785d28f06a350913a4fac78/pandas-3.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6c426422973973cae1f4a23e51d4ae85974f44871b24844e4f7de752dd877098", size = 11782256, upload-time = "2026-02-17T22:19:32.34Z" }, - { url = "https://files.pythonhosted.org/packages/55/80/178af0594890dee17e239fca96d3d8670ba0f5ff59b7d0439850924a9c09/pandas-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b03f91ae8c10a85c1613102c7bef5229b5379f343030a3ccefeca8a33414cf35", size = 10485047, upload-time = "2026-02-17T22:19:34.605Z" }, - { url = "https://files.pythonhosted.org/packages/bb/8b/4bb774a998b97e6c2fd62a9e6cfdaae133b636fd1c468f92afb4ae9a447a/pandas-3.0.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:99d0f92ed92d3083d140bf6b97774f9f13863924cf3f52a70711f4e7588f9d0a", size = 10322465, upload-time = "2026-02-17T22:19:36.803Z" }, - { url = "https://files.pythonhosted.org/packages/72/3a/5b39b51c64159f470f1ca3b1c2a87da290657ca022f7cd11442606f607d1/pandas-3.0.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3b66857e983208654294bb6477b8a63dee26b37bdd0eb34d010556e91261784f", size = 9910632, upload-time = "2026-02-17T22:19:39.001Z" }, - { url = "https://files.pythonhosted.org/packages/4e/f7/b449ffb3f68c11da12fc06fbf6d2fa3a41c41e17d0284d23a79e1c13a7e4/pandas-3.0.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56cf59638bf24dc9bdf2154c81e248b3289f9a09a6d04e63608c159022352749", size = 10440535, upload-time = "2026-02-17T22:19:41.157Z" }, - { url = "https://files.pythonhosted.org/packages/55/77/6ea82043db22cb0f2bbfe7198da3544000ddaadb12d26be36e19b03a2dc5/pandas-3.0.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1a9f55e0f46951874b863d1f3906dcb57df2d9be5c5847ba4dfb55b2c815249", size = 10893940, upload-time = "2026-02-17T22:19:43.493Z" }, - { url = 
"https://files.pythonhosted.org/packages/03/30/f1b502a72468c89412c1b882a08f6eed8a4ee9dc033f35f65d0663df6081/pandas-3.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1849f0bba9c8a2fb0f691d492b834cc8dadf617e29015c66e989448d58d011ee", size = 11442711, upload-time = "2026-02-17T22:19:46.074Z" }, - { url = "https://files.pythonhosted.org/packages/0d/f0/ebb6ddd8fc049e98cabac5c2924d14d1dda26a20adb70d41ea2e428d3ec4/pandas-3.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3d288439e11b5325b02ae6e9cc83e6805a62c40c5a6220bea9beb899c073b1c", size = 11963918, upload-time = "2026-02-17T22:19:48.838Z" }, - { url = "https://files.pythonhosted.org/packages/09/f8/8ce132104074f977f907442790eaae24e27bce3b3b454e82faa3237ff098/pandas-3.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:93325b0fe372d192965f4cca88d97667f49557398bbf94abdda3bf1b591dbe66", size = 9862099, upload-time = "2026-02-17T22:19:51.081Z" }, - { url = "https://files.pythonhosted.org/packages/e6/b7/6af9aac41ef2456b768ef0ae60acf8abcebb450a52043d030a65b4b7c9bd/pandas-3.0.1-cp314-cp314-win_arm64.whl", hash = "sha256:97ca08674e3287c7148f4858b01136f8bdfe7202ad25ad04fec602dd1d29d132", size = 9185333, upload-time = "2026-02-17T22:19:53.266Z" }, - { url = "https://files.pythonhosted.org/packages/66/fc/848bb6710bc6061cb0c5badd65b92ff75c81302e0e31e496d00029fe4953/pandas-3.0.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:58eeb1b2e0fb322befcf2bbc9ba0af41e616abadb3d3414a6bc7167f6cbfce32", size = 10772664, upload-time = "2026-02-17T22:19:55.806Z" }, - { url = "https://files.pythonhosted.org/packages/69/5c/866a9bbd0f79263b4b0db6ec1a341be13a1473323f05c122388e0f15b21d/pandas-3.0.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cd9af1276b5ca9e298bd79a26bda32fa9cc87ed095b2a9a60978d2ca058eaf87", size = 10421286, upload-time = "2026-02-17T22:19:58.091Z" }, - { url = 
"https://files.pythonhosted.org/packages/51/a4/2058fb84fb1cfbfb2d4a6d485e1940bb4ad5716e539d779852494479c580/pandas-3.0.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94f87a04984d6b63788327cd9f79dda62b7f9043909d2440ceccf709249ca988", size = 10342050, upload-time = "2026-02-17T22:20:01.376Z" }, - { url = "https://files.pythonhosted.org/packages/22/1b/674e89996cc4be74db3c4eb09240c4bb549865c9c3f5d9b086ff8fcfbf00/pandas-3.0.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85fe4c4df62e1e20f9db6ebfb88c844b092c22cd5324bdcf94bfa2fc1b391221", size = 10740055, upload-time = "2026-02-17T22:20:04.328Z" }, - { url = "https://files.pythonhosted.org/packages/d0/f8/e954b750764298c22fa4614376531fe63c521ef517e7059a51f062b87dca/pandas-3.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:331ca75a2f8672c365ae25c0b29e46f5ac0c6551fdace8eec4cd65e4fac271ff", size = 11357632, upload-time = "2026-02-17T22:20:06.647Z" }, - { url = "https://files.pythonhosted.org/packages/6d/02/c6e04b694ffd68568297abd03588b6d30295265176a5c01b7459d3bc35a3/pandas-3.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:15860b1fdb1973fffade772fdb931ccf9b2f400a3f5665aef94a00445d7d8dd5", size = 11810974, upload-time = "2026-02-17T22:20:08.946Z" }, - { url = "https://files.pythonhosted.org/packages/89/41/d7dfb63d2407f12055215070c42fc6ac41b66e90a2946cdc5e759058398b/pandas-3.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:44f1364411d5670efa692b146c748f4ed013df91ee91e9bec5677fb1fd58b937", size = 10884622, upload-time = "2026-02-17T22:20:11.711Z" }, - { url = "https://files.pythonhosted.org/packages/68/b0/34937815889fa982613775e4b97fddd13250f11012d769949c5465af2150/pandas-3.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:108dd1790337a494aa80e38def654ca3f0968cf4f362c85f44c15e471667102d", size = 9452085, upload-time = "2026-02-17T22:20:14.331Z" }, -] - -[[package]] -name = "parso" -version = "0.8.6" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/81/76/a1e769043c0c0c9fe391b702539d594731a4362334cdf4dc25d0c09761e7/parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd", size = 401621, upload-time = "2026-02-09T15:45:24.425Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff", size = 106894, upload-time = "2026-02-09T15:45:21.391Z" }, -] - -[[package]] -name = "patsy" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/be/44/ed13eccdd0519eff265f44b670d46fbb0ec813e2274932dc1c0e48520f7d/patsy-1.0.2.tar.gz", hash = "sha256:cdc995455f6233e90e22de72c37fcadb344e7586fb83f06696f54d92f8ce74c0", size = 399942, upload-time = "2025-10-20T16:17:37.535Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/70/ba4b949bdc0490ab78d545459acd7702b211dfccf7eb89bbc1060f52818d/patsy-1.0.2-py2.py3-none-any.whl", hash = "sha256:37bfddbc58fcf0362febb5f54f10743f8b21dd2aa73dec7e7ef59d1b02ae668a", size = 233301, upload-time = "2025-10-20T16:17:36.563Z" }, -] - -[[package]] -name = "pexpect" -version = "4.9.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ptyprocess", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, -] - -[[package]] -name = "platformdirs" -version = "4.10.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/47/e4501f49c178ae1d9f4a75073fda4204f52647993f075a9db4d14930e0c5/platformdirs-4.10.0.tar.gz", hash = "sha256:31e761a6a0ca04faf7353ea759bdba55652be214725111e5aac52dfa29d4bef7", size = 31224, upload-time = "2026-05-28T03:32:53.587Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/e6/cd9575ac904136b3cbf7aa7ee819ef86eedb7274e46f230e94ea4342e729/platformdirs-4.10.0-py3-none-any.whl", hash = "sha256:fb516cdb12eb0d857d0cd85a7c57cea4d060bee4578d6cf5a14dfdf8cbf8784a", size = 22743, upload-time = "2026-05-28T03:32:52.175Z" }, -] - -[[package]] -name = "plotly" -version = "5.24.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging" }, - { name = "tenacity" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/79/4f/428f6d959818d7425a94c190a6b26fbc58035cbef40bf249be0b62a9aedd/plotly-5.24.1.tar.gz", hash = "sha256:dbc8ac8339d248a4bcc36e08a5659bacfe1b079390b8953533f4eb22169b4bae", size = 9479398, upload-time = "2024-09-12T15:36:31.068Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/ae/580600f441f6fc05218bd6c9d5794f4aef072a7d9093b291f1c50a9db8bc/plotly-5.24.1-py3-none-any.whl", hash = "sha256:f67073a1e637eb0dc3e46324d9d51e2fe76e9727c892dde64ddf1e1b51f29089", size = 19054220, upload-time = "2024-09-12T15:36:24.08Z" }, -] - -[[package]] -name = "pluggy" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, -] - -[[package]] -name = "polars" -version = "1.39.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "polars-runtime-32" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/93/ab/f19e592fce9e000da49c96bf35e77cef67f9cb4b040bfa538a2764c0263e/polars-1.39.3.tar.gz", hash = "sha256:2e016c7f3e8d14fa777ef86fe0477cec6c67023a20ba4c94d6e8431eefe4a63c", size = 728987, upload-time = "2026-03-20T11:16:24.836Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/db/08f4ca10c5018813e7e0b59e4472302328b3d2ab1512f5a2157a814540e0/polars-1.39.3-py3-none-any.whl", hash = "sha256:c2b955ccc0a08a2bc9259785decf3d5c007b489b523bf2390cf21cec2bb82a56", size = 823985, upload-time = "2026-03-20T11:14:23.619Z" }, -] - -[[package]] -name = "polars-runtime-32" -version = "1.39.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/17/39/c8688696bc22b6c501e3b82ef3be10e543c07a785af5660f30997cd22dd2/polars_runtime_32-1.39.3.tar.gz", hash = "sha256:c728e4f469cafab501947585f36311b8fb222d3e934c6209e83791e0df20b29d", size = 2872335, upload-time = "2026-03-20T11:16:26.581Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/74/1b41205f7368c9375ab1dea91178eaa20435fe3eff036390a53a7660b416/polars_runtime_32-1.39.3-cp310-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:425c0b220b573fa097b4042edff73114cc6d23432a21dfd2dc41adf329d7d2e9", size = 45273243, upload-time = "2026-03-20T11:14:26.691Z" }, - { url = "https://files.pythonhosted.org/packages/90/bf/297716b3095fe719be20fcf7af1d2b6ab069c38199bbace2469608a69b3a/polars_runtime_32-1.39.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ef5884711e3c617d7dc93519a7d038e242f5741cfe5fe9afd32d58845d86c562", size = 40842924, upload-time = "2026-03-20T11:14:31.154Z" }, - { url = "https://files.pythonhosted.org/packages/3d/3e/e65236d9d0d9babfa0ecba593413c06530fca60a8feb8f66243aa5dba92e/polars_runtime_32-1.39.3-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06b47f535eb1f97a9a1e5b0053ef50db3a4276e241178e37bbb1a38b1fa53b14", size = 43220650, upload-time = "2026-03-20T11:14:35.458Z" }, - { url = "https://files.pythonhosted.org/packages/b0/15/fc3e43f3fdf3f20b7dfb5abe871ab6162cf8fb4aeabf4cfad822d5dc4c79/polars_runtime_32-1.39.3-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc9e13dc1d2e828331f2fe8ccbc9757554dc4933a8d3e85e906b988178f95ed", size = 46877498, upload-time = "2026-03-20T11:14:40.14Z" }, - { url = "https://files.pythonhosted.org/packages/3c/81/bd5f895919e32c6ab0a7786cd0c0ca961cb03152c47c3645808b54383f31/polars_runtime_32-1.39.3-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:363d49e3a3e638fc943e2b9887940300a7d06789930855a178a4727949259dc2", size = 43380176, upload-time = "2026-03-20T11:14:45.566Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3e/c86433c3b5ec0315bdfc7640d0c15d41f1216c0103a0eab9a9b5147d6c4c/polars_runtime_32-1.39.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7c206bdcc7bc62ea038d6adea8e44b02f0e675e0191a54c810703b4895208ea4", size = 46485933, upload-time = "2026-03-20T11:14:51.155Z" }, - { url = "https://files.pythonhosted.org/packages/54/ce/200b310cf91f98e652eb6ea09fdb3a9718aa0293ebf113dce325797c8572/polars_runtime_32-1.39.3-cp310-abi3-win_amd64.whl", hash = 
"sha256:d66ca522517554a883446957539c40dc7b75eb0c2220357fb28bc8940d305339", size = 46995458, upload-time = "2026-03-20T11:14:56.074Z" }, - { url = "https://files.pythonhosted.org/packages/da/76/2d48927e0aa2abbdde08cbf4a2536883b73277d47fbeca95e952de86df34/polars_runtime_32-1.39.3-cp310-abi3-win_arm64.whl", hash = "sha256:f49f51461de63f13e5dd4eb080421c8f23f856945f3f8bd5b2b1f59da52c2860", size = 41857648, upload-time = "2026-03-20T11:15:01.142Z" }, -] - -[[package]] -name = "policyengine-core" -version = "3.26.11" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dpath" }, - { name = "h5py" }, - { name = "huggingface-hub" }, - { name = "ipython" }, - { name = "microdf-python" }, - { name = "numexpr" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "plotly" }, - { name = "psutil" }, - { name = "pytest" }, - { name = "pyvis" }, - { name = "requests" }, - { name = "sortedcontainers" }, - { name = "standard-imghdr" }, - { name = "wheel" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a8/7f/90a055ede0e75cea2ac02b99ce3825dd441f1ee8f2777fd252ccd204aa47/policyengine_core-3.26.11.tar.gz", hash = "sha256:2331d3879e484f6503ea9c11f1f185a873d31a4a5269dbc4250dcc9fb2edb40a", size = 480237, upload-time = "2026-05-21T18:00:03.649Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/0a/a99bdb1deccdbe48050ce2d4dff8bdc4e41992e923156007710087edf103/policyengine_core-3.26.11-py3-none-any.whl", hash = "sha256:17fb1b107f1e74792a8bfebe8ad0f2620a1cbdbabf8f358f25ac5f92738b8986", size = 235649, upload-time = "2026-05-21T18:00:01.716Z" }, -] - -[[package]] -name = "policyengine-us" -version = "1.715.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "microdf-python" }, - { name = "pandas" }, - { name = "policyengine-core" }, - { name = "spm-calculator" }, - { name = "tables" }, - { name = "tqdm" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/a7/ef/d87bb056084897932e083b0412976a386d29062834b0e697afa044642a75/policyengine_us-1.715.2.tar.gz", hash = "sha256:b3990ae9b7c694d2cbf497e6256850aca7be5a5a73ac98330682aba9edd61b61", size = 10014025, upload-time = "2026-05-29T02:48:39.527Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/45/a1/1d56bdbb69d7ce06bedd3892203a75ac3350a90c0b5fcea2fb50db46670f/policyengine_us-1.715.2-py3-none-any.whl", hash = "sha256:abf079828419762f5c4b0291a70f6e424744200f237e1ae0f06e25f10130c399", size = 11035379, upload-time = "2026-05-29T02:48:35.193Z" }, -] - -[[package]] -name = "prdc" -version = "0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "scikit-learn" }, - { name = "scipy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/16/3f/85c603c872ca28c870f1bd54bbe7020f5921efc1c04a9db32b75cf0c287c/prdc-0.2.tar.gz", hash = "sha256:247466c31743f334a2714dbd60ef62e523877c4162ddb7dc63a404cada09316f", size = 5253, upload-time = "2020-02-25T04:54:58.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/53/9b/e4731da221e9d502fb4e7531787d9b24d1791ff86a0d207dd2505ff485fc/prdc-0.2-py3-none-any.whl", hash = "sha256:570ae82fb57a0b0ea3e6a131354a61e23aca79716b77cb9917f3a98465b72120", size = 5956, upload-time = "2020-02-25T04:54:56.835Z" }, -] - -[[package]] -name = "prompt-toolkit" -version = "3.0.52" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "wcwidth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, -] - -[[package]] -name = "psutil" -version = "6.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1f/5a/07871137bb752428aa4b659f910b399ba6f291156bdea939be3e96cae7cb/psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5", size = 508502, upload-time = "2024-12-19T18:21:20.568Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/61/99/ca79d302be46f7bdd8321089762dd4476ee725fce16fc2b2e1dbba8cac17/psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8", size = 247511, upload-time = "2024-12-19T18:21:45.163Z" }, - { url = "https://files.pythonhosted.org/packages/0b/6b/73dbde0dd38f3782905d4587049b9be64d76671042fdcaf60e2430c6796d/psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377", size = 248985, upload-time = "2024-12-19T18:21:49.254Z" }, - { url = "https://files.pythonhosted.org/packages/17/38/c319d31a1d3f88c5b79c68b3116c129e5133f1822157dd6da34043e32ed6/psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003", size = 284488, upload-time = "2024-12-19T18:21:51.638Z" }, - { url = "https://files.pythonhosted.org/packages/9c/39/0f88a830a1c8a3aba27fededc642da37613c57cbff143412e3536f89784f/psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160", size = 
287477, upload-time = "2024-12-19T18:21:55.306Z" }, - { url = "https://files.pythonhosted.org/packages/47/da/99f4345d4ddf2845cb5b5bd0d93d554e84542d116934fde07a0c50bd4e9f/psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3", size = 289017, upload-time = "2024-12-19T18:21:57.875Z" }, - { url = "https://files.pythonhosted.org/packages/38/53/bd755c2896f4461fd4f36fa6a6dcb66a88a9e4b9fd4e5b66a77cf9d4a584/psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53", size = 250602, upload-time = "2024-12-19T18:22:08.808Z" }, - { url = "https://files.pythonhosted.org/packages/7b/d7/7831438e6c3ebbfa6e01a927127a6cb42ad3ab844247f3c5b96bea25d73d/psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649", size = 254444, upload-time = "2024-12-19T18:22:11.335Z" }, -] - -[[package]] -name = "ptyprocess" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, -] - -[[package]] -name = "pure-eval" -version = "0.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = 
"sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, -] - -[[package]] -name = "py-cpuinfo" -version = "9.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" }, -] - -[[package]] -name = "pyarrow" -version = "23.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, - { url = 
"https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, - { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, - { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, - { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, - { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, - { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, - { url = 
"https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, - { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, - { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, - { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, - { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, - { url = 
"https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, - { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, - { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, - { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, - { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, - { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, - { url = 
"https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, - { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, - { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, - { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, - { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, - { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, - { url = 
"https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, - { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, - { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, -] - -[[package]] -name = "pybtex" -version = "0.26.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "latexcodec" }, - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4d/f5/f30da9c93f0fa6d619332b2f69597219b625f35780473a05164a9981fd9a/pybtex-0.26.1.tar.gz", hash = "sha256:2e5543bea424e60e9e42eef70bff597be48649d8f68ba061a7a092b2477d5464", size = 692991, upload-time = "2026-04-03T13:05:39.014Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/44/f6/775eb92e865b28cdb4ad1f2bed7a5446197516f76b58a950faa3be3fd08d/pybtex-0.26.1-py3-none-any.whl", hash = "sha256:e26c0412cc54f5f21b2a6d9d175762a2d2af9ccf3a8f651cdb89ec035db77aa1", size = 126134, upload-time = "2026-04-03T13:05:40.623Z" }, -] - -[[package]] -name = "pybtex-docutils" -version = "1.0.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "docutils" }, - { name = "pybtex" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/7e/84/796ea94d26188a853660f81bded39f8de4cfe595130aef0dea1088705a11/pybtex-docutils-1.0.3.tar.gz", hash = "sha256:3a7ebdf92b593e00e8c1c538aa9a20bca5d92d84231124715acc964d51d93c6b", size = 18348, upload-time = "2023-08-22T18:47:54.833Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/b1/ce1f4596211efb5410e178a803f08e59b20bedb66837dcf41e21c54f9ec1/pybtex_docutils-1.0.3-py3-none-any.whl", hash = "sha256:8fd290d2ae48e32fcb54d86b0efb8d573198653c7e2447d5bec5847095f430b9", size = 6385, upload-time = "2023-08-22T06:43:20.513Z" }, -] - -[[package]] -name = "pycparser" -version = "3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, -] - -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", 
hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, - { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, - { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, - { url = 
"https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, - { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, - { url = 
"https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, - { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, - { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, - { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, - { url = 
"https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, - { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, - { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, - { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, - { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, 
upload-time = "2025-11-04T13:41:12.379Z" }, - { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, - { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, - { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, - { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, - { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, - { url = 
"https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, - { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, - { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, - { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, - { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, - { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, - 
{ url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, - { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, - { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, - { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, - { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, - { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, -] - -[[package]] -name = "pydata-sphinx-theme" -version = "0.15.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accessible-pygments" }, - { name = "babel" }, - { name = "beautifulsoup4" }, - { name = "docutils" }, - { name = "packaging" }, - { name = "pygments" }, - { name = "sphinx" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/67/ea/3ab478cccacc2e8ef69892c42c44ae547bae089f356c4b47caf61730958d/pydata_sphinx_theme-0.15.4.tar.gz", hash = "sha256:7762ec0ac59df3acecf49fd2f889e1b4565dbce8b88b2e29ee06fdd90645a06d", size = 2400673, upload-time = "2024-06-25T19:28:45.041Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/d3/c622950d87a2ffd1654208733b5bd1c5645930014abed8f4c0d74863988b/pydata_sphinx_theme-0.15.4-py3-none-any.whl", hash = "sha256:2136ad0e9500d0949f96167e63f3e298620040aea8f9c74621959eda5d4cf8e6", size = 4640157, upload-time = "2024-06-25T19:28:42.383Z" }, -] - -[[package]] -name = "pygments" -version = "2.19.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } -wheels 
= [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, -] - -[[package]] -name = "pytest" -version = "8.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, -] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, -] - -[[package]] -name = "pyvis" 
-version = "0.3.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ipython" }, - { name = "jinja2" }, - { name = "jsonpickle" }, - { name = "networkx" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/4b/e37e4e5d5ee1179694917b445768bdbfb084f5a59ecd38089d3413d4c70f/pyvis-0.3.2-py3-none-any.whl", hash = "sha256:5720c4ca8161dc5d9ab352015723abb7a8bb8fb443edeb07f7a322db34a97555", size = 756038, upload-time = "2023-02-24T20:29:46.758Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, - { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, - { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, - { url = 
"https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, - { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, - { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, - { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, - { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, - { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, - { url = 
"https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, - { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, - { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, - { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, - { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, - { url = 
"https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, - { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, - { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, - { url = 
"https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, - { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, - { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, - { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, - { url = 
"https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, -] - -[[package]] -name = "pyzmq" -version = "27.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi", marker = "implementation_name == 'pypy'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/04/0b/3c9baedbdf613ecaa7aa07027780b8867f57b6293b6ee50de316c9f3222b/pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540", size = 281750, upload-time = "2025-09-08T23:10:18.157Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/92/e7/038aab64a946d535901103da16b953c8c9cc9c961dadcbf3609ed6428d23/pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc", size = 1306279, upload-time = "2025-09-08T23:08:03.807Z" }, - { url = "https://files.pythonhosted.org/packages/e8/5e/c3c49fdd0f535ef45eefcc16934648e9e59dace4a37ee88fc53f6cd8e641/pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113", size = 895645, upload-time = "2025-09-08T23:08:05.301Z" }, - { url = "https://files.pythonhosted.org/packages/f8/e5/b0b2504cb4e903a74dcf1ebae157f9e20ebb6ea76095f6cfffea28c42ecd/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233", size = 652574, upload-time = "2025-09-08T23:08:06.828Z" }, - { url = "https://files.pythonhosted.org/packages/f8/9b/c108cdb55560eaf253f0cbdb61b29971e9fb34d9c3499b0e96e4e60ed8a5/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31", size = 840995, upload-time = "2025-09-08T23:08:08.396Z" }, - { url = "https://files.pythonhosted.org/packages/c2/bb/b79798ca177b9eb0825b4c9998c6af8cd2a7f15a6a1a4272c1d1a21d382f/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28", size = 1642070, upload-time = "2025-09-08T23:08:09.989Z" }, - { url = "https://files.pythonhosted.org/packages/9c/80/2df2e7977c4ede24c79ae39dcef3899bfc5f34d1ca7a5b24f182c9b7a9ca/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856", size = 2021121, upload-time = "2025-09-08T23:08:11.907Z" }, - { url = "https://files.pythonhosted.org/packages/46/bd/2d45ad24f5f5ae7e8d01525eb76786fa7557136555cac7d929880519e33a/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496", size = 1878550, upload-time = "2025-09-08T23:08:13.513Z" }, - { url = "https://files.pythonhosted.org/packages/e6/2f/104c0a3c778d7c2ab8190e9db4f62f0b6957b53c9d87db77c284b69f33ea/pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd", size = 559184, upload-time = "2025-09-08T23:08:15.163Z" }, - { url = "https://files.pythonhosted.org/packages/fc/7f/a21b20d577e4100c6a41795842028235998a643b1ad406a6d4163ea8f53e/pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf", size = 619480, upload-time = "2025-09-08T23:08:17.192Z" }, - { url = "https://files.pythonhosted.org/packages/78/c2/c012beae5f76b72f007a9e91ee9401cb88c51d0f83c6257a03e785c81cc2/pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f", size = 552993, upload-time = "2025-09-08T23:08:18.926Z" }, - { url = 
"https://files.pythonhosted.org/packages/60/cb/84a13459c51da6cec1b7b1dc1a47e6db6da50b77ad7fd9c145842750a011/pyzmq-27.1.0-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5", size = 1122436, upload-time = "2025-09-08T23:08:20.801Z" }, - { url = "https://files.pythonhosted.org/packages/dc/b6/94414759a69a26c3dd674570a81813c46a078767d931a6c70ad29fc585cb/pyzmq-27.1.0-cp313-cp313-android_24_x86_64.whl", hash = "sha256:fbb4f2400bfda24f12f009cba62ad5734148569ff4949b1b6ec3b519444342e6", size = 1156301, upload-time = "2025-09-08T23:08:22.47Z" }, - { url = "https://files.pythonhosted.org/packages/a5/ad/15906493fd40c316377fd8a8f6b1f93104f97a752667763c9b9c1b71d42d/pyzmq-27.1.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:e343d067f7b151cfe4eb3bb796a7752c9d369eed007b91231e817071d2c2fec7", size = 1341197, upload-time = "2025-09-08T23:08:24.286Z" }, - { url = "https://files.pythonhosted.org/packages/14/1d/d343f3ce13db53a54cb8946594e567410b2125394dafcc0268d8dda027e0/pyzmq-27.1.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:08363b2011dec81c354d694bdecaef4770e0ae96b9afea70b3f47b973655cc05", size = 897275, upload-time = "2025-09-08T23:08:26.063Z" }, - { url = "https://files.pythonhosted.org/packages/69/2d/d83dd6d7ca929a2fc67d2c3005415cdf322af7751d773524809f9e585129/pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d54530c8c8b5b8ddb3318f481297441af102517602b569146185fa10b63f4fa9", size = 660469, upload-time = "2025-09-08T23:08:27.623Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cd/9822a7af117f4bc0f1952dbe9ef8358eb50a24928efd5edf54210b850259/pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128", size = 847961, upload-time = "2025-09-08T23:08:29.672Z" }, - { url = 
"https://files.pythonhosted.org/packages/9a/12/f003e824a19ed73be15542f172fd0ec4ad0b60cf37436652c93b9df7c585/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c65047adafe573ff023b3187bb93faa583151627bc9c51fc4fb2c561ed689d39", size = 1650282, upload-time = "2025-09-08T23:08:31.349Z" }, - { url = "https://files.pythonhosted.org/packages/d5/4a/e82d788ed58e9a23995cee70dbc20c9aded3d13a92d30d57ec2291f1e8a3/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:90e6e9441c946a8b0a667356f7078d96411391a3b8f80980315455574177ec97", size = 2024468, upload-time = "2025-09-08T23:08:33.543Z" }, - { url = "https://files.pythonhosted.org/packages/d9/94/2da0a60841f757481e402b34bf4c8bf57fa54a5466b965de791b1e6f747d/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db", size = 1885394, upload-time = "2025-09-08T23:08:35.51Z" }, - { url = "https://files.pythonhosted.org/packages/4f/6f/55c10e2e49ad52d080dc24e37adb215e5b0d64990b57598abc2e3f01725b/pyzmq-27.1.0-cp313-cp313t-win32.whl", hash = "sha256:7ccc0700cfdf7bd487bea8d850ec38f204478681ea02a582a8da8171b7f90a1c", size = 574964, upload-time = "2025-09-08T23:08:37.178Z" }, - { url = "https://files.pythonhosted.org/packages/87/4d/2534970ba63dd7c522d8ca80fb92777f362c0f321900667c615e2067cb29/pyzmq-27.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:8085a9fba668216b9b4323be338ee5437a235fe275b9d1610e422ccc279733e2", size = 641029, upload-time = "2025-09-08T23:08:40.595Z" }, - { url = "https://files.pythonhosted.org/packages/f6/fa/f8aea7a28b0641f31d40dea42d7ef003fded31e184ef47db696bc74cd610/pyzmq-27.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e", size = 561541, upload-time = "2025-09-08T23:08:42.668Z" }, - { url = "https://files.pythonhosted.org/packages/87/45/19efbb3000956e82d0331bafca5d9ac19ea2857722fa2caacefb6042f39d/pyzmq-27.1.0-cp314-cp314t-macosx_10_15_universal2.whl", 
hash = "sha256:ce980af330231615756acd5154f29813d553ea555485ae712c491cd483df6b7a", size = 1341197, upload-time = "2025-09-08T23:08:44.973Z" }, - { url = "https://files.pythonhosted.org/packages/48/43/d72ccdbf0d73d1343936296665826350cb1e825f92f2db9db3e61c2162a2/pyzmq-27.1.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1779be8c549e54a1c38f805e56d2a2e5c009d26de10921d7d51cfd1c8d4632ea", size = 897175, upload-time = "2025-09-08T23:08:46.601Z" }, - { url = "https://files.pythonhosted.org/packages/2f/2e/a483f73a10b65a9ef0161e817321d39a770b2acf8bcf3004a28d90d14a94/pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7200bb0f03345515df50d99d3db206a0a6bee1955fbb8c453c76f5bf0e08fb96", size = 660427, upload-time = "2025-09-08T23:08:48.187Z" }, - { url = "https://files.pythonhosted.org/packages/f5/d2/5f36552c2d3e5685abe60dfa56f91169f7a2d99bbaf67c5271022ab40863/pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d", size = 847929, upload-time = "2025-09-08T23:08:49.76Z" }, - { url = "https://files.pythonhosted.org/packages/c4/2a/404b331f2b7bf3198e9945f75c4c521f0c6a3a23b51f7a4a401b94a13833/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:80d834abee71f65253c91540445d37c4c561e293ba6e741b992f20a105d69146", size = 1650193, upload-time = "2025-09-08T23:08:51.7Z" }, - { url = "https://files.pythonhosted.org/packages/1c/0b/f4107e33f62a5acf60e3ded67ed33d79b4ce18de432625ce2fc5093d6388/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:544b4e3b7198dde4a62b8ff6685e9802a9a1ebf47e77478a5eb88eca2a82f2fd", size = 2024388, upload-time = "2025-09-08T23:08:53.393Z" }, - { url = "https://files.pythonhosted.org/packages/0d/01/add31fe76512642fd6e40e3a3bd21f4b47e242c8ba33efb6809e37076d9b/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = 
"sha256:cedc4c68178e59a4046f97eca31b148ddcf51e88677de1ef4e78cf06c5376c9a", size = 1885316, upload-time = "2025-09-08T23:08:55.702Z" }, - { url = "https://files.pythonhosted.org/packages/c4/59/a5f38970f9bf07cee96128de79590bb354917914a9be11272cfc7ff26af0/pyzmq-27.1.0-cp314-cp314t-win32.whl", hash = "sha256:1f0b2a577fd770aa6f053211a55d1c47901f4d537389a034c690291485e5fe92", size = 587472, upload-time = "2025-09-08T23:08:58.18Z" }, - { url = "https://files.pythonhosted.org/packages/70/d8/78b1bad170f93fcf5e3536e70e8fadac55030002275c9a29e8f5719185de/pyzmq-27.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:19c9468ae0437f8074af379e986c5d3d7d7bfe033506af442e8c879732bedbe0", size = 661401, upload-time = "2025-09-08T23:08:59.802Z" }, - { url = "https://files.pythonhosted.org/packages/81/d6/4bfbb40c9a0b42fc53c7cf442f6385db70b40f74a783130c5d0a5aa62228/pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7", size = 575170, upload-time = "2025-09-08T23:09:01.418Z" }, -] - -[[package]] -name = "quantile-forest" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "scikit-learn" }, - { name = "scipy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/62/6e/3f1493d4abcce71fdc82ed575475d3e02da7b03375129e84be2622e1532f/quantile_forest-1.4.1.tar.gz", hash = "sha256:713a23c69562b7551ba4a05c22ce9d0e90db6a73d043e760b29c331cb19dc552", size = 486249, upload-time = "2025-09-10T12:48:04.578Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/61/f8ff4e348dc2d265ea97287f921b92bca265229c48be64b94756ecff4078/quantile_forest-1.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:37c2da2ab54aceacdf5292065147f40a073b13cc3844262f0f3cbd5b8a8d928e", size = 955098, upload-time = "2025-09-10T12:47:52.137Z" }, - { url = 
"https://files.pythonhosted.org/packages/4f/95/75f3eea1c7cc3786c1ffdf4685e79c4979a4ae6ccedfed80362c9162f0d4/quantile_forest-1.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f0436ac7622442c2995cf121e0960332e769791f3f3c7ea62363e8480803bb3", size = 718470, upload-time = "2025-09-10T12:47:53.566Z" }, - { url = "https://files.pythonhosted.org/packages/fe/f1/0f26386bf164ede156099d18e3e4493dd21dc48e329e1be68232e5cf8b52/quantile_forest-1.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a594bd3552507beffa6ca6002143601be5defd5cc7329154f41317110f895f7a", size = 709245, upload-time = "2025-09-10T12:47:54.54Z" }, - { url = "https://files.pythonhosted.org/packages/4f/cd/6501c8c200f34a87e1e94d7ea4f1a9dc842154fbfaa0fe65f072817fbc41/quantile_forest-1.4.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:697c48faf52a04e7e47f97187650d16cecc9c971fe2f83d56854b4a454289f60", size = 2403543, upload-time = "2025-09-10T12:47:55.956Z" }, - { url = "https://files.pythonhosted.org/packages/f2/be/f77c6705e974b23353c43da1cd93e11fe0afc7e859c2d14f748d25cc0376/quantile_forest-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe33f6a8b63b3617568cc1254e1802a70ce3ac23897790f3be10f8db5257fe83", size = 685417, upload-time = "2025-09-10T12:47:57.346Z" }, -] - -[[package]] -name = "referencing" -version = "0.37.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "rpds-py" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", 
size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, -] - -[[package]] -name = "requests" -version = "2.33.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "charset-normalizer" }, - { name = "idna" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/56/5d/c814546c2333ceea4ba42262d8c4d55763003e767fa169adc693bd524478/requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b", size = 65017, upload-time = "2026-03-25T15:10:40.382Z" }, -] - -[[package]] -name = "rich" -version = "14.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, -] - -[[package]] -name = "rpds-py" -version = "2026.5.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2e/43/25a8dcd3feedd735039a8f0b5b7e3b118232b5eae288c4fd9ab200d41094/rpds_py-2026.5.1.tar.gz", hash = "sha256:07b24fea40541e28570e5b795a4a38fbdcd12550c06bd0748005ecc8116ca256", 
size = 64459, upload-time = "2026-05-28T12:02:13.232Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/32/14c961ad295f490eb0849ada8b79683e93a59b9de3afdd983eaf55fa6867/rpds_py-2026.5.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:efef4ac29c6ff495531eb17ee705b62841ecaa291b7c7077e848ea03e237164d", size = 352787, upload-time = "2026-05-28T11:59:33.655Z" }, - { url = "https://files.pythonhosted.org/packages/ca/bb/d1b85117967c11191441a7274ae616c65d93901d082c588f89a50a8da5ae/rpds_py-2026.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c39f5b67a8a2e67179ada2a954227d670fe65fa9098457f698f56ddf248709b3", size = 345179, upload-time = "2026-05-28T11:59:35Z" }, - { url = "https://files.pythonhosted.org/packages/7c/46/d84105f062e626a1b233f863907288a4708c2d833b8b4c6fb2764bc080c0/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5c30f3f04eef4fbd362226a6f31d7c8895ca4fbb6e0b790f6890a98d8da8559", size = 376173, upload-time = "2026-05-28T11:59:36.43Z" }, - { url = "https://files.pythonhosted.org/packages/e2/ae/469d7959ce5b1201e1de135dc735b86db3b35dd0d1734f6a44246d5f061c/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:277f6c82f0580848796c7ecc8a7173aa3bfb928e4ff831261c2f60a81dc270db", size = 383162, upload-time = "2026-05-28T11:59:37.995Z" }, - { url = "https://files.pythonhosted.org/packages/dc/a2/57853d31a1116a561aa072794602ad3f6341e18d70a8523f1bd5b9fc1e5a/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63c2c4c213f1a4e3f3de28ecab029dbdee976324e729c0d7a55211be72576b02", size = 495093, upload-time = "2026-05-28T11:59:39.453Z" }, - { url = "https://files.pythonhosted.org/packages/99/63/3a8eabcad9314b7daf5c65f451d2c33d989235cd8a5762186cf2c3f5a4f8/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3350ec808fb538fe71a1f94dfaa0e29c598dfad805ce49f0caec5ae3183c652b", size = 389829, 
upload-time = "2026-05-28T11:59:40.896Z" }, - { url = "https://files.pythonhosted.org/packages/4b/25/05678d97fc25e2622df14dc530fb82023174ecfff6733991ed0d78f167bd/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b964e3ab599e718dc46c018d104b1ebc007cbc6567d827c94a687fca56d77e", size = 374786, upload-time = "2026-05-28T11:59:42.626Z" }, - { url = "https://files.pythonhosted.org/packages/88/d1/8c90b6431e80a3b91b284a5c7c8c0c4f9c006444d90477a740d6e0f9c694/rpds_py-2026.5.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:19cb09fab7b7fc96b2a6e28f2e34b72a3705ff27b37edb77455316e5d3f3dc9b", size = 386920, upload-time = "2026-05-28T11:59:44.124Z" }, - { url = "https://files.pythonhosted.org/packages/ff/99/4638f672ab356682d633ee0da9255f5b67ce6efd0b85eb94ad3e255e65a5/rpds_py-2026.5.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abe76bcdba31e576cb83eeb8797aa0d882b738fef6dc65d0601fc753806a5b46", size = 405059, upload-time = "2026-05-28T11:59:47.177Z" }, - { url = "https://files.pythonhosted.org/packages/66/3f/3546524b6eb4cc2e1f363a3d638fa52f6c24faae3500c25fb488b02f1740/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8bff7073db3899158fff55ebf57b113a67030af26f80a18978f9f0aa60250ddf", size = 553030, upload-time = "2026-05-28T11:59:48.603Z" }, - { url = "https://files.pythonhosted.org/packages/c6/c3/7b3388c796fcf471bd17194242d4dc1a7608567c0fa422bcc1c5e79f9c1e/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8ba264fa49be666cd9cc56bf34ec7002fb3d27a4aee5bcb4d43d0d18feb1bb6f", size = 618975, upload-time = "2026-05-28T11:59:50.314Z" }, - { url = "https://files.pythonhosted.org/packages/61/1e/a3cb07f2795075d1d88efddae2f541359fde5f08c81ee114c29c2949c90a/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4860b603ddda0475a8885499b3729e90229d480105b42651962a5397d995fa89", size = 581178, upload-time = "2026-05-28T11:59:51.673Z" }, - { url = 
"https://files.pythonhosted.org/packages/a1/74/e758c03a5ef46f04c37f2651a2893db846d569ba8a7bca469d4b58939bcd/rpds_py-2026.5.1-cp313-cp313-win32.whl", hash = "sha256:7944270ae71383f6e2657dd7d5ce4eeb4ac2d0059a6738f0510583d462ab4842", size = 212481, upload-time = "2026-05-28T11:59:53.148Z" }, - { url = "https://files.pythonhosted.org/packages/70/ec/a2aca432db9c7359b40fa393eeeaa0d166c2f70175be956e75fa24197c44/rpds_py-2026.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:88647f43a73c4e01be19b04ceef0c8d3a1958153604d13c773becd8016f2a0cf", size = 228519, upload-time = "2026-05-28T11:59:54.505Z" }, - { url = "https://files.pythonhosted.org/packages/29/60/a73bfdd45b096574556acf303bbd9fa9eed36ca8a818b514e2a5d5fe2b9d/rpds_py-2026.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:453895624ecf7db7063b1004e44037522bbaef9ff6a945e59bc71662d7a03abd", size = 223446, upload-time = "2026-05-28T11:59:56.081Z" }, - { url = "https://files.pythonhosted.org/packages/18/e2/408105fd611823f00882aea810f3989a30d26b1bab8b6beb20f98c724e0e/rpds_py-2026.5.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:b4e4bc98639ec915f512fde3aa7a95e0041d95d9c3cc86eea841fa63cb1e8600", size = 355287, upload-time = "2026-05-28T11:59:57.448Z" }, - { url = "https://files.pythonhosted.org/packages/8d/58/5c4a43436843c90d0f6d19f82c200c80e3843ca9fa07b237623327f6d384/rpds_py-2026.5.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cacedb7a6e167680acba45ad5716e89067d225dc80da0d7040cae8c81d4572fa", size = 347033, upload-time = "2026-05-28T11:59:58.881Z" }, - { url = "https://files.pythonhosted.org/packages/fb/c2/1a71acdacaf4e259b10278fb87b039ded3cf80041bcd89dd8a3ea702ded6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68700371c5d7ae1412862ddfa719090925c93ecf351c566d66f09d04b136ea00", size = 376891, upload-time = "2026-05-28T12:00:00.516Z" }, - { url = 
"https://files.pythonhosted.org/packages/c2/c8/535f3d9b65addd8e28aa87b83c6e526799c3717a88273db8ea795beeef7a/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:296c799becfa849c779c8725494fe9ed94959ed886787df4364b058465bad7f0", size = 385646, upload-time = "2026-05-28T12:00:02.394Z" }, - { url = "https://files.pythonhosted.org/packages/1c/91/dc033f313345c354ade914dbe73cdb90b615a4409ea02430d5356794f3d8/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3858b908218ee108d0bbfb2095ccc237648053c9bf98affad7cb079acaf1d97", size = 498830, upload-time = "2026-05-28T12:00:04.189Z" }, - { url = "https://files.pythonhosted.org/packages/27/fc/90fcbea459dbb8ddc18a2e0fd1de9412b48bc84ffff2db771cf714bacfd6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4fb8d2e7cb2f850b169806d61d1b991738acec96500a75c30f49caf064ce7cef", size = 392830, upload-time = "2026-05-28T12:00:05.797Z" }, - { url = "https://files.pythonhosted.org/packages/b2/1d/46cd11a228c9750684a798d98f878be6f614aa762438da7378f035e79e35/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b74c10ed6a8f190f4287f53bcfea348b92a84a9c9f70d30183d1e6172d580d", size = 379613, upload-time = "2026-05-28T12:00:07.433Z" }, - { url = "https://files.pythonhosted.org/packages/24/4a/d9b0c6af3a1de03eb93741bbe8be2bdce84d8fda8224f3005451d86df389/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b9a6528956191c48c52294a592dbd4a8386d7048bdb25c0efcb6b966466c6d83", size = 388183, upload-time = "2026-05-28T12:00:09.227Z" }, - { url = "https://files.pythonhosted.org/packages/c5/b4/db7aaabdda6d020afc87d981bcc2f57a434c7dec60ecfc2ab3dd50b20351/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af03e34e860047bc7a352b842856fcf78798fbb81132cc98bd2f907ab4eb9cd2", size = 408578, upload-time = "2026-05-28T12:00:10.779Z" }, - { url = 
"https://files.pythonhosted.org/packages/08/d6/070f6a41cbb343e2ac4171859bf3f3623e0ab002f72619d6d505313ec2de/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fea6e836d10abbe191d557d33bd58bd5987725fe63aa1eefe557d230209855bd", size = 553573, upload-time = "2026-05-28T12:00:12.443Z" }, - { url = "https://files.pythonhosted.org/packages/75/ab/1a71ea3589c4345dac0a0518f0e6a031cb42689277851b683c46d27463a5/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:fc0c0f878ea770a0a8a462456c5ad36fc9fe6358e6b76fdadc7f17575e0b8bf1", size = 620861, upload-time = "2026-05-28T12:00:14.09Z" }, - { url = "https://files.pythonhosted.org/packages/8a/22/9bf80a56069c0c443fcfefac639a86a744550a2898817a6dfd3e26654924/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e0b360f316d966b048b085857630b3cc51f3db2f07b06f440eac8f695374d1e3", size = 585633, upload-time = "2026-05-28T12:00:15.66Z" }, - { url = "https://files.pythonhosted.org/packages/da/68/3b2c0a75c9e04125696f84ebdbbf304acf5a40b58ba4481cdb98a922c3ba/rpds_py-2026.5.1-cp313-cp313t-win32.whl", hash = "sha256:a2999883eedf72fdfb7520b92c7d4ec2572a71ff40239377aa604cc529eecafc", size = 210074, upload-time = "2026-05-28T12:00:17.291Z" }, - { url = "https://files.pythonhosted.org/packages/e7/8b/609157d5a25d37d4f29f92840ba531f416907c34ae5c5739dd21fc2bef98/rpds_py-2026.5.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e07be2a9d7122bd6e82dea89814ef8dc893feb1aae97fec1630f3263bbb30e55", size = 228635, upload-time = "2026-05-28T12:00:18.73Z" }, - { url = "https://files.pythonhosted.org/packages/d4/6f/19c1918a4b590d8de87e712e4abe4b3875771eff60216fb6153cf6665c68/rpds_py-2026.5.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:1f2c391c3059798093b65df23aca2cac150460ae9c630d99dec83d703d9485b9", size = 349756, upload-time = "2026-05-28T12:00:20.217Z" }, - { url = 
"https://files.pythonhosted.org/packages/e5/60/a06fe7da34eca79dacbf958a2ba0c6eea85bc2b29de20080bf40f72f66fa/rpds_py-2026.5.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:413b424f7c4ee65ab5e5be91f5731be0f8b41a1ee2b12dfe810d716312e95a78", size = 343831, upload-time = "2026-05-28T12:00:21.711Z" }, - { url = "https://files.pythonhosted.org/packages/bf/ec/b2333b97b90e2a6ef6ca8ad386ee284968e74bcfe113b3f1a8d9036429a9/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c595a1d9255dce0599e13130d1440ab2506654f2b50294226ee06402f8fef63", size = 375127, upload-time = "2026-05-28T12:00:23.326Z" }, - { url = "https://files.pythonhosted.org/packages/14/7f/e00aae54067f2b488c4637961d5f58204d470795fc791085fa3f15060d2e/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c27c5f6102eac8c03e7595a00827a53b271ba40a53b59ff8709170e0855ea4a", size = 379034, upload-time = "2026-05-28T12:00:24.89Z" }, - { url = "https://files.pythonhosted.org/packages/be/cc/423999bbb8ae8dc93c77fc1d5e984ade5eb89d237d3bb884ccfa72ae2890/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c7fcf61d44cacecaf3aea542b0e053db77972a4573e7ceda16fb2b399161195", size = 490823, upload-time = "2026-05-28T12:00:26.676Z" }, - { url = "https://files.pythonhosted.org/packages/0f/aa/c671bf660f12e68d3c52ff86c7066ed1372df5a0f4f2ff584e419b8207e7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c817a189d4ee14290420e5ff051e4dd6baa13f3edf84685071dee07a6d538ee", size = 388144, upload-time = "2026-05-28T12:00:28.577Z" }, - { url = "https://files.pythonhosted.org/packages/19/c8/d63bb75b68afe77b229e3021c6031bcaf01da5db5b0e69d0d10f9ba679a7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21846aac0ed2e0589f38c12dc44e77bb64e494b771eadbcf169cba00566ba7ba", size = 371959, upload-time = "2026-05-28T12:00:30.304Z" }, - { url = 
"https://files.pythonhosted.org/packages/82/35/c51122014d8274ff37dc606d60049c3db7d83da02b5b282511e5a906a9a6/rpds_py-2026.5.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b317c87a13f769a4e787819bd508aaa5d69aa09b0880de9af6d3a8a54571cdec", size = 383558, upload-time = "2026-05-28T12:00:31.764Z" }, - { url = "https://files.pythonhosted.org/packages/e3/f9/2790cb99c136a5363acdeacf5c27c56f3de0d4118a1f48fca83404c99c89/rpds_py-2026.5.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce87129d9f2c14fa6c4a8601fb80eb4488c80d38a20cd13758ef11123e14995d", size = 402789, upload-time = "2026-05-28T12:00:33.247Z" }, - { url = "https://files.pythonhosted.org/packages/e5/1b/e4fb584f8c75d35c38150ff6a332cda949e6f97acba1f4fd123b14ab56fe/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9cdddb6c1207d284d94fd1530adf57fbd797fe7c4b8704ba85f49414f2557e7d", size = 551405, upload-time = "2026-05-28T12:00:34.819Z" }, - { url = "https://files.pythonhosted.org/packages/d8/f7/a6731b4216cb3793ea1af5391da240f5683dacc0d13e034fe5fc3503f240/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:4e237e139f94d3c036fd28eb9f564c99055476ff4ff05cd42be55ce349b5aa02", size = 616975, upload-time = "2026-05-28T12:00:36.268Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ea/2e051a81d95d8e63f4b35a1c463a87e8766bc3d083c067c5dfb6bf220747/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ed0954b524873214369184a9c82b0eaa45a3fbb9a798cd95b17e0d98499e7ea0", size = 578701, upload-time = "2026-05-28T12:00:37.82Z" }, - { url = "https://files.pythonhosted.org/packages/65/56/b5f6fdb2083e32bca8a8993d89e70db114b4756c9e2c38421328126689d2/rpds_py-2026.5.1-cp314-cp314-win32.whl", hash = "sha256:2d88621d6a7d4dfa633d21abe90f280bb205274e16b1d1e61c6ad4640b2453b7", size = 209806, upload-time = "2026-05-28T12:00:39.492Z" }, - { url = 
"https://files.pythonhosted.org/packages/fb/80/65a5aa96c155e611d1ed844e4e1f57f3e36b021f396d9f8585d756e6b90d/rpds_py-2026.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:cef8ac28d26f4dda3533060c20fbf80a325458fa9fd23ea72a73cdfa8e978838", size = 225985, upload-time = "2026-05-28T12:00:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/27/7c/ad185212e87b05f196daef92bc5f3caf07298eb47c295b5585c3dd3093ac/rpds_py-2026.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:eaaea962c68cdc68d4a533ba985ab8e9484277910bbfaa2ab3ef7732667bfed8", size = 221219, upload-time = "2026-05-28T12:00:43.15Z" }, - { url = "https://files.pythonhosted.org/packages/23/58/e14ae18759020334646b031e708ab4158d653a938822bfb7b95ef2e93aa3/rpds_py-2026.5.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:21942f52dbbd5f8758bf021213d28bd45c39e873e65e2407faf5f1846f5761ad", size = 352148, upload-time = "2026-05-28T12:00:44.638Z" }, - { url = "https://files.pythonhosted.org/packages/31/9b/5f4a1e2f960bca3ac5d052b139dd31eed97b259f9d909173821760d542e8/rpds_py-2026.5.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f414556f6e3958300ff941e40c9f97e3dc9774ddd1b3434c475d73dd354bbed3", size = 345196, upload-time = "2026-05-28T12:00:46.14Z" }, - { url = "https://files.pythonhosted.org/packages/1a/71/1d9574d6a2fa20ab60eaa55c7467f5aa20cbc770f341a05f09c0876f59e2/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef1013a8625c74043210190b246f5b1551e09757c1f356c6e4160ef96c5bc081", size = 374981, upload-time = "2026-05-28T12:00:47.531Z" }, - { url = "https://files.pythonhosted.org/packages/0c/9a/37e99f4915a80aa71670263c1267f7ae0af95f53a3f61e6c3bdc016d4515/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cc68e231a77a5f0d774ae278a1f8e55c0456501820847c1e4efb3829f3441df6", size = 379961, upload-time = "2026-05-28T12:00:49.216Z" }, - { url = 
"https://files.pythonhosted.org/packages/a8/ff/6e73f74b89d2e0715e0fc86b7dde893f9a61ae2f9b256ff3bdfe41ac4e94/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9baffb505aff33acc69b422a19f77806680f3c8632227d79f48de8a810d1c2c5", size = 495965, upload-time = "2026-05-28T12:00:51.111Z" }, - { url = "https://files.pythonhosted.org/packages/ea/e0/425faba25f59d74d4638b267f7c7a80e8649d2ef4db10a19b0c4a71e6e6f/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8d2f912928d426e8cfa396f7f3f8d29a59e6689c86dcca3c420730c1096322b", size = 389526, upload-time = "2026-05-28T12:00:52.77Z" }, - { url = "https://files.pythonhosted.org/packages/c6/76/7a41960e3fddae47fab43a28684d5da981401dffd88253de0944148654cb/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90f628283be835db980c941767d41c9a27b5239e54ba0a9c1335247e82406964", size = 376190, upload-time = "2026-05-28T12:00:54.215Z" }, - { url = "https://files.pythonhosted.org/packages/27/60/5f38dc70824fc6951b51d35377e577a3a3a4c81a6769cc5a2de25ebe0ad1/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:1ebb2f0ab7e16132995a72de805170e0203df0c3dd22e1ef1cd1fdd90bd7a131", size = 383921, upload-time = "2026-05-28T12:00:55.673Z" }, - { url = "https://files.pythonhosted.org/packages/60/1a/d60a38caa1505f4b9483c3fbbde12c94e1079154f4f401a6da96f7e77621/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3df3d16ded76f1f8c9cdebd0e1ea55fdf4c23b812de189814da7cf229c22a81", size = 404766, upload-time = "2026-05-28T12:00:57.518Z" }, - { url = "https://files.pythonhosted.org/packages/87/ff/602fd3f174d6425f0bce05ad0dfbec0e96b38d0f7d08a79af5aa20083885/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9af8905b8f854990e40d5206aa5ac58d9b0fe0b7f351ff2bb086c20f6c8c6a47", size = 551343, upload-time = "2026-05-28T12:00:58.978Z" }, - { url = 
"https://files.pythonhosted.org/packages/b8/c1/1be13327acdbead3eca1fde03b6a34dbb011f1e864e217f0d32cc1779a7f/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:036a36a87fb1cd3b214d11c4b3c4f7d2ddad933625dca1c900b56a057c07740a", size = 618502, upload-time = "2026-05-28T12:01:00.656Z" }, - { url = "https://files.pythonhosted.org/packages/f3/d7/afb49b49d7f2be8b7ba1a9f0977fa5168003437b93086726f066544e8351/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ae3853454fe9ef283a03c96c2d835d39e84b14643a9d62c82ef0fb87d702ca", size = 581916, upload-time = "2026-05-28T12:01:02.22Z" }, - { url = "https://files.pythonhosted.org/packages/25/d1/dbef8c1f8a10f07beb62b5f054e20099fd9924b3ec001b8f0b6ac7813a85/rpds_py-2026.5.1-cp314-cp314t-win32.whl", hash = "sha256:6c3d771a46ec18b12af06ce36243a9a80b07a5d0515236332d90863ca8bb326a", size = 207855, upload-time = "2026-05-28T12:01:03.821Z" }, - { url = "https://files.pythonhosted.org/packages/2a/72/bfa4e61ab8e7dc1c8adf397e05e6cbdd4239357bd72b248d3de662f23915/rpds_py-2026.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c93c629be4636cf54337bd5f06c104d55e42ced54d681f6fe21ae510a65116f6", size = 225422, upload-time = "2026-05-28T12:01:05.194Z" }, - { url = "https://files.pythonhosted.org/packages/27/3a/7b5da92b640f67b6717ccafc83cdd06bfa7ff2395c3685c68922bb54d703/rpds_py-2026.5.1-cp315-cp315-macosx_10_12_x86_64.whl", hash = "sha256:3574b55c604b8f75dacb007136508bbc0db406e626301778096a133327e7f2fb", size = 349576, upload-time = "2026-05-28T12:01:06.722Z" }, - { url = "https://files.pythonhosted.org/packages/d7/8a/2aafd7ad355a1bd48ca76e2262b74b15e6432b5a1efe150efd4d779cd55d/rpds_py-2026.5.1-cp315-cp315-macosx_11_0_arm64.whl", hash = "sha256:94068eb3ae6d43f5a786b7db96a406a34e6d5c24489feef32fd6e8946ea7b291", size = 343640, upload-time = "2026-05-28T12:01:08.441Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/7d/6c9523c1abbe840a1b7fba3c516d48e1d3487cc80fea4366c4071cf56784/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a5b10e8ce894825f380a8f1b6444cf73c294dfea62afbb2d13e3a9e630cec1", size = 375322, upload-time = "2026-05-28T12:01:09.934Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5d/0b7b03fb1dc509321f01de3149784ab773e34c8573022029af8076afcb9c/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fc09f82e63d4bcd58149572f857a431bae851dc747e313c3b5bdf7abb907fda8", size = 379066, upload-time = "2026-05-28T12:01:11.48Z" }, - { url = "https://files.pythonhosted.org/packages/d7/e2/8ef6012999ebf1cb1c22f876d9ce5e63d960fd4631d2af3202d3f480aa25/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e10464d17df3b582745c25cec695cb9558bca2cb6ddb631aee1787fc72c767b2", size = 494586, upload-time = "2026-05-28T12:01:13.051Z" }, - { url = "https://files.pythonhosted.org/packages/80/af/1eeb029bec67582c226b7809172207cd005073af4ebd906e65ff494f4983/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba05adbf15d994c38ec0b7ab32e858e5110c21e9009a00a86545fd220f84e038", size = 388415, upload-time = "2026-05-28T12:01:14.631Z" }, - { url = "https://files.pythonhosted.org/packages/18/23/ffbe10711c4d766c1cab0557d6906c074f795814863c67b351355d29354a/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77c004fdc7b891967106f78ddfd7b076bfe6813c6139c6fff6aed3bcaa960b26", size = 372427, upload-time = "2026-05-28T12:01:16.153Z" }, - { url = "https://files.pythonhosted.org/packages/bd/3a/30ba4a6ad457e5b070c18d742a33fb77d8d922b565cc881f8a5313d63bfe/rpds_py-2026.5.1-cp315-cp315-manylinux_2_31_riscv64.whl", hash = "sha256:83bcf894486c9d78dd290d3c0124ff6dd8875d3025e2090a8ec49fcc37c55fdd", size = 383615, upload-time = "2026-05-28T12:01:17.809Z" }, - { 
url = "https://files.pythonhosted.org/packages/d3/69/62e242b53ce39c0814bd24e1a6e6eba6c92be716277745f317f9540a2e7b/rpds_py-2026.5.1-cp315-cp315-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3df104083952a0e0c6f10de33e440eabe98fb6317d23e1a58c68f6df08d01b9", size = 402786, upload-time = "2026-05-28T12:01:19.419Z" }, - { url = "https://files.pythonhosted.org/packages/38/c1/a770b9c186928a1ed0f7e6d7ae50e7f3950ed23e3f9e366dbc8e38cb55de/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:980450826cf22e133c57e0835070bdd0dd3f73b9b708c3ce223def2cb9469e14", size = 551583, upload-time = "2026-05-28T12:01:21.013Z" }, - { url = "https://files.pythonhosted.org/packages/21/7c/68e8579b95375b70d2a963103c42e705856cdb98569258bd807f4423891c/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_i686.whl", hash = "sha256:205dde846f24332ab0c1188699a043b8d165b79bb84529ce272c45048ff6be01", size = 616941, upload-time = "2026-05-28T12:01:22.548Z" }, - { url = "https://files.pythonhosted.org/packages/70/a1/a6135aed5730ff03ab957182259987ac11e55fb392a28dc6f0592048a280/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:3966b82dd563176396df030f3dd52a6e54cb69b718e95e78bd555ed3d1e0185d", size = 578349, upload-time = "2026-05-28T12:01:24.118Z" }, - { url = "https://files.pythonhosted.org/packages/09/6e/f24201a76a84e6c49d0bdfdfcb735210e21701e9b21c5bfc0ba497dd62f6/rpds_py-2026.5.1-cp315-cp315-win32.whl", hash = "sha256:7818f8d0a415be74d2be3590b0a1c1f463a642f4d0217e7d10602dceef5b79aa", size = 209922, upload-time = "2026-05-28T12:01:25.522Z" }, - { url = "https://files.pythonhosted.org/packages/9e/e4/966bc240bb0485fc265278f6de44d05834bf0b3618886e0b22e33d54c49a/rpds_py-2026.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:b3cc20c0d800af78fd0fac68086e28c1856cec51ea528bb81ea851aa40d39325", size = 226003, upload-time = "2026-05-28T12:01:27.062Z" }, - { url = 
"https://files.pythonhosted.org/packages/5c/5c/a15a59269cd5e74472734516c73795c15eccfc841b3d4b0228c3f53f19d0/rpds_py-2026.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:3609e9939a8a76cd904cf98a3f1f13b5dc7e150adeaee89e0ea09652ea213e16", size = 221245, upload-time = "2026-05-28T12:01:28.51Z" }, - { url = "https://files.pythonhosted.org/packages/e0/22/135ce03804e179a71ceb13be095deda4a279bc88f7a6b8fa161c5ad44e12/rpds_py-2026.5.1-cp315-cp315t-macosx_10_12_x86_64.whl", hash = "sha256:5d333a7127d4b307601ac37792bee01bb95c867cbfacf21b6375b804d6bbd723", size = 352015, upload-time = "2026-05-28T12:01:30.214Z" }, - { url = "https://files.pythonhosted.org/packages/3b/5f/f1f6d2652eb9d848f6eb369d8db83a2da6249bb49ad2c2a48f45d54538d3/rpds_py-2026.5.1-cp315-cp315t-macosx_11_0_arm64.whl", hash = "sha256:b5f077b44a4f7808520f66dae234988d867deb9aed9be5da057ce9ba831b2a41", size = 345016, upload-time = "2026-05-28T12:01:31.656Z" }, - { url = "https://files.pythonhosted.org/packages/88/66/b74182775691ea2290c99e52ac8d5db844e56fbec90ce421f107658c8314/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d8f9b7b78c9538fc9e04e82ec0e888ff0c3cffcfad152c77e57cd09351a98a", size = 374775, upload-time = "2026-05-28T12:01:33.136Z" }, - { url = "https://files.pythonhosted.org/packages/ff/8f/15e5a61d9f0a43902d36561d4f07cae6ae9f4716be825159fd72717f33af/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3a8ae58895ac107ed934a6bf51e5846f95c53b9b940c2c6d310838fd5846358", size = 380270, upload-time = "2026-05-28T12:01:34.574Z" }, - { url = "https://files.pythonhosted.org/packages/02/c3/f859b12763a80540cdf2af0f15b19904cf756a71d7bdd3f82ff3e5b1bbf9/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0957cf3c2b8632ec7aaebffebea8005b353cc2a237b6e2ae3c2cac0820704cfb", size = 495285, upload-time = "2026-05-28T12:01:36.127Z" }, - { url = 
"https://files.pythonhosted.org/packages/1c/c7/ff27c2ac8411d30b03b1829fd88cae8dad1a4d0da48dd25e57c4038042e6/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c396c1304de421050b3681ea70f371874b54d41b0151e96109758144c231e30b", size = 389581, upload-time = "2026-05-28T12:01:37.635Z" }, - { url = "https://files.pythonhosted.org/packages/6e/67/fe92ee32a6cc05c77228a2f8b1762e7124f386ec20ff83d0757b762d58d0/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad1bff7f666b9598e573815affd666aac6a13a585dde336f843e33350c7fadc", size = 376041, upload-time = "2026-05-28T12:01:39.307Z" }, - { url = "https://files.pythonhosted.org/packages/f8/91/b4d6685c27aba55bd82f25b278be8237038117d05f9659a6213ad3408130/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_31_riscv64.whl", hash = "sha256:656a042550878f12d45752452d47094b7cfe5ad1e9d7b87b5a22ad3ae5ff8015", size = 383946, upload-time = "2026-05-28T12:01:41.043Z" }, - { url = "https://files.pythonhosted.org/packages/bd/79/2c1d832a53c8e0f8e98fc970ec257b950fecd4f62be2ab7182b500a0cbc8/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73c4bd4f70294737b5206a3e8e30ccadbf8a60301831c8ea23eec5dbeea1ecfa", size = 405526, upload-time = "2026-05-28T12:01:43.032Z" }, - { url = "https://files.pythonhosted.org/packages/78/c4/c98117b03c6a8581ab2c2dfccfe9a5ad82bd8128a3c28b46a6ad2d97c393/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:43bca78665423cabae77146f2fe7ce55272b6c8d55d82cca83effd42c7e13972", size = 551165, upload-time = "2026-05-28T12:01:44.648Z" }, - { url = "https://files.pythonhosted.org/packages/3b/c1/bc479ca069200af730881b1bd525e3114b2b391a351509fcb1b772f28086/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_i686.whl", hash = "sha256:42d0f20e85e549c870749d0e247f0c10d318a45b7e9676d575d2dcb04a1b2e66", size = 618778, upload-time = "2026-05-28T12:01:46.337Z" }, - { url = 
"https://files.pythonhosted.org/packages/77/65/38ab2f90df44c2febfb63cc10ced40763d9b4bc94d173e734528663fe7f5/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:b1be5c35683684d5331b93600c210e8367c254683d8a6df6bd21bd2da3a334fb", size = 581839, upload-time = "2026-05-28T12:01:48.109Z" }, - { url = "https://files.pythonhosted.org/packages/15/2d/ce1f605fe036aadd460e5822e578c6c7ec3a860936cca37d6e0f299daa77/rpds_py-2026.5.1-cp315-cp315t-win32.whl", hash = "sha256:75808f6c38ce7749bb68cc2770161aae5045e6c6f6781a9782e74b93304399df", size = 207866, upload-time = "2026-05-28T12:01:49.648Z" }, - { url = "https://files.pythonhosted.org/packages/79/cb/966040123eb102371559746908ef2c9471f4d43e17ec9a645a2258dab64b/rpds_py-2026.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:90bd6630002a1c7f09e7843dd79f0d24f3d2897cc25a753480917865d14f15b3", size = 225441, upload-time = "2026-05-28T12:01:51.408Z" }, -] - -[[package]] -name = "ruff" -version = "0.15.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/22/9e4f66ee588588dc6c9af6a994e12d26e19efbe874d1a909d09a6dac7a59/ruff-0.15.7.tar.gz", hash = "sha256:04f1ae61fc20fe0b148617c324d9d009b5f63412c0b16474f3d5f1a1a665f7ac", size = 4601277, upload-time = "2026-03-19T16:26:22.605Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/41/2f/0b08ced94412af091807b6119ca03755d651d3d93a242682bf020189db94/ruff-0.15.7-py3-none-linux_armv6l.whl", hash = "sha256:a81cc5b6910fb7dfc7c32d20652e50fa05963f6e13ead3c5915c41ac5d16668e", size = 10489037, upload-time = "2026-03-19T16:26:32.47Z" }, - { url = "https://files.pythonhosted.org/packages/91/4a/82e0fa632e5c8b1eba5ee86ecd929e8ff327bbdbfb3c6ac5d81631bef605/ruff-0.15.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:722d165bd52403f3bdabc0ce9e41fc47070ac56d7a91b4e0d097b516a53a3477", size = 10955433, upload-time = "2026-03-19T16:27:00.205Z" }, - { url = 
"https://files.pythonhosted.org/packages/ab/10/12586735d0ff42526ad78c049bf51d7428618c8b5c467e72508c694119df/ruff-0.15.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7fbc2448094262552146cbe1b9643a92f66559d3761f1ad0656d4991491af49e", size = 10269302, upload-time = "2026-03-19T16:26:26.183Z" }, - { url = "https://files.pythonhosted.org/packages/eb/5d/32b5c44ccf149a26623671df49cbfbd0a0ae511ff3df9d9d2426966a8d57/ruff-0.15.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b39329b60eba44156d138275323cc726bbfbddcec3063da57caa8a8b1d50adf", size = 10607625, upload-time = "2026-03-19T16:27:03.263Z" }, - { url = "https://files.pythonhosted.org/packages/5d/f1/f0001cabe86173aaacb6eb9bb734aa0605f9a6aa6fa7d43cb49cbc4af9c9/ruff-0.15.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:87768c151808505f2bfc93ae44e5f9e7c8518943e5074f76ac21558ef5627c85", size = 10324743, upload-time = "2026-03-19T16:27:09.791Z" }, - { url = "https://files.pythonhosted.org/packages/7a/87/b8a8f3d56b8d848008559e7c9d8bf367934d5367f6d932ba779456e2f73b/ruff-0.15.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb0511670002c6c529ec66c0e30641c976c8963de26a113f3a30456b702468b0", size = 11138536, upload-time = "2026-03-19T16:27:06.101Z" }, - { url = "https://files.pythonhosted.org/packages/e4/f2/4fd0d05aab0c5934b2e1464784f85ba2eab9d54bffc53fb5430d1ed8b829/ruff-0.15.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0d19644f801849229db8345180a71bee5407b429dd217f853ec515e968a6912", size = 11994292, upload-time = "2026-03-19T16:26:48.718Z" }, - { url = "https://files.pythonhosted.org/packages/64/22/fc4483871e767e5e95d1622ad83dad5ebb830f762ed0420fde7dfa9d9b08/ruff-0.15.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4806d8e09ef5e84eb19ba833d0442f7e300b23fe3f0981cae159a248a10f0036", size = 11398981, upload-time = "2026-03-19T16:26:54.513Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/99/66f0343176d5eab02c3f7fcd2de7a8e0dd7a41f0d982bee56cd1c24db62b/ruff-0.15.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dce0896488562f09a27b9c91b1f58a097457143931f3c4d519690dea54e624c5", size = 11242422, upload-time = "2026-03-19T16:26:29.277Z" }, - { url = "https://files.pythonhosted.org/packages/5d/3a/a7060f145bfdcce4c987ea27788b30c60e2c81d6e9a65157ca8afe646328/ruff-0.15.7-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:1852ce241d2bc89e5dc823e03cff4ce73d816b5c6cdadd27dbfe7b03217d2a12", size = 11232158, upload-time = "2026-03-19T16:26:42.321Z" }, - { url = "https://files.pythonhosted.org/packages/a7/53/90fbb9e08b29c048c403558d3cdd0adf2668b02ce9d50602452e187cd4af/ruff-0.15.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5f3e4b221fb4bd293f79912fc5e93a9063ebd6d0dcbd528f91b89172a9b8436c", size = 10577861, upload-time = "2026-03-19T16:26:57.459Z" }, - { url = "https://files.pythonhosted.org/packages/2f/aa/5f486226538fe4d0f0439e2da1716e1acf895e2a232b26f2459c55f8ddad/ruff-0.15.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:b15e48602c9c1d9bdc504b472e90b90c97dc7d46c7028011ae67f3861ceba7b4", size = 10327310, upload-time = "2026-03-19T16:26:35.909Z" }, - { url = "https://files.pythonhosted.org/packages/99/9e/271afdffb81fe7bfc8c43ba079e9d96238f674380099457a74ccb3863857/ruff-0.15.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b4705e0e85cedc74b0a23cf6a179dbb3df184cb227761979cc76c0440b5ab0d", size = 10840752, upload-time = "2026-03-19T16:26:45.723Z" }, - { url = "https://files.pythonhosted.org/packages/bf/29/a4ae78394f76c7759953c47884eb44de271b03a66634148d9f7d11e721bd/ruff-0.15.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:112c1fa316a558bb34319282c1200a8bf0495f1b735aeb78bfcb2991e6087580", size = 11336961, upload-time = "2026-03-19T16:26:39.076Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/6b/8786ba5736562220d588a2f6653e6c17e90c59ced34a2d7b512ef8956103/ruff-0.15.7-py3-none-win32.whl", hash = "sha256:6d39e2d3505b082323352f733599f28169d12e891f7dd407f2d4f54b4c2886de", size = 10582538, upload-time = "2026-03-19T16:26:15.992Z" }, - { url = "https://files.pythonhosted.org/packages/2b/e9/346d4d3fffc6871125e877dae8d9a1966b254fbd92a50f8561078b88b099/ruff-0.15.7-py3-none-win_amd64.whl", hash = "sha256:4d53d712ddebcd7dace1bc395367aec12c057aacfe9adbb6d832302575f4d3a1", size = 11755839, upload-time = "2026-03-19T16:26:19.897Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e8/726643a3ea68c727da31570bde48c7a10f1aa60eddd628d94078fec586ff/ruff-0.15.7-py3-none-win_arm64.whl", hash = "sha256:18e8d73f1c3fdf27931497972250340f92e8c861722161a9caeb89a58ead6ed2", size = 11023304, upload-time = "2026-03-19T16:26:51.669Z" }, -] - -[[package]] -name = "s3transfer" -version = "0.17.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "botocore" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9b/ec/7c692cde9125b77e84b307354d4fb705f98b8ccad59a036d5957ca75bfc3/s3transfer-0.17.0.tar.gz", hash = "sha256:9edeb6d1c3c2f89d6050348548834ad8289610d886e5bf7b7207728bd43ce33a", size = 155337, upload-time = "2026-04-29T22:07:36.33Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" }, -] - -[[package]] -name = "scikit-learn" -version = "1.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "scipy" }, - { name = "threadpoolctl" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, - { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, - { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, - { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, - { url = 
"https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, - { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, - { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, - { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, - { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, - { url = 
"https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, - { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, - { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, - { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, - { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, - { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, - { url = 
"https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, - { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, - { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, - { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, - { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, - { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, - { url = 
"https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, -] - -[[package]] -name = "scipy" -version = "1.17.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" }, - { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" }, - { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" }, - { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" }, - { url = 
"https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" }, - { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" }, - { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" }, - { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943, upload-time = "2026-02-23T00:20:50.89Z" }, - { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" }, - { url = 
"https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" }, - { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" }, - { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" }, - { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" }, - { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" }, - { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" }, - { url = 
"https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" }, - { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" }, - { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" }, - { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, - { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" }, - { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" }, - { url = 
"https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" }, - { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" }, - { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" }, - { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" }, - { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" }, - { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" }, - { url = 
"https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" }, - { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" }, - { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" }, - { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" }, - { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" }, - { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" }, - { url = 
"https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" }, - { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" }, - { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" }, - { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" }, - { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" }, - { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" }, -] - -[[package]] -name = "setuptools" -version = "81.0.0" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0d/1c/73e719955c59b8e424d015ab450f51c0af856ae46ea2da83eba51cc88de1/setuptools-81.0.0.tar.gz", hash = "sha256:487b53915f52501f0a79ccfd0c02c165ffe06631443a886740b91af4b7a5845a", size = 1198299, upload-time = "2026-02-06T21:10:39.601Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6", size = 1062021, upload-time = "2026-02-06T21:10:37.175Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - -[[package]] -name = "six" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time 
= "2024-12-04T17:35:26.475Z" }, -] - -[[package]] -name = "snowballstemmer" -version = "3.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/63/ee/67eef9600338e245ad7838230969a34c823ddbdbccc5e1fc43cd75b55bc9/snowballstemmer-3.1.0.tar.gz", hash = "sha256:fd9e34526b23340cd23ffea6c9f9760974ecc2c2ac9e1d81401443ccdb2a801f", size = 122523, upload-time = "2026-05-24T19:04:19.691Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/83/ddbf4533c62dd32667ef1238952abef155f3d3391f5be69a352ad1638a42/snowballstemmer-3.1.0-py3-none-any.whl", hash = "sha256:17e6d1da216aa07db6dad37139ea70cf13c4b2e9a096f6e64a9648fc657d3154", size = 104550, upload-time = "2026-05-24T19:04:18.026Z" }, -] - -[[package]] -name = "sortedcontainers" -version = "2.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, -] - -[[package]] -name = "soupsieve" -version = "2.8.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" }, -] - -[[package]] -name = "sphinx" -version = "5.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "alabaster" }, - { name = "babel" }, - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "docutils" }, - { name = "imagesize" }, - { name = "jinja2" }, - { name = "packaging" }, - { name = "pygments" }, - { name = "requests" }, - { name = "snowballstemmer" }, - { name = "sphinxcontrib-applehelp" }, - { name = "sphinxcontrib-devhelp" }, - { name = "sphinxcontrib-htmlhelp" }, - { name = "sphinxcontrib-jsmath" }, - { name = "sphinxcontrib-qthelp" }, - { name = "sphinxcontrib-serializinghtml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/83/5f/0ebf0c94dffd7a848ebfeafc0d3147a784afffb44e2c6fcaeab7eadd67ad/Sphinx-5.0.2.tar.gz", hash = "sha256:b18e978ea7565720f26019c702cd85c84376e948370f1cd43d60265010e1c7b0", size = 6729086, upload-time = "2022-06-16T17:17:44.446Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/a2/3139e82a7caa2fb6954d0e63db206cc60e0ad6c67ae61ef9cf87dc70ade1/Sphinx-5.0.2-py3-none-any.whl", hash = "sha256:d3e57663eed1d7c5c50895d191fdeda0b54ded6f44d5621b50709466c338d1e8", size = 3125975, upload-time = "2022-06-16T17:17:39.291Z" }, -] - -[[package]] -name = "sphinx-book-theme" -version = "1.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pydata-sphinx-theme" }, - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/be/3d/81b2d643d456e3d1bf9069836af7cc32c71e7efaa14faa5858100c9241d7/sphinx_book_theme-1.0.1.tar.gz", hash = "sha256:927b399a6906be067e49c11ef1a87472f1b1964075c9eea30fb82c64b20aedee", size = 290063, upload-time = 
"2023-03-31T07:13:37.536Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/45/3abe359075154f4d6a8626f4b591a28cc703d7c169cef1f7b87cab1a62f7/sphinx_book_theme-1.0.1-py3-none-any.whl", hash = "sha256:d15f8248b3718a9a6be0ba617a32d1591f9fa39c614469bface777ba06a73b75", size = 396932, upload-time = "2023-03-31T07:13:35.359Z" }, -] - -[[package]] -name = "sphinx-comments" -version = "0.0.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c0/75/5bbf29e83eaf79843180cf424d0d550bda14a1792ca51dcf79daa065ba93/sphinx-comments-0.0.3.tar.gz", hash = "sha256:00170afff27019fad08e421da1ae49c681831fb2759786f07c826e89ac94cf21", size = 7960, upload-time = "2020-08-12T00:07:31.183Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/97/a5c39f619375d4f81d5422377fb027075898efa6b6202c1ccf1e5bb38a32/sphinx_comments-0.0.3-py3-none-any.whl", hash = "sha256:1e879b4e9bfa641467f83e3441ac4629225fc57c29995177d043252530c21d00", size = 4591, upload-time = "2020-08-12T00:07:30.297Z" }, -] - -[[package]] -name = "sphinx-copybutton" -version = "0.5.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fc/2b/a964715e7f5295f77509e59309959f4125122d648f86b4fe7d70ca1d882c/sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd", size = 23039, upload-time = "2023-04-14T08:10:22.998Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/48/1ea60e74949eecb12cdd6ac43987f9fd331156388dcc2319b45e2ebb81bf/sphinx_copybutton-0.5.2-py3-none-any.whl", hash = "sha256:fb543fd386d917746c9a2c50360c7905b605726b9355cd26e9974857afeae06e", size = 13343, upload-time = "2023-04-14T08:10:20.844Z" }, -] - -[[package]] -name = "sphinx-design" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } 
-dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7b/28/0eb39f87239acf377db5f08cf2f2b27d43c3371624a4cca128dfe952aa10/sphinx_design-0.3.0.tar.gz", hash = "sha256:7183fa1fae55b37ef01bda5125a21ee841f5bbcbf59a35382be598180c4cefba", size = 2151040, upload-time = "2022-08-22T22:14:51.473Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/aa/9872f086a8f483f86f174d3eb03c6954b01e851e151c4ddbd5bf758a9402/sphinx_design-0.3.0-py3-none-any.whl", hash = "sha256:823c1dd74f31efb3285ec2f1254caefed29d762a40cd676f58413a1e4ed5cc96", size = 2173640, upload-time = "2022-08-22T22:14:49.5Z" }, -] - -[[package]] -name = "sphinx-external-toc" -version = "0.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "pyyaml" }, - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/55/84109f851bffeb93ecaa18c9d525168224038c5db633224b0e088b841c13/sphinx_external_toc-0.3.1.tar.gz", hash = "sha256:9c8ea9980ea0e57bf3ce98f6a400f9b69eb1df808f7dd796c9c8cc1873d8b355", size = 31574, upload-time = "2022-11-24T23:59:37.51Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/2e/ba8e08ac97618e3f207e19297840937d4c39d77cea1aca2b8ea0a18f0bd8/sphinx_external_toc-0.3.1-py3-none-any.whl", hash = "sha256:cd93c1e7599327b2a728db12d9819068ce719c4b037ffc62e47f20ffb6310fb3", size = 26710, upload-time = "2022-11-24T23:59:34.73Z" }, -] - -[[package]] -name = "sphinx-jupyterbook-latex" -version = "0.5.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/20/a5/7dbb8a2f4238d0c5dbed4b1fec6fcc1979571257b99620a21368674a45ef/sphinx_jupyterbook_latex-0.5.2.tar.gz", hash = "sha256:da1d3ad028f55ddbf10b9130bb9f24fc60cafb671cbd39dfd95537aafc90972e", size = 15731, upload-time = "2022-11-16T04:07:33.781Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/18/9e/23dfce036d9cd50d71fcc7da70b99b7a6ea92bd55eb4a2fc2ac8a9501dbf/sphinx_jupyterbook_latex-0.5.2-py3-none-any.whl", hash = "sha256:24de689689ddc27c736b15b91c6b9afdcdc31570938572693bb05bfff8f50758", size = 14027, upload-time = "2022-11-16T04:07:32.049Z" }, -] - -[[package]] -name = "sphinx-multitoc-numbering" -version = "0.1.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/37/1e/577bae038372885ebc34bd8c0f290295785a0250cac6528eb6d50e4b92d5/sphinx-multitoc-numbering-0.1.3.tar.gz", hash = "sha256:c9607671ac511236fa5d61a7491c1031e700e8d498c9d2418e6c61d1251209ae", size = 4542, upload-time = "2021-03-15T12:01:43.758Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/9f/902f2030674cd9473fdbe5a2c2dec2618c27ec853484c35f82cf8df40ece/sphinx_multitoc_numbering-0.1.3-py3-none-any.whl", hash = "sha256:33d2e707a9b2b8ad636b3d4302e658a008025106fe0474046c651144c26d8514", size = 4616, upload-time = "2021-03-15T12:01:42.419Z" }, -] - -[[package]] -name = "sphinx-thebe" -version = "0.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/f6/fb50e5cb37447e30caa25b1cda2734cb2a1737c2f16bb36e4c463dee583c/sphinx-thebe-0.2.1.tar.gz", hash = "sha256:f4c8c1542054f991b73fcb28c4cf21697e42aba2f83f22348c1c851b82766583", size = 8284, upload-time = "2023-01-27T09:50:26.797Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/1c/f4a84761862a783273bb11e4e8f36976f19d18a0ead4543663b02140a9cd/sphinx_thebe-0.2.1-py3-none-any.whl", hash = "sha256:e8af555c90acba3541fa7108ea5981ae9c4bd406b54d9a242ab054d326ab7441", size = 8287, upload-time = "2023-01-27T09:50:25.333Z" }, -] - -[[package]] -name = "sphinx-togglebutton" -version = "0.4.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = 
"docutils" }, - { name = "setuptools" }, - { name = "sphinx" }, - { name = "wheel" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cc/be/169a0b0a8ad9588e8697c85e1d489aaaca7416073c2fc0267c360af5aae9/sphinx_togglebutton-0.4.5.tar.gz", hash = "sha256:c870dfbd3bc6e119b50ff9a37a64f8991902269e856728931c7d89877e8d4b3d", size = 18101, upload-time = "2026-03-27T13:50:41.984Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/2e/3dd55564928c5d61f92827d4b91307dde7911a40fbe0000645d73202eea9/sphinx_togglebutton-0.4.5-py3-none-any.whl", hash = "sha256:74eac6d2426110c3e1e6f989a98e07d7823141a335df1ad8a9d637bdf6a7af62", size = 44907, upload-time = "2026-03-27T13:50:40.94Z" }, -] - -[[package]] -name = "sphinxcontrib-applehelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" }, -] - -[[package]] -name = "sphinxcontrib-bibtex" -version = "2.5.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "docutils" }, - { name = "pybtex" }, - { name = "pybtex-docutils" }, - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/eb/54/94fbcd5eb0532eaa91580d09795c4b6c562b72d5638c2ed5b5cc31d2b1f8/sphinxcontrib-bibtex-2.5.0.tar.gz", hash = "sha256:71b42e5db0e2e284f243875326bf9936aa9a763282277d75048826fef5b00eaa", size = 113310, upload-time = "2022-08-22T13:16:46.025Z" } -wheels = 
[ - { url = "https://files.pythonhosted.org/packages/b2/17/3be04de2ed752996654895558db01a30d64759b2c7120e7692402b8d4e19/sphinxcontrib_bibtex-2.5.0-py3-none-any.whl", hash = "sha256:748f726eaca6efff7731012103417ef130ecdcc09501b4d0c54283bf5f059f76", size = 39752, upload-time = "2022-08-22T13:16:43.376Z" }, -] - -[[package]] -name = "sphinxcontrib-devhelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" }, -] - -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" }, -] - -[[package]] -name = "sphinxcontrib-jsmath" -version = "1.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, upload-time = "2019-01-21T16:10:14.333Z" }, -] - -[[package]] -name = "sphinxcontrib-qthelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" }, -] - -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = 
"sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, -] - -[[package]] -name = "spm-calculator" -version = "0.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "census" }, - { name = "numpy" }, - { name = "openpyxl" }, - { name = "pandas" }, - { name = "requests" }, - { name = "us" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/54/3b/b805c7e3e18c5b5c00f61b60112f9690d084c910e2481bc020f35390d8fd/spm_calculator-0.3.1.tar.gz", hash = "sha256:41f2f4d00d8c03422a7d57b800052e7760b88e463a5884802f83ed58d35c18c1", size = 75945, upload-time = "2026-04-17T19:52:39.707Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/1b/29f705f8a96fc7f55f2c07dfcddbbae78efdc6f174d25d4a0560fc3f5cf9/spm_calculator-0.3.1-py3-none-any.whl", hash = "sha256:52c57ecc5a240ec941b0f2b0d93bc4fa437ef6250e233baed8e11916fa9c1150", size = 57826, upload-time = "2026-04-17T19:52:38.444Z" }, -] - -[[package]] -name = "sqlalchemy" -version = "2.0.49" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/09/45/461788f35e0364a8da7bda51a1fe1b09762d0c32f12f63727998d85a873b/sqlalchemy-2.0.49.tar.gz", hash = "sha256:d15950a57a210e36dd4cec1aac22787e2a4d57ba9318233e2ef8b2daf9ff2d5f", size = 9898221, upload-time = "2026-04-03T16:38:11.704Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/81/81755f50eb2478eaf2049728491d4ea4f416c1eb013338682173259efa09/sqlalchemy-2.0.49-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df2d441bacf97022e81ad047e1597552eb3f83ca8a8f1a1fdd43cd7fe3898120", size = 
2154547, upload-time = "2026-04-03T16:53:08.64Z" }, - { url = "https://files.pythonhosted.org/packages/a2/bc/3494270da80811d08bcfa247404292428c4fe16294932bce5593f215cad9/sqlalchemy-2.0.49-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8e20e511dc15265fb433571391ba313e10dd8ea7e509d51686a51313b4ac01a2", size = 3280782, upload-time = "2026-04-03T17:07:43.508Z" }, - { url = "https://files.pythonhosted.org/packages/cd/f5/038741f5e747a5f6ea3e72487211579d8cbea5eb9827a9cbd61d0108c4bd/sqlalchemy-2.0.49-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47604cb2159f8bbd5a1ab48a714557156320f20871ee64d550d8bf2683d980d3", size = 3297156, upload-time = "2026-04-03T17:12:27.697Z" }, - { url = "https://files.pythonhosted.org/packages/88/50/a6af0ff9dc954b43a65ca9b5367334e45d99684c90a3d3413fc19a02d43c/sqlalchemy-2.0.49-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:22d8798819f86720bc646ab015baff5ea4c971d68121cb36e2ebc2ee43ead2b7", size = 3228832, upload-time = "2026-04-03T17:07:45.38Z" }, - { url = "https://files.pythonhosted.org/packages/bc/d1/5f6bdad8de0bf546fc74370939621396515e0cdb9067402d6ba1b8afbe9a/sqlalchemy-2.0.49-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9b1c058c171b739e7c330760044803099c7fff11511e3ab3573e5327116a9c33", size = 3267000, upload-time = "2026-04-03T17:12:29.657Z" }, - { url = "https://files.pythonhosted.org/packages/f7/30/ad62227b4a9819a5e1c6abff77c0f614fa7c9326e5a3bdbee90f7139382b/sqlalchemy-2.0.49-cp313-cp313-win32.whl", hash = "sha256:a143af2ea6672f2af3f44ed8f9cd020e9cc34c56f0e8db12019d5d9ecf41cb3b", size = 2115641, upload-time = "2026-04-03T17:05:43.989Z" }, - { url = "https://files.pythonhosted.org/packages/17/3a/7215b1b7d6d49dc9a87211be44562077f5f04f9bb5a59552c1c8e2d98173/sqlalchemy-2.0.49-cp313-cp313-win_amd64.whl", hash = "sha256:12b04d1db2663b421fe072d638a138460a51d5a862403295671c4f3987fb9148", size = 2141498, upload-time = 
"2026-04-03T17:05:45.7Z" }, - { url = "https://files.pythonhosted.org/packages/28/4b/52a0cb2687a9cd1648252bb257be5a1ba2c2ded20ba695c65756a55a15a4/sqlalchemy-2.0.49-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24bd94bb301ec672d8f0623eba9226cc90d775d25a0c92b5f8e4965d7f3a1518", size = 3560807, upload-time = "2026-04-03T16:58:31.666Z" }, - { url = "https://files.pythonhosted.org/packages/8c/d8/fda95459204877eed0458550d6c7c64c98cc50c2d8d618026737de9ed41a/sqlalchemy-2.0.49-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a51d3db74ba489266ef55c7a4534eb0b8db9a326553df481c11e5d7660c8364d", size = 3527481, upload-time = "2026-04-03T17:06:00.155Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0a/2aac8b78ac6487240cf7afef8f203ca783e8796002dc0cf65c4ee99ff8bb/sqlalchemy-2.0.49-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:55250fe61d6ebfd6934a272ee16ef1244e0f16b7af6cd18ab5b1fc9f08631db0", size = 3468565, upload-time = "2026-04-03T16:58:33.414Z" }, - { url = "https://files.pythonhosted.org/packages/a5/3d/ce71cfa82c50a373fd2148b3c870be05027155ce791dc9a5dcf439790b8b/sqlalchemy-2.0.49-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:46796877b47034b559a593d7e4b549aba151dae73f9e78212a3478161c12ab08", size = 3477769, upload-time = "2026-04-03T17:06:02.787Z" }, - { url = "https://files.pythonhosted.org/packages/d5/e8/0a9f5c1f7c6f9ca480319bf57c2d7423f08d31445974167a27d14483c948/sqlalchemy-2.0.49-cp313-cp313t-win32.whl", hash = "sha256:9c4969a86e41454f2858256c39bdfb966a20961e9b58bf8749b65abf447e9a8d", size = 2143319, upload-time = "2026-04-03T17:02:04.328Z" }, - { url = "https://files.pythonhosted.org/packages/0e/51/fb5240729fbec73006e137c4f7a7918ffd583ab08921e6ff81a999d6517a/sqlalchemy-2.0.49-cp313-cp313t-win_amd64.whl", hash = "sha256:b9870d15ef00e4d0559ae10ee5bc71b654d1f20076dbe8bc7ed19b4c0625ceba", size = 2175104, upload-time = "2026-04-03T17:02:05.989Z" 
}, - { url = "https://files.pythonhosted.org/packages/55/33/bf28f618c0a9597d14e0b9ee7d1e0622faff738d44fe986ee287cdf1b8d0/sqlalchemy-2.0.49-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:233088b4b99ebcbc5258c755a097aa52fbf90727a03a5a80781c4b9c54347a2e", size = 2156356, upload-time = "2026-04-03T16:53:09.914Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a7/5f476227576cb8644650eff68cc35fa837d3802b997465c96b8340ced1e2/sqlalchemy-2.0.49-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57ca426a48eb2c682dae8204cd89ea8ab7031e2675120a47924fabc7caacbc2a", size = 3276486, upload-time = "2026-04-03T17:07:46.9Z" }, - { url = "https://files.pythonhosted.org/packages/2e/84/efc7c0bf3a1c5eef81d397f6fddac855becdbb11cb38ff957888603014a7/sqlalchemy-2.0.49-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:685e93e9c8f399b0c96a624799820176312f5ceef958c0f88215af4013d29066", size = 3281479, upload-time = "2026-04-03T17:12:32.226Z" }, - { url = "https://files.pythonhosted.org/packages/91/68/bb406fa4257099c67bd75f3f2261b129c63204b9155de0d450b37f004698/sqlalchemy-2.0.49-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9e0400fa22f79acc334d9a6b185dc00a44a8e6578aa7e12d0ddcd8434152b187", size = 3226269, upload-time = "2026-04-03T17:07:48.678Z" }, - { url = "https://files.pythonhosted.org/packages/67/84/acb56c00cca9f251f437cb49e718e14f7687505749ea9255d7bd8158a6df/sqlalchemy-2.0.49-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a05977bffe9bffd2229f477fa75eabe3192b1b05f408961d1bebff8d1cd4d401", size = 3248260, upload-time = "2026-04-03T17:12:34.381Z" }, - { url = "https://files.pythonhosted.org/packages/56/19/6a20ea25606d1efd7bd1862149bb2a22d1451c3f851d23d887969201633f/sqlalchemy-2.0.49-cp314-cp314-win32.whl", hash = "sha256:0f2fa354ba106eafff2c14b0cc51f22801d1e8b2e4149342023bd6f0955de5f5", size = 2118463, upload-time = "2026-04-03T17:05:47.093Z" }, - { url = 
"https://files.pythonhosted.org/packages/cf/4f/8297e4ed88e80baa1f5aa3c484a0ee29ef3c69c7582f206c916973b75057/sqlalchemy-2.0.49-cp314-cp314-win_amd64.whl", hash = "sha256:77641d299179c37b89cf2343ca9972c88bb6eef0d5fc504a2f86afd15cd5adf5", size = 2144204, upload-time = "2026-04-03T17:05:48.694Z" }, - { url = "https://files.pythonhosted.org/packages/1f/33/95e7216df810c706e0cd3655a778604bbd319ed4f43333127d465a46862d/sqlalchemy-2.0.49-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c1dc3368794d522f43914e03312202523cc89692f5389c32bea0233924f8d977", size = 3565474, upload-time = "2026-04-03T16:58:35.128Z" }, - { url = "https://files.pythonhosted.org/packages/0c/a4/ed7b18d8ccf7f954a83af6bb73866f5bc6f5636f44c7731fbb741f72cc4f/sqlalchemy-2.0.49-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c821c47ecfe05cc32140dcf8dc6fd5d21971c86dbd56eabfe5ba07a64910c01", size = 3530567, upload-time = "2026-04-03T17:06:04.587Z" }, - { url = "https://files.pythonhosted.org/packages/73/a3/20faa869c7e21a827c4a2a42b41353a54b0f9f5e96df5087629c306df71e/sqlalchemy-2.0.49-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9c04bff9a5335eb95c6ecf1c117576a0aa560def274876fd156cfe5510fccc61", size = 3474282, upload-time = "2026-04-03T16:58:37.131Z" }, - { url = "https://files.pythonhosted.org/packages/b7/50/276b9a007aa0764304ad467eceb70b04822dc32092492ee5f322d559a4dc/sqlalchemy-2.0.49-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7f605a456948c35260e7b2a39f8952a26f077fd25653c37740ed186b90aaa68a", size = 3480406, upload-time = "2026-04-03T17:06:07.176Z" }, - { url = "https://files.pythonhosted.org/packages/e5/c3/c80fcdb41905a2df650c2a3e0337198b6848876e63d66fe9188ef9003d24/sqlalchemy-2.0.49-cp314-cp314t-win32.whl", hash = "sha256:6270d717b11c5476b0cbb21eedc8d4dbb7d1a956fd6c15a23e96f197a6193158", size = 2149151, upload-time = "2026-04-03T17:02:07.281Z" }, - { url = 
"https://files.pythonhosted.org/packages/05/52/9f1a62feab6ed368aff068524ff414f26a6daebc7361861035ae00b05530/sqlalchemy-2.0.49-cp314-cp314t-win_amd64.whl", hash = "sha256:275424295f4256fd301744b8f335cff367825d270f155d522b30c7bf49903ee7", size = 2184178, upload-time = "2026-04-03T17:02:08.623Z" }, - { url = "https://files.pythonhosted.org/packages/e5/30/8519fdde58a7bdf155b714359791ad1dc018b47d60269d5d160d311fdc36/sqlalchemy-2.0.49-py3-none-any.whl", hash = "sha256:ec44cfa7ef1a728e88ad41674de50f6db8cfdb3e2af84af86e0041aaf02d43d0", size = 1942158, upload-time = "2026-04-03T16:53:44.135Z" }, -] - -[[package]] -name = "stack-data" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "asttokens" }, - { name = "executing" }, - { name = "pure-eval" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, -] - -[[package]] -name = "standard-imghdr" -version = "3.13.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/8d/ab2620fbe2e348483c9cb776c3b7b3cc407899291a041d7fa026469b7cd1/standard_imghdr-3.13.0.tar.gz", hash = "sha256:8d9c68058d882f6fc3542a8d39ef9ff94d2187dc90bd0c851e0902776b7b7a42", size = 5511, upload-time = "2024-10-30T16:01:36.412Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/df/cb/e1da7e340586a078404c7e4328bfefc930867ace8a9a55916fd220cf9547/standard_imghdr-3.13.0-py3-none-any.whl", hash = 
"sha256:30a1bff5465605bb496f842a6ac3cc1f2131bf3025b0da28d4877d6d4b7cc8e9", size = 4639, upload-time = "2024-10-30T16:01:13.829Z" }, -] - -[[package]] -name = "statsmodels" -version = "0.14.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "packaging" }, - { name = "pandas" }, - { name = "patsy" }, - { name = "scipy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0d/81/e8d74b34f85285f7335d30c5e3c2d7c0346997af9f3debf9a0a9a63de184/statsmodels-0.14.6.tar.gz", hash = "sha256:4d17873d3e607d398b85126cd4ed7aad89e4e9d89fc744cdab1af3189a996c2a", size = 20689085, upload-time = "2025-12-05T23:08:39.522Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/59/a5aad5b0cc266f5be013db8cde563ac5d2a025e7efc0c328d83b50c72992/statsmodels-0.14.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47ee7af083623d2091954fa71c7549b8443168f41b7c5dce66510274c50fd73e", size = 10072009, upload-time = "2025-12-05T23:11:14.021Z" }, - { url = "https://files.pythonhosted.org/packages/53/dd/d8cfa7922fc6dc3c56fa6c59b348ea7de829a94cd73208c6f8202dd33f17/statsmodels-0.14.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa60d82e29fcd0a736e86feb63a11d2380322d77a9369a54be8b0965a3985f71", size = 9980018, upload-time = "2025-12-05T23:11:30.907Z" }, - { url = "https://files.pythonhosted.org/packages/ee/77/0ec96803eba444efd75dba32f2ef88765ae3e8f567d276805391ec2c98c6/statsmodels-0.14.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:89ee7d595f5939cc20bf946faedcb5137d975f03ae080f300ebb4398f16a5bd4", size = 10060269, upload-time = "2025-12-05T23:11:46.338Z" }, - { url = "https://files.pythonhosted.org/packages/10/b9/fd41f1f6af13a1a1212a06bb377b17762feaa6d656947bf666f76300fc05/statsmodels-0.14.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:730f3297b26749b216a06e4327fe0be59b8d05f7d594fb6caff4287b69654589", size = 
10324155, upload-time = "2025-12-05T23:12:01.805Z" }, - { url = "https://files.pythonhosted.org/packages/ee/0f/a6900e220abd2c69cd0a07e3ad26c71984be6061415a60e0f17b152ecf08/statsmodels-0.14.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f1c08befa85e93acc992b72a390ddb7bd876190f1360e61d10cf43833463bc9c", size = 10349765, upload-time = "2025-12-05T23:12:18.018Z" }, - { url = "https://files.pythonhosted.org/packages/98/08/b79f0c614f38e566eebbdcff90c0bcacf3c6ba7a5bbb12183c09c29ca400/statsmodels-0.14.6-cp313-cp313-win_amd64.whl", hash = "sha256:8021271a79f35b842c02a1794465a651a9d06ec2080f76ebc3b7adce77d08233", size = 9540043, upload-time = "2025-12-05T23:12:33.887Z" }, - { url = "https://files.pythonhosted.org/packages/71/de/09540e870318e0c7b58316561d417be45eff731263b4234fdd2eee3511a8/statsmodels-0.14.6-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:00781869991f8f02ad3610da6627fd26ebe262210287beb59761982a8fa88cae", size = 10069403, upload-time = "2025-12-05T23:12:48.424Z" }, - { url = "https://files.pythonhosted.org/packages/ab/f0/63c1bfda75dc53cee858006e1f46bd6d6f883853bea1b97949d0087766ca/statsmodels-0.14.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:73f305fbf31607b35ce919fae636ab8b80d175328ed38fdc6f354e813b86ee37", size = 9989253, upload-time = "2025-12-05T23:13:05.274Z" }, - { url = "https://files.pythonhosted.org/packages/c1/98/b0dfb4f542b2033a3341aa5f1bdd97024230a4ad3670c5b0839d54e3dcab/statsmodels-0.14.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e443e7077a6e2d3faeea72f5a92c9f12c63722686eb80bb40a0f04e4a7e267ad", size = 10090802, upload-time = "2025-12-05T23:13:20.653Z" }, - { url = "https://files.pythonhosted.org/packages/34/0e/2408735aca9e764643196212f9069912100151414dd617d39ffc72d77eee/statsmodels-0.14.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3414e40c073d725007a6603a18247ab7af3467e1af4a5e5a24e4c27bc26673b4", size = 10337587, 
upload-time = "2025-12-05T23:13:37.597Z" }, - { url = "https://files.pythonhosted.org/packages/0f/36/4d44f7035ab3c0b2b6a4c4ebb98dedf36246ccbc1b3e2f51ebcd7ac83abb/statsmodels-0.14.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a518d3f9889ef920116f9fa56d0338069e110f823926356946dae83bc9e33e19", size = 10363350, upload-time = "2025-12-05T23:13:53.08Z" }, - { url = "https://files.pythonhosted.org/packages/26/33/f1652d0c59fa51de18492ee2345b65372550501ad061daa38f950be390b6/statsmodels-0.14.6-cp314-cp314-win_amd64.whl", hash = "sha256:151b73e29f01fe619dbce7f66d61a356e9d1fe5e906529b78807df9189c37721", size = 9588010, upload-time = "2025-12-05T23:14:07.28Z" }, -] - -[[package]] -name = "sympy" -version = "1.14.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mpmath" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, -] - -[[package]] -name = "tables" -version = "3.11.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "blosc2" }, - { name = "numexpr" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "py-cpuinfo" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cc/a3/d213ebe7376d48055bd55a29cd9f99061afa0dcece608f94a5025d797b0a/tables-3.11.1.tar.gz", hash = "sha256:78abcf413091bc7c1e4e8c10fbbb438d1ac0b5a87436c5b972c3e8253871b6fb", size = 4790533, upload-time = "2026-03-01T11:43:36.036Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/fa/bb/4a9cde6628563388db26fa86c64adb0f2475a757e72af0ec185fd520b72f/tables-3.11.1-cp311-abi3-macosx_10_9_x86_64.whl", hash = "sha256:eb30684c42a77bbecdef2b9c763c4372b0ddc9cc5bd8b2a2055f2042eee67217", size = 7045977, upload-time = "2026-03-01T11:42:48.605Z" }, - { url = "https://files.pythonhosted.org/packages/78/74/6568c8d3aabf9982ab89fe3e378afbd7aad4894bde4570991a3246169ef4/tables-3.11.1-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:f0367d2e3df0f10ea63ccf4279f3fe58e32ec481767320301a483e2b3cd83efc", size = 6264947, upload-time = "2026-03-01T11:42:53.192Z" }, - { url = "https://files.pythonhosted.org/packages/cc/a3/ec228901fca4c996306b17f5c60a4105144df0bbd07b3a4a816f91f37b4a/tables-3.11.1-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56bf6fb9132ead989b7e76695d7613d6d08f071a8019038d6565ba90c66b9f3e", size = 6903733, upload-time = "2026-03-01T11:42:58.349Z" }, - { url = "https://files.pythonhosted.org/packages/99/29/c2dc674ea70fa9a4819417289a9c0d3e4780835beeed573eb66964cfb763/tables-3.11.1-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e78fe190fdeb4afe430b79651bae2a4f341904eb85aa8dbafe5f1caee1c7f67", size = 7241357, upload-time = "2026-03-01T11:43:03.938Z" }, - { url = "https://files.pythonhosted.org/packages/60/b5/a59b62af4127790c618eb11c06c106706e07509a3fb9e346b2a3ffa74419/tables-3.11.1-cp311-abi3-win_amd64.whl", hash = "sha256:7fa6cb03f6fe55ae4f85e89ec5450e5c40cc4c52d8c3b60eb157a445c2219e89", size = 6526565, upload-time = "2026-03-01T11:43:08.58Z" }, - { url = "https://files.pythonhosted.org/packages/1e/ce/561c82496e7c8c15ebf19b53b12c0ef91b322a66869db762db9711102764/tables-3.11.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:a4bbd95036a4d0cc5c86c1f87fbb490b4c53cd70982f1c01b3ed6dcb3085cbb9", size = 7111409, upload-time = "2026-03-01T11:43:13.424Z" }, - { url = 
"https://files.pythonhosted.org/packages/84/18/bac920aee8239b572c506459607c6dd8742bc6275a43d51d2dd6ae1a1541/tables-3.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e3cfe79484351f7216eb8f3767bfa1217bfd271b04428f79cfa7ef6d7491919d", size = 6380142, upload-time = "2026-03-01T11:43:17.213Z" }, - { url = "https://files.pythonhosted.org/packages/59/3c/f4a694aa744d2b14d536e172c28dd70c84445f4787083a82d6d44a39e39f/tables-3.11.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a9c35f87fcb6a48c79fbc4e3ab15ca8f6053c4ce13063d6ca2ec36cbb58f40f", size = 7014135, upload-time = "2026-03-01T11:43:22.359Z" }, - { url = "https://files.pythonhosted.org/packages/45/82/94d4320d6c0fe5bd55230eec90cd142d58cda37b7cce00a318ac2a6abd93/tables-3.11.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4cf3218b76ba78d156d6ee75c19fb757d50682f6c7b4905370441afbfc9d77f3", size = 7349293, upload-time = "2026-03-01T11:43:27.569Z" }, - { url = "https://files.pythonhosted.org/packages/f7/02/a0f61a602ce2f2be8cc2e6146cc51acdaa8a1bb9b823b3863e70d3e0505d/tables-3.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:a6f7a3b82dbf0ae0f30de635ca88bb42dd87938b0950369d0ee4289c52ae6de2", size = 6854713, upload-time = "2026-03-01T11:43:31.934Z" }, -] - -[[package]] -name = "tabulate" -version = "0.10.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = 
"2026-03-04T18:55:31.284Z" }, -] - -[[package]] -name = "tenacity" -version = "9.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, -] - -[[package]] -name = "threadpoolctl" -version = "3.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, -] - -[[package]] -name = "torch" -version = "2.11.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, - { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" }, - { name = "filelock" }, - { name = "fsspec" }, - { name = "jinja2" }, - { name = "networkx" }, - { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, - { name = 
"nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, - { name = "setuptools" }, - { name = "sympy" }, - { name = "triton", marker = "sys_platform == 'linux'" }, - { name = "typing-extensions" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/89/5ea6722763acee56b045435fb84258db7375c48165ec8be7880ab2b281c5/torch-2.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e6debd97ccd3205bbb37eb806a9d8219e1139d15419982c09e23ef7d4369d18", size = 80606801, upload-time = "2026-03-23T18:10:18.649Z" }, - { url = "https://files.pythonhosted.org/packages/32/d1/8ed2173589cbfe744ed54e5a73efc107c0085ba5777ee93a5f4c1ab90553/torch-2.11.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:63a68fa59de8f87acc7e85a5478bb2dddbb3392b7593ec3e78827c793c4b73fd", size = 419732382, upload-time = "2026-03-23T18:08:30.835Z" }, - { url = "https://files.pythonhosted.org/packages/3d/e1/b73f7c575a4b8f87a5928f50a1e35416b5e27295d8be9397d5293e7e8d4c/torch-2.11.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:cc89b9b173d9adfab59fd227f0ab5e5516d9a52b658ae41d64e59d2e55a418db", size = 530711509, upload-time = "2026-03-23T18:08:47.213Z" }, - { url = "https://files.pythonhosted.org/packages/66/82/3e3fcdd388fbe54e29fd3f991f36846ff4ac90b0d0181e9c8f7236565f82/torch-2.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:4dda3b3f52d121063a731ddb835f010dc137b920d7fec2778e52f60d8e4bf0cd", size = 114555842, upload-time = "2026-03-23T18:09:52.111Z" }, - { url = "https://files.pythonhosted.org/packages/db/38/8ac78069621b8c2b4979c2f96dc8409ef5e9c4189f6aac629189a78677ca/torch-2.11.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8b394322f49af4362d4f80e424bcaca7efcd049619af03a4cf4501520bdf0fb4", size = 80959574, upload-time = "2026-03-23T18:10:14.214Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/6c/56bfb37073e7136e6dd86bfc6af7339946dd684e0ecf2155ac0eee687ae1/torch-2.11.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2658f34ce7e2dabf4ec73b45e2ca68aedad7a5be87ea756ad656eaf32bf1e1ea", size = 419732324, upload-time = "2026-03-23T18:09:36.604Z" }, - { url = "https://files.pythonhosted.org/packages/07/f4/1b666b6d61d3394cca306ea543ed03a64aad0a201b6cd159f1d41010aeb1/torch-2.11.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:98bb213c3084cfe176302949bdc360074b18a9da7ab59ef2edc9d9f742504778", size = 530596026, upload-time = "2026-03-23T18:09:20.842Z" }, - { url = "https://files.pythonhosted.org/packages/48/6b/30d1459fa7e4b67e9e3fe1685ca1d8bb4ce7c62ef436c3a615963c6c866c/torch-2.11.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a97b94bbf62992949b4730c6cd2cc9aee7b335921ee8dc207d930f2ed09ae2db", size = 114793702, upload-time = "2026-03-23T18:09:47.304Z" }, - { url = "https://files.pythonhosted.org/packages/26/0d/8603382f61abd0db35841148ddc1ffd607bf3100b11c6e1dab6d2fc44e72/torch-2.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:01018087326984a33b64e04c8cb5c2795f9120e0d775ada1f6638840227b04d7", size = 80573442, upload-time = "2026-03-23T18:09:10.117Z" }, - { url = "https://files.pythonhosted.org/packages/c7/86/7cd7c66cb9cec6be330fff36db5bd0eef386d80c031b581ec81be1d4b26c/torch-2.11.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:2bb3cc54bd0dea126b0060bb1ec9de0f9c7f7342d93d436646516b0330cd5be7", size = 419749385, upload-time = "2026-03-23T18:07:33.77Z" }, - { url = "https://files.pythonhosted.org/packages/47/e8/b98ca2d39b2e0e4730c0ee52537e488e7008025bc77ca89552ff91021f7c/torch-2.11.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4dc8b3809469b6c30b411bb8c4cad3828efd26236153d9beb6a3ec500f211a60", size = 530716756, upload-time = "2026-03-23T18:07:50.02Z" }, - { url = 
"https://files.pythonhosted.org/packages/78/88/d4a4cda8362f8a30d1ed428564878c3cafb0d87971fbd3947d4c84552095/torch-2.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:2b4e811728bd0cc58fb2b0948fe939a1ee2bf1422f6025be2fca4c7bd9d79718", size = 114552300, upload-time = "2026-03-23T18:09:05.617Z" }, - { url = "https://files.pythonhosted.org/packages/bf/46/4419098ed6d801750f26567b478fc185c3432e11e2cad712bc6b4c2ab0d0/torch-2.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8245477871c3700d4370352ffec94b103cfcb737229445cf9946cddb7b2ca7cd", size = 80959460, upload-time = "2026-03-23T18:09:00.818Z" }, - { url = "https://files.pythonhosted.org/packages/fd/66/54a56a4a6ceaffb567231994a9745821d3af922a854ed33b0b3a278e0a99/torch-2.11.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:ab9a8482f475f9ba20e12db84b0e55e2f58784bdca43a854a6ccd3fd4b9f75e6", size = 419735835, upload-time = "2026-03-23T18:07:18.974Z" }, - { url = "https://files.pythonhosted.org/packages/b1/e7/0b6665f533aa9e337662dc190425abc0af1fe3234088f4454c52393ded61/torch-2.11.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:563ed3d25542d7e7bbc5b235ccfacfeb97fb470c7fee257eae599adb8005c8a2", size = 530613405, upload-time = "2026-03-23T18:08:07.014Z" }, - { url = "https://files.pythonhosted.org/packages/cf/bf/c8d12a2c86dbfd7f40fb2f56fbf5a505ccf2d9ce131eb559dfc7c51e1a04/torch-2.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b2a43985ff5ef6ddd923bbcf99943e5f58059805787c5c9a2622bf05ca2965b0", size = 114792991, upload-time = "2026-03-23T18:08:19.216Z" }, -] - -[[package]] -name = "tornado" -version = "6.5.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/50/57/6d7303a77ae439d9189108f76c0c4fd89ee5e2cc8387bffb55232565c4ed/tornado-6.5.6.tar.gz", hash = "sha256:9a365179fe8ff6b8766f602c0f67c185d778193e9bdd828b19f0b6ed7764177d", size = 518139, upload-time = "2026-05-27T15:35:54.646Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/1b/0d/b4f481e18c5a51864e6d12b9a05ecf72919696680b747c958c3fc1f4fbae/tornado-6.5.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:65fcfaafb079435c2c19dc9e07c0f1cf0fa9051759ed0a7d0a3ba7ea7f64919c", size = 447737, upload-time = "2026-05-27T15:35:38.122Z" }, - { url = "https://files.pythonhosted.org/packages/9e/9c/5430c39fcab1144d35860f457b15e9c08b4bc7ac86764354204e983d6183/tornado-6.5.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:38bc01b4acacded2de63ae78023548e41ebe6fbed3ec05a796d7ae3ad893887e", size = 445899, upload-time = "2026-05-27T15:35:40.519Z" }, - { url = "https://files.pythonhosted.org/packages/8b/79/fa7e14a2f939c807a8d30619b4eb604eab219601b78792516ebe22d40cf9/tornado-6.5.6-cp39-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b942e6a137fda31ff54bf8e6e2c8d1c37f1f50583f3ed53fb840b53b9601d104", size = 448964, upload-time = "2026-05-27T15:35:42.106Z" }, - { url = "https://files.pythonhosted.org/packages/a7/71/bd67d5f5199f937dafe03a49a37989f60f600ff6fef34c79412a829d97bd/tornado-6.5.6-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8666946e70171b8c3f1fc9b7876fac492e84822c4c7f3746f4e8f8bc9ac92a79", size = 449935, upload-time = "2026-05-27T15:35:43.906Z" }, - { url = "https://files.pythonhosted.org/packages/cc/a4/c24388c9cf5b3c3a513b56a158af9f23092c9a2810d789e294310797df21/tornado-6.5.6-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1c34cfab7ad6d104f052f55de06d39bbafc5885cfeb4da688803308dbcfa90b7", size = 449767, upload-time = "2026-05-27T15:35:45.793Z" }, - { url = "https://files.pythonhosted.org/packages/a5/eb/6a07ad550c3f7b37244bd0becdf293ec3d3e961783d8b720a97df50de1b2/tornado-6.5.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:385f35e4e22fb52551dfcda4cdc8c30c61c2c001aef5ddad99cdfe116952efd3", size = 449174, upload-time = "2026-05-27T15:35:47.485Z" }, - { url = 
"https://files.pythonhosted.org/packages/bb/84/3469e098dccdb6763130e06aacd786bb4363fca7b590a55c101ddf34ed30/tornado-6.5.6-cp39-abi3-win32.whl", hash = "sha256:db475f1b67b2809b10bb16264829087724ca8d24fe4ed47f7b8675cae453ef86", size = 450230, upload-time = "2026-05-27T15:35:49.322Z" }, - { url = "https://files.pythonhosted.org/packages/d2/3c/273a04e0b9dd9016f1685cca0c1c8795a71ac88a34a8c889a0b443483226/tornado-6.5.6-cp39-abi3-win_amd64.whl", hash = "sha256:6739bf1e8eb09230f1280ddbd3236f0309db70f2c551a8dbc40f62babdf82f79", size = 450667, upload-time = "2026-05-27T15:35:51.194Z" }, - { url = "https://files.pythonhosted.org/packages/02/98/0cffe22a224f60c5fb1e3aa0b76f9da2e1ca78b0e9545e3d077c68ce60a7/tornado-6.5.6-cp39-abi3-win_arm64.whl", hash = "sha256:2543597b24a695d72338a9a77818362d72387c03ae173f1f169eadc5c91466ac", size = 449690, upload-time = "2026-05-27T15:35:52.902Z" }, -] - -[[package]] -name = "tqdm" -version = "4.67.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, -] - -[[package]] -name = "traitlets" -version = "5.14.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", 
size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, -] - -[[package]] -name = "triton" -version = "3.6.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" }, - { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" }, - { url = "https://files.pythonhosted.org/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6", size = 176276087, upload-time = "2026-01-20T16:16:18.989Z" }, - { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" }, - { url = 
"https://files.pythonhosted.org/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43", size = 176138577, upload-time = "2026-01-20T16:16:25.426Z" }, - { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" }, - { url = "https://files.pythonhosted.org/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d", size = 176274804, upload-time = "2026-01-20T16:16:31.528Z" }, - { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" }, -] - -[[package]] -name = "typer" -version = "0.24.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-doc" }, - { name = "click" }, - { name = "rich" }, - { name = "shellingham" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.15.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, -] - -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - -[[package]] -name = "tzdata" -version = "2025.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, -] - -[[package]] -name = "uc-micro-py" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/78/67/9a363818028526e2d4579334460df777115bdec1bb77c08f9db88f6389f2/uc_micro_py-2.0.0.tar.gz", hash = "sha256:c53691e495c8db60e16ffc4861a35469b0ba0821fe409a8a7a0a71864d33a811", size = 6611, upload-time = "2026-03-01T06:31:27.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/61/73/d21edf5b204d1467e06500080a50f79d49ef2b997c79123a536d4a17d97c/uc_micro_py-2.0.0-py3-none-any.whl", hash = "sha256:3603a3859af53e5a39bc7677713c78ea6589ff188d70f4fee165db88e22b242c", size = 6383, upload-time = "2026-03-01T06:31:26.257Z" }, -] - -[[package]] -name = "urllib3" -version = "2.6.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, -] - 
-[[package]] -name = "us" -version = "3.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jellyfish" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/35/12/06f87be706ccc5794569d14f903c2f755aa98e1a9d53e4e7e17d9986e9d1/us-3.2.0.tar.gz", hash = "sha256:cb223e85393dcc5171ead0dd212badc47f9667b23700fea3e7ea5f310d545338", size = 16046, upload-time = "2024-07-22T01:09:42.736Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/65/a8/1791660a87f03d10a3bce00401a66035999c91f5a9a6987569b84df5719d/us-3.2.0-py3-none-any.whl", hash = "sha256:571714ad6d473c72bbd2058a53404cdf4ecc0129e4f19adfcbeb4e2d7e3dc3e7", size = 13775, upload-time = "2024-07-22T01:09:41.432Z" }, -] - -[[package]] -name = "wcwidth" -version = "0.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, -] - -[[package]] -name = "wheel" -version = "0.46.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605, upload-time = "2026-01-22T12:39:49.136Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/87/22/b76d483683216dde3d67cba61fb2444be8d5be289bf628c13fc0fd90e5f9/wheel-0.46.3-py3-none-any.whl", hash = "sha256:4b399d56c9d9338230118d705d9737a2a468ccca63d5e813e2a4fc7815d8bc4d", size = 30557, upload-time = "2026-01-22T12:39:48.099Z" }, -] - -[[package]] -name = "zipp" -version = "4.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b9/d8/eab98a517c14134c0b2eb4e2387bc5f457334293ec5d2dd3857ec2966802/zipp-4.1.0.tar.gz", hash = "sha256:4cb57381f544315db7688e976e922a2b18cdb513d21cc194eb42232ba2a3e602", size = 26214, upload-time = "2026-05-18T20:08:57.967Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/13/547360d81e6d88d58492968ffda9f9542854f11310ee556fef14260cc886/zipp-4.1.0-py3-none-any.whl", hash = "sha256:25ad4e16390cd314347dd8f1de67a2ac538ae658ed4ab9db16029c07c188e97f", size = 10238, upload-time = "2026-05-18T20:08:57.045Z" }, -]